gbk.c (5612B)
1 2 #include "wc.h" 3 #include "gbk.h" 4 #include "search.h" 5 #include "wtf.h" 6 #ifdef USE_UNICODE 7 #include "ucs.h" 8 #endif 9 10 #include "map/gb2312_gbk.map" 11 12 #define C0 WC_GBK_MAP_C0 13 #define GL WC_GBK_MAP_GL 14 #define C1 WC_GBK_MAP_C1 15 #define LB WC_GBK_MAP_LB 16 #define UB WC_GBK_MAP_UB 17 #define C80 WC_GBK_MAP_80 18 19 wc_uint8 WC_GBK_MAP[ 0x100 ] = { 20 C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, 21 C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, 22 GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, 23 GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, 24 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, 25 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, 26 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, 27 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0, 28 29 C80,UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 30 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 31 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 32 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 33 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 34 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 35 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 36 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1, 37 }; 38 39 wc_ccs 40 wc_gb2312_or_gbk(wc_uint16 code) { 41 return wc_map_range_search(code, 42 gb2312_gbk_map, N_gb2312_gbk_map) 43 ? WC_CCS_GBK : WC_CCS_GB_2312; 44 } 45 46 wc_wchar_t 47 wc_gbk_to_cs128w(wc_wchar_t cc) 48 { 49 cc.code = WC_GBK_N(cc.code); 50 if (cc.code < 0x4000) 51 cc.ccs = WC_CCS_GBK_1; 52 else { 53 cc.ccs = WC_CCS_GBK_2; 54 cc.code -= 0x4000; 55 } 56 cc.code = WC_N_CS128W(cc.code); 57 return cc; 58 } 59 60 wc_wchar_t 61 wc_cs128w_to_gbk(wc_wchar_t cc) 62 { 63 cc.code = WC_CS128W_N(cc.code); 64 if (cc.ccs == WC_CCS_GBK_2) 65 cc.code += 0x4000; 66 cc.ccs = WC_CCS_GBK; 67 cc.code = WC_N_GBK(cc.code); 68 return cc; 69 } 70 71 wc_uint32 72 wc_gbk_to_N(wc_uint32 c) 73 { 74 if (c <= 0xA1A0) /* 0x8140 - 0xA1A0 */ 75 return WC_GBK_N(c); 76 if (c <= 0xA2AA) /* 0xA240 - 0xA2A0, 0xA2A1 - 0xA2AA */ 77 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E; 78 if (c <= 0xA6A0) /* 0xA240 - 0xA6A0 */ 79 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A; 80 if (c <= 0xA6F5) /* 0xA6E0 - 0xA6F5 */ 81 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A - 0x3F; 82 if (c <= 0xA8A0) /* 0xA7A0 - 0xA8A0 */ 83 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16; 84 if (c <= 0xA8C0) /* 0xA8BB - 0xA8C0 */ 85 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 - 0x1A; 86 /* 0xA940 - 0xFEA0 */ 87 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 + 0x06; 88 } 89 90 Str 91 wc_conv_from_gbk(Str is, wc_ces ces) 92 { 93 Str os; 94 wc_uchar *sp = (wc_uchar *)is->ptr; 95 wc_uchar *ep = sp + is->length; 96 wc_uchar *p; 97 int state = WC_GBK_NOSTATE; 98 wc_uint32 gbk; 99 100 for (p = sp; p < ep && *p < 0x80; p++) 101 ; 102 if (p == ep) 103 return is; 104 os = Strnew_size(is->length); 105 if (p > sp) 106 Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp)); 107 108 for (; p < ep; p++) { 109 switch (state) { 110 case WC_GBK_NOSTATE: 111 switch (WC_GBK_MAP[*p]) { 112 case UB: 113 state = WC_GBK_MBYTE1; 114 break; 115 case C80: 116 wtf_push(os, WC_CCS_GBK_80, *p); 117 break; 118 case C1: 119 wtf_push_unknown(os, p, 1); 120 break; 121 default: 122 Strcat_char(os, (char)*p); 123 break; 124 } 125 break; 126 case WC_GBK_MBYTE1: 127 if (WC_GBK_MAP[*p] & LB) { 128 gbk = ((wc_uint32)*(p-1) << 8) | *p; 129 if (*(p-1) >= 0xA1 && *p >= 0xA1) 130 wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); 131 else 132 wtf_push(os, WC_CCS_GBK, gbk); 133 } else 134 wtf_push_unknown(os, p-1, 2); 135 state = WC_GBK_NOSTATE; 136 break; 137 } 138 } 139 switch (state) { 140 case WC_GBK_MBYTE1: 141 wtf_push_unknown(os, p-1, 1); 142 break; 143 } 144 return os; 145 } 146 147 void 148 wc_push_to_gbk(Str os, wc_wchar_t cc, wc_status *st) 149 { 150 while (1) { 151 switch (cc.ccs) { 152 case WC_CCS_US_ASCII: 153 Strcat_char(os, (char)cc.code); 154 return; 155 case WC_CCS_GB_2312: 156 Strcat_char(os, (char)((cc.code >> 8) | 0x80)); 157 Strcat_char(os, (char)((cc.code & 0xff) | 0x80)); 158 return; 159 case WC_CCS_GBK_80: 160 Strcat_char(os, (char)(cc.code | 0x80)); 161 return; 162 case WC_CCS_GBK_1: 163 case WC_CCS_GBK_2: 164 cc = wc_cs128w_to_gbk(cc); 165 case WC_CCS_GBK: 166 Strcat_char(os, (char)(cc.code >> 8)); 167 Strcat_char(os, (char)(cc.code & 0xff)); 168 return; 169 case WC_CCS_UNKNOWN_W: 170 if (!WcOption.no_replace) 171 Strcat_charp(os, WC_REPLACE_W); 172 return; 173 case WC_CCS_UNKNOWN: 174 if (!WcOption.no_replace) 175 Strcat_charp(os, WC_REPLACE); 176 return; 177 default: 178 #ifdef USE_UNICODE 179 if (WcOption.ucs_conv) 180 cc = wc_any_to_any_ces(cc, st); 181 else 182 #endif 183 cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; 184 continue; 185 } 186 } 187 } 188 189 Str 190 wc_char_conv_from_gbk(wc_uchar c, wc_status *st) 191 { 192 static Str os; 193 static wc_uchar gbku; 194 wc_uint32 gbk; 195 196 if (st->state == -1) { 197 st->state = WC_GBK_NOSTATE; 198 os = Strnew_size(8); 199 } 200 201 switch (st->state) { 202 case WC_GBK_NOSTATE: 203 switch (WC_GBK_MAP[c]) { 204 case UB: 205 gbku = c; 206 st->state = WC_GBK_MBYTE1; 207 return NULL; 208 case C80: 209 wtf_push(os, WC_CCS_GBK_80, c); 210 break; 211 case C1: 212 break; 213 default: 214 Strcat_char(os, (char)c); 215 break; 216 } 217 break; 218 case WC_GBK_MBYTE1: 219 if (WC_GBK_MAP[c] & LB) { 220 gbk = ((wc_uint32)gbku << 8) | c; 221 if (gbku >= 0xA1 && c >= 0xA1) 222 wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); 223 else 224 wtf_push(os, WC_CCS_GBK, gbk); 225 } 226 break; 227 } 228 st->state = -1; 229 return os; 230 }