gb18030.c (9567B)
1 2 #include "wc.h" 3 #include "gb18030.h" 4 #include "search.h" 5 #include "wtf.h" 6 #ifdef USE_UNICODE 7 #include "ucs.h" 8 #endif 9 #include "map/gb18030_ucs.map" 10 11 #define C0 WC_GB18030_MAP_C0 12 #define GL WC_GB18030_MAP_GL 13 #define C1 WC_GB18030_MAP_C1 14 #define LB WC_GB18030_MAP_LB 15 #define UB WC_GB18030_MAP_UB 16 #define L4 WC_GB18030_MAP_L4 17 18 wc_uint8 WC_GB18030_MAP[ 0x100 ] = { 19 C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, 20 C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, 21 GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, 22 L4, L4, L4, L4, L4, L4, L4, L4, L4, L4, GL, GL, GL, GL, GL, GL, 23 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, 24 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, 25 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, 26 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0, 27 28 LB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 29 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 30 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 31 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 32 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 33 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 34 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, 35 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1, 36 }; 37 38 wc_wchar_t 39 wc_gbk_ext_to_cs128w(wc_wchar_t cc) 40 { 41 cc.code = WC_GBK_N(cc.code); 42 if (cc.code < 0x4000) 43 cc.ccs = WC_CCS_GBK_EXT_1; 44 else { 45 cc.ccs = WC_CCS_GBK_EXT_2; 46 cc.code -= 0x4000; 47 } 48 cc.code = WC_N_CS128W(cc.code); 49 return cc; 50 } 51 52 wc_wchar_t 53 wc_cs128w_to_gbk_ext(wc_wchar_t cc) 54 { 55 cc.code = WC_CS128W_N(cc.code); 56 if (cc.ccs == WC_CCS_GBK_EXT_2) 57 cc.code += 0x4000; 58 cc.ccs = WC_CCS_GBK_EXT; 59 cc.code = WC_N_GBK(cc.code); 60 return cc; 61 } 62 63 static wc_ccs 64 wc_gbk_or_gbk_ext(wc_uint16 code) { 65 return wc_map3_range_search(code, 66 gbk_ext_ucs_map, N_gbk_ext_ucs_map) 67 ? WC_CCS_GBK_EXT : WC_CCS_GBK; 68 } 69 70 #ifdef USE_UNICODE 71 wc_uint32 72 wc_gb18030_to_ucs(wc_wchar_t cc) 73 { 74 wc_map3 *map; 75 76 switch (WC_CCS_SET(cc.ccs)) { 77 case WC_CCS_GBK_EXT_1: 78 case WC_CCS_GBK_EXT_2: 79 cc = wc_cs128w_to_gbk_ext(cc); 80 case WC_CCS_GBK_EXT: 81 map = wc_map3_range_search((wc_uint16)cc.code, 82 gbk_ext_ucs_map, N_gbk_ext_ucs_map); 83 if (map) 84 return map->code3 + WC_GBK_N(cc.code) - WC_GBK_N(map->code2); 85 return WC_C_UCS4_ERROR; 86 case WC_CCS_GB18030: 87 break; 88 default: 89 return wc_any_to_ucs(cc); 90 } 91 if (cc.code >= WC_C_GB18030_UCS2 && cc.code <= WC_C_GB18030_UCS2_END) { 92 int i, min = 0, max = N_ucs_gb18030_map - 1; 93 94 cc.code = WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS2); 95 if (cc.code >= ucs_gb18030_map[max].code3) 96 i = max; 97 else { 98 while(1) { 99 i = (min + max) / 2; 100 if (min == max) 101 break; 102 if (cc.code < ucs_gb18030_map[i].code3) 103 max = i - 1; 104 else if (cc.code >= ucs_gb18030_map[i+1].code3) 105 min = i + 1; 106 else 107 break; 108 } 109 } 110 return ucs_gb18030_map[i].code + cc.code - ucs_gb18030_map[i].code3; 111 } 112 if (cc.code >= WC_C_GB18030_UCS4 && cc.code <= WC_C_GB18030_UCS4_END) 113 return WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS4) 114 + 0x10000; 115 return WC_C_UCS4_ERROR; 116 } 117 118 wc_wchar_t 119 wc_ucs_to_gb18030(wc_uint32 ucs) 120 { 121 wc_wchar_t cc; 122 wc_map3 *map; 123 124 if (ucs <= WC_C_UCS2_END) { 125 map = wc_map3_range_search((wc_uint16)ucs, 126 ucs_gbk_ext_map, N_ucs_gbk_ext_map); 127 if (map) { 128 cc.code = WC_GBK_N(map->code3) + ucs - map->code; 129 cc.code = WC_N_GBK(cc.code); 130 cc.ccs = WC_CCS_GBK_EXT; 131 return cc; 132 } 133 map = wc_map3_range_search((wc_uint16)ucs, 134 ucs_gb18030_map, N_ucs_gb18030_map); 135 if (map) { 136 cc.code = map->code3 + ucs - map->code + WC_GB18030_N(WC_C_GB18030_UCS2); 137 cc.code = WC_N_GB18030(cc.code); 138 if (WcOption.gb18030_as_ucs) 139 cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET); 140 else 141 cc.ccs = WC_CCS_GB18030_W; 142 return cc; 143 } 144 } else if (ucs <= WC_C_UNICODE_END) { 145 cc.code = ucs - 0x10000 + WC_GB18030_N(WC_C_GB18030_UCS4); 146 cc.code = WC_N_GB18030(cc.code); 147 if (WcOption.gb18030_as_ucs) 148 cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET); 149 else 150 cc.ccs = WC_CCS_GB18030_W; 151 return cc; 152 } 153 cc.ccs = WC_CCS_UNKNOWN; 154 return cc; 155 } 156 #endif 157 158 Str 159 wc_conv_from_gb18030(Str is, wc_ces ces) 160 { 161 Str os; 162 wc_uchar *sp = (wc_uchar *)is->ptr; 163 wc_uchar *ep = sp + is->length; 164 wc_uchar *p; 165 int state = WC_GB18030_NOSTATE; 166 wc_uint32 gbk; 167 wc_wchar_t cc; 168 #ifdef USE_UNICODE 169 wc_uint32 ucs; 170 #endif 171 172 for (p = sp; p < ep && *p < 0x80; p++) 173 ; 174 if (p == ep) 175 return is; 176 os = Strnew_size(is->length); 177 if (p > sp) 178 Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp)); 179 180 for (; p < ep; p++) { 181 switch (state) { 182 case WC_GB18030_NOSTATE: 183 switch (WC_GB18030_MAP[*p]) { 184 case UB: 185 state = WC_GB18030_MBYTE1; 186 break; 187 case C1: 188 wtf_push_unknown(os, p, 1); 189 break; 190 default: 191 Strcat_char(os, (char)*p); 192 break; 193 } 194 break; 195 case WC_GB18030_MBYTE1: 196 if (WC_GB18030_MAP[*p] & LB) { 197 gbk = ((wc_uint32)*(p-1) << 8) | *p; 198 if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT) 199 wtf_push(os, WC_CCS_GBK_EXT, gbk); 200 else if (*(p-1) >= 0xA1 && *p >= 0xA1) 201 wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); 202 else 203 wtf_push(os, WC_CCS_GBK, gbk); 204 } else if (WC_GB18030_MAP[*p] == L4) { 205 state = WC_GB18030_MBYTE2; 206 break; 207 } else 208 wtf_push_unknown(os, p-1, 2); 209 state = WC_GB18030_NOSTATE; 210 break; 211 case WC_GB18030_MBYTE2: 212 if (WC_GB18030_MAP[*p] == UB) { 213 state = WC_GB18030_MBYTE3; 214 break; 215 } else 216 wtf_push_unknown(os, p-2, 3); 217 state = WC_GB18030_NOSTATE; 218 break; 219 case WC_GB18030_MBYTE3: 220 if (WC_GB18030_MAP[*p] == L4) { 221 cc.ccs = WC_CCS_GB18030_W; 222 cc.code = ((wc_uint32)*(p-3) << 24) 223 | ((wc_uint32)*(p-2) << 16) 224 | ((wc_uint32)*(p-1) << 8) 225 | *p; 226 #ifdef USE_UNICODE 227 if (WcOption.gb18030_as_ucs && 228 (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR) 229 wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code); 230 else 231 #endif 232 wtf_push(os, cc.ccs, cc.code); 233 } else 234 wtf_push_unknown(os, p-3, 4); 235 state = WC_GB18030_NOSTATE; 236 break; 237 } 238 } 239 switch (state) { 240 case WC_GB18030_MBYTE1: 241 wtf_push_unknown(os, p-1, 1); 242 break; 243 case WC_GB18030_MBYTE2: 244 wtf_push_unknown(os, p-2, 2); 245 break; 246 case WC_GB18030_MBYTE3: 247 wtf_push_unknown(os, p-3, 3); 248 break; 249 } 250 return os; 251 } 252 253 void 254 wc_push_to_gb18030(Str os, wc_wchar_t cc, wc_status *st) 255 { 256 while (1) { 257 switch (WC_CCS_SET(cc.ccs)) { 258 case WC_CCS_US_ASCII: 259 Strcat_char(os, (char)cc.code); 260 return; 261 case WC_CCS_GB_2312: 262 Strcat_char(os, (char)((cc.code >> 8) | 0x80)); 263 Strcat_char(os, (char)((cc.code & 0xff) | 0x80)); 264 return; 265 case WC_CCS_GBK_1: 266 case WC_CCS_GBK_2: 267 cc = wc_cs128w_to_gbk(cc); 268 case WC_CCS_GBK: 269 Strcat_char(os, (char)(cc.code >> 8)); 270 Strcat_char(os, (char)(cc.code & 0xff)); 271 return; 272 case WC_CCS_GBK_EXT_1: 273 case WC_CCS_GBK_EXT_2: 274 cc = wc_cs128w_to_gbk(cc); 275 case WC_CCS_GBK_EXT: 276 Strcat_char(os, (char)(cc.code >> 8)); 277 Strcat_char(os, (char)(cc.code & 0xff)); 278 return; 279 case WC_CCS_GB18030: 280 Strcat_char(os, (char)((cc.code >> 24) & 0xff)); 281 Strcat_char(os, (char)((cc.code >> 16) & 0xff)); 282 Strcat_char(os, (char)((cc.code >> 8) & 0xff)); 283 Strcat_char(os, (char)(cc.code & 0xff)); 284 return; 285 case WC_CCS_UNKNOWN_W: 286 if (!WcOption.no_replace) 287 Strcat_charp(os, WC_REPLACE_W); 288 return; 289 case WC_CCS_UNKNOWN: 290 if (!WcOption.no_replace) 291 Strcat_charp(os, WC_REPLACE); 292 return; 293 default: 294 #ifdef USE_UNICODE 295 if (WcOption.ucs_conv) 296 cc = wc_any_to_any_ces(cc, st); 297 else 298 #endif 299 cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; 300 continue; 301 } 302 } 303 } 304 305 Str 306 wc_char_conv_from_gb18030(wc_uchar c, wc_status *st) 307 { 308 static Str os; 309 static wc_uchar gb[4]; 310 wc_uint32 gbk; 311 wc_wchar_t cc; 312 #ifdef USE_UNICODE 313 wc_uint32 ucs; 314 #endif 315 316 if (st->state == -1) { 317 st->state = WC_GB18030_NOSTATE; 318 os = Strnew_size(8); 319 } 320 321 switch (st->state) { 322 case WC_GB18030_NOSTATE: 323 switch (WC_GB18030_MAP[c]) { 324 case UB: 325 gb[0] = c; 326 st->state = WC_GB18030_MBYTE1; 327 return NULL; 328 case C1: 329 break; 330 default: 331 Strcat_char(os, (char)c); 332 break; 333 } 334 break; 335 case WC_GB18030_MBYTE1: 336 if (WC_GB18030_MAP[c] & LB) { 337 gbk = ((wc_uint32)gb[0] << 8) | c; 338 if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT) 339 wtf_push(os, WC_CCS_GBK_EXT, gbk); 340 else if (gb[0] >= 0xA1 && c >= 0xA1) 341 wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); 342 else 343 wtf_push(os, WC_CCS_GBK, gbk); 344 } else if (WC_GB18030_MAP[c] == L4) { 345 gb[1] = c; 346 st->state = WC_GB18030_MBYTE2; 347 return NULL; 348 } 349 break; 350 case WC_GB18030_MBYTE2: 351 if (WC_GB18030_MAP[c] == UB) { 352 gb[2] = c; 353 st->state = WC_GB18030_MBYTE3; 354 return NULL; 355 } 356 break; 357 case WC_GB18030_MBYTE3: 358 if (WC_GB18030_MAP[c] == L4) { 359 cc.ccs = WC_CCS_GB18030_W; 360 cc.code = ((wc_uint32)gb[0] << 24) 361 | ((wc_uint32)gb[1] << 16) 362 | ((wc_uint32)gb[2] << 8) 363 | c; 364 #ifdef USE_UNICODE 365 if (WcOption.gb18030_as_ucs && 366 (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR) 367 wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code); 368 else 369 #endif 370 wtf_push(os, cc.ccs, cc.code); 371 } 372 break; 373 } 374 st->state = -1; 375 return os; 376 }