detect.c (13154B)
1 2 #include "wc.h" 3 #include "iso2022.h" 4 #include "sjis.h" 5 #include "big5.h" 6 #include "hz.h" 7 #include "viet.h" 8 #ifdef USE_UNICODE 9 #include "utf8.h" 10 #include "utf7.h" 11 #endif 12 13 wc_uint8 WC_DETECT_MAP[ 0x100 ] = { 14 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 26 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 28 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 29 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 30 }; 31 32 #define DETECT_NORMAL 0 33 #define DETECT_POSSIBLE 1 34 #define DETECT_OK 2 35 #define DETECT_BROKEN 4 36 #define DETECT_ERROR 8 37 #define SET_DETECT(x,y) ((x) |= (y)) 38 #define SET_BROKEN_ERROR(x) ((x) = ((x) & DETECT_BROKEN) ? DETECT_ERROR : ((x) | DETECT_BROKEN)) 39 40 void 41 wc_create_detect_map(wc_ces ces, wc_bool esc) 42 { 43 static wc_ces detect_ces = WC_CES_US_ASCII; 44 int i; 45 46 if (ces != detect_ces) { 47 if (ces & WC_CES_T_VIET) { 48 wc_uint8 *map = NULL; 49 switch (ces) { 50 case WC_CES_TCVN_5712: 51 map = wc_c0_tcvn57122_map; 52 break; 53 case WC_CES_VISCII_11: 54 map = wc_c0_viscii112_map; 55 break; 56 case WC_CES_VPS: 57 map = wc_c0_vps2_map; 58 break; 59 } 60 for (i = 0; i < 0x20; i++) 61 WC_DETECT_MAP[i] = map[i] ? 1 : 0; 62 } else { 63 for (i = 0; i < 0x20; i++) 64 WC_DETECT_MAP[i] = 0; 65 WC_DETECT_MAP[WC_C_HZ_TILDA] = (ces == WC_CES_HZ_GB_2312) ? 1 : 0; 66 #ifdef USE_UNICODE 67 WC_DETECT_MAP[WC_C_UTF7_PLUS] = (ces == WC_CES_UTF_7) ? 1 : 0; 68 #endif 69 } 70 detect_ces = ces; 71 } 72 WC_DETECT_MAP[WC_C_ESC] = (esc || (ces & WC_CES_T_ISO_2022)) ? 1 : 0; 73 return; 74 } 75 76 wc_ces 77 wc_auto_detect(char *is, size_t len, wc_ces hint) 78 { 79 wc_uchar *p = (wc_uchar *)is; 80 wc_uchar *ep = p + len; 81 wc_uchar *q; 82 wc_ces euc = 0, priv = 0; 83 wc_status st; 84 int euc_state = 0, sjis_state = 0, big5_state = 0, hz_state = 0; 85 int iso_detect = DETECT_ERROR, euc_detect = DETECT_ERROR, 86 sjis_detect = DETECT_ERROR, big5_detect = DETECT_ERROR, 87 hz_detect = DETECT_ERROR, latin_detect = DETECT_ERROR, 88 priv_detect = DETECT_ERROR; 89 int possible = 0; 90 wc_bool iso2022jp2 = WC_FALSE, iso2022jp3 = WC_FALSE, 91 iso2022cn = WC_FALSE, iso2022kr = WC_FALSE, ok = WC_FALSE; 92 #ifdef USE_UNICODE 93 int utf8_state = 0; 94 int utf8_detect = DETECT_ERROR; 95 int utf8_next = 0; 96 #endif 97 98 wc_create_detect_map(hint, WC_TRUE); 99 for (; p < ep && ! WC_DETECT_MAP[*p]; p++) 100 ; 101 if (p == ep) 102 return hint; 103 104 switch (hint) { 105 case WC_CES_ISO_2022_JP: 106 case WC_CES_ISO_2022_JP_2: 107 case WC_CES_ISO_2022_JP_3: 108 case WC_CES_EUC_JP: 109 case WC_CES_SHIFT_JIS: 110 case WC_CES_SHIFT_JISX0213: 111 euc = WC_CES_EUC_JP; 112 euc_state = WC_EUC_NOSTATE; 113 sjis_state = WC_SJIS_NOSTATE; 114 iso_detect = euc_detect = sjis_detect = DETECT_NORMAL; 115 possible = 3; 116 break; 117 case WC_CES_ISO_2022_CN: 118 case WC_CES_EUC_CN: 119 euc = WC_CES_EUC_CN; 120 euc_state = WC_EUC_NOSTATE; 121 big5_state = WC_BIG5_NOSTATE; 122 iso_detect = euc_detect = big5_detect = DETECT_NORMAL; 123 possible = 3; 124 break; 125 case WC_CES_EUC_TW: 126 case WC_CES_BIG5: 127 euc = WC_CES_EUC_TW; 128 euc_state = WC_EUC_NOSTATE; 129 big5_state = WC_BIG5_NOSTATE; 130 iso_detect = euc_detect = big5_detect = DETECT_NORMAL; 131 possible = 3; 132 break; 133 case WC_CES_HZ_GB_2312: 134 euc = WC_CES_EUC_CN; 135 euc_state = WC_EUC_NOSTATE; 136 hz_state = WC_HZ_NOSTATE; 137 iso_detect = euc_detect = big5_detect = hz_detect = DETECT_NORMAL; 138 possible = 4; 139 break; 140 case WC_CES_ISO_2022_KR: 141 case WC_CES_EUC_KR: 142 euc = WC_CES_EUC_KR; 143 euc_state = WC_EUC_NOSTATE; 144 iso_detect = euc_detect = DETECT_NORMAL; 145 possible = 3; 146 break; 147 #ifdef USE_UNICODE 148 case WC_CES_UTF_8: 149 iso_detect = DETECT_NORMAL; 150 possible = 1; 151 break; 152 #endif 153 case WC_CES_US_ASCII: 154 iso_detect = latin_detect = DETECT_NORMAL; 155 possible = 2; 156 break; 157 default: 158 if (hint & WC_CES_T_ISO_8859) { 159 iso_detect = latin_detect = DETECT_NORMAL; 160 possible = 2; 161 } else { 162 iso_detect = priv_detect = DETECT_NORMAL; 163 priv = hint; /* for TVCN, VISCII, VPS */ 164 possible = 2; 165 } 166 break; 167 } 168 #ifdef USE_UNICODE 169 if (priv_detect == DETECT_ERROR) { 170 utf8_detect = DETECT_NORMAL; 171 possible++; 172 } 173 #endif 174 175 wc_input_init(WC_CES_US_ASCII, &st); 176 177 for (; p < ep; p++) { 178 if (possible == 0 || (possible == 1 && ok)) 179 break; 180 if (iso_detect != DETECT_ERROR) { 181 switch (*p) { 182 case WC_C_ESC: 183 if (*(p+1) == WC_C_MBCS) { 184 q = p; 185 if (! wc_parse_iso2022_esc(&q, &st)) 186 break; 187 if (st.design[0] == WC_CCS_JIS_C_6226 || 188 st.design[0] == WC_CCS_JIS_X_0208) 189 ; 190 else if (st.design[0] == WC_CCS_JIS_X_0213_1 || 191 st.design[0] == WC_CCS_JIS_X_0213_2) 192 iso2022jp3 = WC_TRUE; 193 else if (WC_CCS_TYPE(st.design[0]) == WC_CCS_A_CS94W) 194 iso2022jp2 = WC_TRUE; 195 if (st.design[1] == WC_CCS_KS_X_1001) 196 iso2022kr = WC_TRUE; 197 else if (st.design[1] == WC_CCS_GB_2312 || 198 st.design[1] == WC_CCS_ISO_IR_165 || 199 st.design[1] == WC_CCS_CNS_11643_1) 200 iso2022cn = WC_TRUE; 201 if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS94W || 202 WC_CCS_TYPE(st.design[3]) == WC_CCS_A_CS94W) 203 iso2022cn = WC_TRUE; 204 } else if (*(p+1) == WC_C_G2_CS96) { 205 q = p; 206 if (! wc_parse_iso2022_esc(&q, &st)) 207 break; 208 if (WC_CCS_TYPE(st.design[2]) == WC_CCS_A_CS96) 209 iso2022jp2 = WC_TRUE; 210 } else if (*(p+1) == WC_C_CSWSR) { 211 q = p; 212 if (! wc_parse_iso2022_esc(&q, &st)) 213 break; 214 possible = 0; 215 iso_detect = DETECT_BROKEN; 216 continue; 217 } 218 iso_detect = DETECT_OK; 219 ok = WC_TRUE; 220 break; 221 case WC_C_SI: 222 case WC_C_SO: 223 iso_detect = DETECT_OK; 224 ok = WC_TRUE; 225 iso2022cn = WC_TRUE; 226 iso2022kr = WC_TRUE; 227 break; 228 default: 229 if (*p & 0x80) { 230 iso_detect = DETECT_ERROR; 231 possible--; 232 } 233 break; 234 } 235 } 236 if (euc_detect != DETECT_ERROR) { 237 switch (euc_state) { 238 case WC_EUC_NOSTATE: 239 switch (WC_ISO_MAP[*p]) { 240 case WC_ISO_MAP_GR: 241 euc_state = WC_EUC_MBYTE1; 242 break; 243 case WC_ISO_MAP_SS2: 244 if (euc == WC_CES_EUC_JP) 245 euc_state = WC_EUC_MBYTE1; 246 else if (euc == WC_CES_EUC_TW) 247 euc_state = WC_EUC_TW_SS2; 248 else 249 euc_detect = DETECT_ERROR; 250 break; 251 case WC_ISO_MAP_SS3: 252 if (euc == WC_CES_EUC_JP && 253 WC_ISO_MAP[*(p+1)] == WC_ISO_MAP_GR) 254 ; 255 else 256 euc_detect = DETECT_ERROR; 257 break; 258 case WC_ISO_MAP_C1: 259 case WC_ISO_MAP_GR96: 260 euc_detect = DETECT_ERROR; 261 break; 262 } 263 break; 264 case WC_EUC_MBYTE1: 265 if (WC_ISO_MAP[*p] == WC_ISO_MAP_GR) { 266 SET_DETECT(euc_detect, DETECT_OK); 267 ok = WC_TRUE; 268 } else 269 SET_BROKEN_ERROR(euc_detect); 270 euc_state = WC_EUC_NOSTATE; 271 break; 272 case WC_EUC_TW_SS2: 273 if (!( 0xa0 <= *p && *p <= 0xb0) || 274 WC_ISO_MAP[*(p+1)] != WC_ISO_MAP_GR) 275 euc_detect = DETECT_ERROR; 276 euc_state = WC_EUC_NOSTATE; 277 break; 278 } 279 if (euc_detect == DETECT_ERROR) 280 possible--; 281 } 282 if (sjis_detect != DETECT_ERROR) { 283 switch (sjis_state) { 284 case WC_SJIS_NOSTATE: 285 switch (WC_SJIS_MAP[*p]) { 286 case WC_SJIS_MAP_SL: 287 case WC_SJIS_MAP_SH: 288 sjis_state = WC_SJIS_SHIFT_L; 289 break; 290 case WC_SJIS_MAP_SK: 291 SET_DETECT(sjis_detect, DETECT_POSSIBLE); 292 break; 293 case WC_SJIS_MAP_SX: 294 if (WcOption.use_jisx0213) { 295 sjis_state = WC_SJIS_SHIFT_X; 296 break; 297 } 298 case WC_SJIS_MAP_80: 299 case WC_SJIS_MAP_A0: 300 case WC_SJIS_MAP_C1: 301 sjis_detect = DETECT_ERROR; 302 break; 303 } 304 break; 305 case WC_SJIS_SHIFT_L: 306 if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB) { 307 SET_DETECT(sjis_detect, DETECT_OK); 308 ok = WC_TRUE; 309 } else 310 SET_BROKEN_ERROR(sjis_detect); 311 sjis_state = WC_SJIS_NOSTATE; 312 break; 313 case WC_SJIS_SHIFT_X: 314 if (WC_SJIS_MAP[*p] & WC_SJIS_MAP_LB) 315 SET_DETECT(sjis_detect, DETECT_POSSIBLE); 316 else 317 sjis_detect = DETECT_ERROR; 318 sjis_state = WC_SJIS_NOSTATE; 319 break; 320 } 321 if (sjis_detect == DETECT_ERROR) 322 possible--; 323 } 324 if (big5_detect != DETECT_ERROR) { 325 switch (big5_state) { 326 case WC_BIG5_NOSTATE: 327 switch (WC_BIG5_MAP[*p]) { 328 case WC_BIG5_MAP_UB: 329 big5_state = WC_BIG5_MBYTE1; 330 break; 331 case WC_BIG5_MAP_C1: 332 big5_detect = DETECT_ERROR; 333 break; 334 } 335 break; 336 case WC_BIG5_MBYTE1: 337 if (WC_BIG5_MAP[*p] & WC_BIG5_MAP_LB) { 338 SET_DETECT(big5_detect, DETECT_OK); 339 ok = WC_TRUE; 340 } else 341 SET_BROKEN_ERROR(big5_detect); 342 big5_state = WC_BIG5_NOSTATE; 343 break; 344 } 345 if (big5_detect == DETECT_ERROR) 346 possible--; 347 } 348 if (hz_detect != DETECT_ERROR) { 349 if (*p & 0x80) { 350 hz_detect = DETECT_ERROR; 351 possible--; 352 } else { 353 switch (hz_state) { 354 case WC_HZ_NOSTATE: 355 if (*p == WC_C_HZ_TILDA) 356 hz_state = WC_HZ_TILDA; 357 break; 358 case WC_HZ_TILDA: 359 if (*p == WC_C_HZ_SI) 360 hz_state = WC_HZ_MBYTE; 361 else 362 hz_state = WC_HZ_NOSTATE; 363 break; 364 case WC_HZ_TILDA_MB: 365 if (*p == WC_C_HZ_SO) 366 hz_state = WC_HZ_NOSTATE; 367 else 368 hz_state = WC_HZ_MBYTE; 369 break; 370 case WC_HZ_MBYTE: 371 if (*p == WC_C_HZ_TILDA) 372 hz_state = WC_HZ_TILDA_MB; 373 else 374 hz_state = WC_HZ_MBYTE1; 375 break; 376 case WC_HZ_MBYTE1: 377 hz_detect = DETECT_OK; 378 ok = WC_TRUE; 379 hz_state = WC_HZ_NOSTATE; 380 break; 381 } 382 } 383 } 384 if (latin_detect != DETECT_ERROR) { 385 switch (WC_ISO_MAP[*p] & WC_ISO_MAP_CG) { 386 case WC_ISO_MAP_GR: 387 case WC_ISO_MAP_GR96: 388 SET_DETECT(latin_detect, DETECT_OK); 389 ok = WC_TRUE; 390 break; 391 case WC_ISO_MAP_C1: 392 latin_detect = DETECT_ERROR; 393 break; 394 } 395 if (latin_detect == DETECT_ERROR) 396 possible--; 397 } 398 if (priv_detect != DETECT_ERROR) { 399 if (*p != WC_C_ESC && WC_DETECT_MAP[*p]) { 400 SET_DETECT(priv_detect, DETECT_OK); 401 ok = WC_TRUE; 402 } 403 /* 404 if (priv_detect == DETECT_ERROR) 405 possible--; 406 */ 407 } 408 #ifdef USE_UNICODE 409 if (utf8_detect != DETECT_ERROR) { 410 switch (utf8_state) { 411 case WC_UTF8_NOSTATE: 412 switch (utf8_next = WC_UTF8_MAP[*p]) { 413 case 1: 414 case 8: 415 break; 416 case 0: 417 case 7: 418 utf8_detect = DETECT_ERROR; 419 break; 420 default: 421 utf8_next--; 422 utf8_state = WC_UTF8_NEXT; 423 break; 424 } 425 break; 426 case WC_UTF8_NEXT: 427 if (WC_UTF8_MAP[*p]) { 428 utf8_detect = DETECT_ERROR; 429 utf8_state = WC_UTF8_NOSTATE; 430 break; 431 } 432 utf8_next--; 433 if (! utf8_next) { 434 SET_DETECT(utf8_detect, DETECT_OK); 435 ok = WC_TRUE; 436 utf8_state = WC_UTF8_NOSTATE; 437 } 438 break; 439 } 440 if (utf8_detect == DETECT_ERROR) 441 possible--; 442 } 443 #endif 444 } 445 446 if (iso_detect != DETECT_ERROR) { 447 if (iso_detect == DETECT_NORMAL) { 448 if (hz_detect == DETECT_OK) 449 return WC_CES_HZ_GB_2312; 450 if (priv_detect == DETECT_OK) 451 return priv; 452 return WC_CES_US_ASCII; 453 } 454 switch (euc) { 455 case WC_CES_EUC_CN: 456 case WC_CES_EUC_TW: 457 if (iso2022cn) 458 return WC_CES_ISO_2022_CN; 459 break; 460 case WC_CES_EUC_KR: 461 if (iso2022kr) 462 return WC_CES_ISO_2022_KR; 463 break; 464 } 465 if (iso2022jp3) 466 return WC_CES_ISO_2022_JP_3; 467 if (iso2022jp2) 468 return WC_CES_ISO_2022_JP_2; 469 if (iso2022cn) 470 return WC_CES_ISO_2022_CN; 471 if (iso2022kr) 472 return WC_CES_ISO_2022_KR; 473 return WC_CES_ISO_2022_JP; 474 } 475 switch (hint) { 476 case WC_CES_ISO_2022_JP: 477 case WC_CES_ISO_2022_JP_2: 478 case WC_CES_ISO_2022_JP_3: 479 case WC_CES_ISO_2022_KR: 480 case WC_CES_ISO_2022_CN: 481 break; 482 case WC_CES_EUC_JP: 483 case WC_CES_EUC_CN: 484 case WC_CES_EUC_TW: 485 case WC_CES_EUC_KR: 486 if (euc_detect != DETECT_ERROR) 487 return hint; 488 break; 489 case WC_CES_SHIFT_JIS: 490 case WC_CES_SHIFT_JISX0213: 491 if (sjis_detect != DETECT_ERROR) 492 return hint; 493 break; 494 case WC_CES_BIG5: 495 if (big5_detect != DETECT_ERROR) 496 return hint; 497 break; 498 #ifdef USE_UNICODE 499 case WC_CES_UTF_8: 500 return hint; 501 #endif 502 case WC_CES_US_ASCII: 503 #ifdef USE_UNICODE 504 if (utf8_detect != DETECT_ERROR) 505 return hint; 506 #endif 507 if (latin_detect != DETECT_ERROR) 508 return WC_CES_ISO_8859_1; 509 return hint; 510 default: 511 if (latin_detect != DETECT_ERROR) 512 return hint; 513 if (priv_detect != DETECT_ERROR) 514 return hint; 515 #ifdef USE_UNICODE 516 if (utf8_detect != DETECT_ERROR) 517 return WC_CES_UTF_8; 518 #endif 519 return hint; 520 } 521 if (euc_detect == DETECT_OK) 522 return euc; 523 if (sjis_detect == DETECT_OK) 524 return WC_CES_SHIFT_JIS; 525 if (big5_detect == DETECT_OK) 526 return WC_CES_BIG5; 527 #ifdef USE_UNICODE 528 if (utf8_detect == DETECT_OK) 529 return WC_CES_UTF_8; 530 if (sjis_detect & DETECT_POSSIBLE) 531 return WC_CES_SHIFT_JIS; 532 #endif 533 if (euc_detect != DETECT_ERROR) 534 return euc; 535 if (sjis_detect != DETECT_ERROR) 536 return WC_CES_SHIFT_JIS; 537 if (big5_detect != DETECT_ERROR) 538 return WC_CES_BIG5; 539 #ifdef USE_UNICODE 540 if (utf8_detect != DETECT_ERROR) 541 return WC_CES_UTF_8; 542 #endif 543 return hint; 544 }