charset.c (11311B)
1 2 #include <stdlib.h> 3 #include <ctype.h> 4 #include <gc.h> 5 #define New_N(type,n) ((type*)GC_MALLOC((n)*sizeof(type))) 6 7 #include "wc.h" 8 9 #ifdef HAVE_LANGINFO_CODESET 10 #include <langinfo.h> 11 #endif 12 13 wc_locale WcLocale = 0; 14 15 static struct { 16 char *lang; 17 wc_ces ces; 18 } lang_ces_table[] = { 19 { "cs", WC_CES_ISO_8859_2 }, /* cs_CZ */ 20 { "el", WC_CES_ISO_8859_7 }, /* el_GR */ 21 { "iw", WC_CES_ISO_8859_8 }, /* iw_IL */ 22 { "ja", WC_CES_EUC_JP }, /* ja_JP */ 23 { "ko", WC_CES_EUC_KR }, /* ko_KR */ 24 { "hu", WC_CES_ISO_8859_2 }, /* hu_HU */ 25 { "pl", WC_CES_ISO_8859_2 }, /* pl_PL */ 26 { "ro", WC_CES_ISO_8859_2 }, /* ro_RO */ 27 { "ru", WC_CES_ISO_8859_5 }, /* ru_SU */ 28 { "sk", WC_CES_ISO_8859_2 }, /* sk_SK */ 29 { "sl", WC_CES_ISO_8859_2 }, /* sl_CS */ 30 { "tr", WC_CES_ISO_8859_9 }, /* tr_TR */ 31 { "zh", WC_CES_EUC_CN }, /* zh_CN */ 32 { NULL, 0 } 33 }; 34 35 static wc_ces 36 wc_codepage(int n) 37 { 38 switch (n) { 39 case 437: return WC_CES_CP437; 40 case 737: return WC_CES_CP737; 41 case 775: return WC_CES_CP775; 42 case 850: return WC_CES_CP850; 43 case 852: return WC_CES_CP852; 44 case 855: return WC_CES_CP855; 45 case 856: return WC_CES_CP856; 46 case 857: return WC_CES_CP857; 47 case 860: return WC_CES_CP860; 48 case 861: return WC_CES_CP861; 49 case 862: return WC_CES_CP862; 50 case 863: return WC_CES_CP863; 51 case 864: return WC_CES_CP864; 52 case 865: return WC_CES_CP865; 53 case 866: return WC_CES_CP866; 54 case 869: return WC_CES_CP869; 55 case 874: return WC_CES_CP874; 56 case 932: return WC_CES_CP932; /* CP932 = Shift_JIS */ 57 case 936: return WC_CES_CP936; /* CP936 = GBK > EUC_CN */ 58 case 943: return WC_CES_CP943; /* CP943 = Shift_JIS */ 59 case 949: return WC_CES_CP949; /* CP949 = UHC > EUC_KR */ 60 case 950: return WC_CES_CP950; /* CP950 = Big5 */ 61 case 1006: return WC_CES_CP1006; 62 case 1250: return WC_CES_CP1250; 63 case 1251: return WC_CES_CP1251; 64 case 1252: return WC_CES_CP1252; 65 case 1253: return WC_CES_CP1253; 66 case 1254: return WC_CES_CP1254; 67 case 1255: return WC_CES_CP1255; 68 case 1256: return WC_CES_CP1256; 69 case 1257: return WC_CES_CP1257; 70 case 1258: return WC_CES_CP1258; 71 } 72 return 0; 73 } 74 75 wc_ces 76 wc_guess_charset(char *charset, wc_ces orig) 77 { 78 wc_ces guess; 79 80 if (charset == NULL || *charset == '\0') 81 return orig; 82 guess = wc_charset_to_ces(charset); 83 return guess ? guess : orig; 84 } 85 86 wc_ces 87 wc_guess_charset_short(char *charset, wc_ces orig) 88 { 89 wc_ces guess; 90 91 if (charset == NULL || *charset == '\0') 92 return orig; 93 guess = wc_charset_short_to_ces(charset); 94 return guess ? guess : orig; 95 } 96 97 wc_ces 98 wc_guess_locale_charset(char *locale, wc_ces orig) 99 { 100 wc_ces guess; 101 102 if (locale == NULL || *locale == '\0') 103 return orig; 104 guess = wc_locale_to_ces(locale); 105 return guess ? guess : orig; 106 } 107 108 wc_ces 109 wc_charset_to_ces(char *charset) 110 { 111 char *p = charset; 112 char buf[16]; 113 int n; 114 115 if (tolower(*p) == 'x' && *(p+1) == '-') 116 p += 2; 117 for (n = 0; *p && n < 15; p++) { 118 if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-') 119 buf[n++] = tolower(*p); 120 } 121 buf[n] = 0; 122 p = buf; 123 switch (*p) { 124 case 'e': 125 if (! strncmp(p, "euc", 3)) { 126 p += 3; 127 switch (*p) { 128 case 'j': return WC_CES_EUC_JP; 129 case 'c': return WC_CES_EUC_CN; 130 case 't': return WC_CES_EUC_TW; 131 case 'k': return WC_CES_EUC_KR; 132 } 133 switch (WcLocale) { 134 case WC_LOCALE_JA_JP: return WC_CES_EUC_JP; 135 case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN; 136 case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW; 137 case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN; 138 case WC_LOCALE_KO_KR: return WC_CES_EUC_KR; 139 } 140 return WC_CES_EUC_JP; 141 } 142 break; 143 case 'i': 144 if (! strncmp(p, "iso2022", 7)) { 145 p += 7; 146 switch (*p) { 147 case 'j': 148 if (! strncmp(p, "jp2", 3)) 149 return WC_CES_ISO_2022_JP_2; 150 if (! strncmp(p, "jp3", 3)) 151 return WC_CES_ISO_2022_JP_3; 152 return WC_CES_ISO_2022_JP; 153 case 'c': return WC_CES_ISO_2022_CN; 154 case 'k': return WC_CES_ISO_2022_KR; 155 } 156 return WC_CES_ISO_2022_JP; 157 } else if (! strncmp(p, "iso8859", 7)) { 158 n = atoi(p + 7); 159 if (n >= 1 && n <= 16 && n != 12) 160 return (WC_CES_E_ISO_8859 | n); 161 return WC_CES_ISO_8859_1; 162 } else if (! strncmp(p, "ibm", 3)) { 163 p += 3; 164 if (*p >= '1' && *p <= '9') 165 return wc_codepage(atoi(p)); 166 return wc_charset_to_ces(p); 167 } 168 break; 169 case 'j': 170 if (! strncmp(p, "johab", 5)) 171 return WC_CES_JOHAB; 172 if (! strncmp(p, "jis", 3)) 173 return WC_CES_ISO_2022_JP; 174 break; 175 case 's': 176 if (! strncmp(p, "shiftjisx0213", 13) || 177 ! strncmp(p, "sjisx0213", 9)) 178 return WC_CES_SHIFT_JISX0213; 179 if (! strncmp(p, "shiftjis", 8) || 180 ! strncmp(p, "sjis", 4)) 181 return WC_CES_SHIFT_JIS; 182 break; 183 case 'p': 184 if (! strncmp(p, "pck", 3)) 185 return WC_CES_SHIFT_JIS; 186 break; 187 case 'g': 188 if (! strncmp(p, "gb18030", 7) || 189 ! strncmp(p, "gbk2k", 5)) 190 return WC_CES_GB18030; 191 if (! strncmp(p, "gbk", 3)) 192 return WC_CES_GBK; 193 if (! strncmp(p, "gb2312", 6)) 194 return WC_CES_EUC_CN; 195 break; 196 case 'b': 197 if (! strncmp(p, "big5hkscs", 9)) 198 return WC_CES_HKSCS; 199 if (! strncmp(p, "big5", 4)) 200 return WC_CES_BIG5; 201 break; 202 case 'h': 203 if (! strncmp(p, "hz", 2)) 204 return WC_CES_HZ_GB_2312; 205 if (! strncmp(p, "hkscs", 5)) 206 return WC_CES_HKSCS; 207 break; 208 case 'k': 209 if (! strncmp(p, "koi8r", 5)) 210 return WC_CES_KOI8_R; 211 if (! strncmp(p, "koi8u", 5)) 212 return WC_CES_KOI8_U; 213 if (! strncmp(p, "ksx1001", 7)) 214 return WC_CES_EUC_KR; 215 if (! strncmp(p, "ksc5601", 7)) 216 return WC_CES_EUC_KR; 217 break; 218 case 't': 219 if (! strncmp(p, "tis620", 6)) 220 return WC_CES_TIS_620; 221 if (! strncmp(p, "tcvn", 4)) 222 return WC_CES_TCVN_5712; 223 break; 224 case 'n': 225 if (! strncmp(p, "next", 4)) 226 return WC_CES_NEXTSTEP; 227 break; 228 case 'v': 229 if (! strncmp(p, "viet", 4)) { 230 p += 4; 231 if (! strncmp(p, "tcvn", 4)) 232 return WC_CES_TCVN_5712; 233 } 234 if (! strncmp(p, "viscii", 6)) 235 return WC_CES_VISCII_11; 236 if (! strncmp(p, "vps", 3)) 237 return WC_CES_VPS; 238 break; 239 case 'u': 240 #ifdef USE_UNICODE 241 if (! strncmp(p, "utf8", 4)) 242 return WC_CES_UTF_8; 243 if (! strncmp(p, "utf7", 4)) 244 return WC_CES_UTF_7; 245 #endif 246 if (! strncmp(p, "uhc", 3)) 247 return WC_CES_UHC; 248 if (! strncmp(p, "ujis", 4)) 249 return WC_CES_EUC_JP; 250 if (! strncmp(p, "usascii", 7)) 251 return WC_CES_US_ASCII; 252 break; 253 case 'a': 254 if (! strncmp(p, "ascii", 5)) 255 return WC_CES_US_ASCII; 256 break; 257 case 'c': 258 if (! strncmp(p, "cngb", 4)) 259 return WC_CES_EUC_CN; 260 if (*(p+1) != 'p') 261 break; 262 p += 2; 263 if (*p >= '1' && *p <= '9') 264 return wc_codepage(atoi(p)); 265 break; 266 case 'w': 267 if (strncmp(p, "windows", 7)) 268 break; 269 p += 7; 270 if (! strncmp(p, "31j", 3)) 271 return WC_CES_CP932; 272 if (*p >= '1' && *p <= '9') 273 return wc_codepage(atoi(p)); 274 break; 275 } 276 return 0; 277 } 278 279 wc_ces 280 wc_charset_short_to_ces(char *charset) 281 { 282 char *p = charset; 283 char buf[16]; 284 wc_ces ces; 285 int n; 286 287 ces = wc_charset_to_ces(charset); 288 if (ces) 289 return ces; 290 291 for (n = 0; *p && n < 15; p++) { 292 if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-') 293 buf[n++] = tolower(*p); 294 } 295 buf[n] = 0; 296 p = buf; 297 switch (*p) { 298 case 'e': 299 switch (*(p+1)) { 300 case 'j': return WC_CES_EUC_JP; 301 case 'c': return WC_CES_EUC_CN; 302 case 't': return WC_CES_EUC_TW; 303 case 'k': return WC_CES_EUC_KR; 304 } 305 return WC_CES_EUC_JP; 306 case 'j': 307 p++; 308 if (*p == 'o') 309 return WC_CES_JOHAB; 310 if (*p == 'p') 311 p++; 312 if (*p == '2') 313 return WC_CES_ISO_2022_JP_2; 314 if (*p == '3') 315 return WC_CES_ISO_2022_JP_3; 316 return WC_CES_ISO_2022_JP; 317 case 's': 318 return WC_CES_SHIFT_JIS; 319 case 'g': 320 return WC_CES_EUC_CN; 321 case 'b': 322 return WC_CES_BIG5; 323 case 'h': 324 if (*(p+1) == 'k') 325 return WC_CES_HKSCS; 326 return WC_CES_HZ_GB_2312; 327 case 'k': 328 if (*(p+1) == 'o') 329 return WC_CES_KOI8_R; 330 return WC_CES_ISO_2022_KR; 331 case 'l': 332 n = atoi(p + 1); 333 if (n >= 1 && n <= 16 && n != 12) 334 return (WC_CES_E_ISO_8859 | n); 335 return WC_CES_ISO_8859_1; 336 case 't': 337 if (*(p+1) == 'c') 338 return WC_CES_TCVN_5712; 339 return WC_CES_TIS_620; 340 case 'n': 341 return WC_CES_NEXTSTEP; 342 case 'v': 343 if (*(p+1) == 'p') 344 return WC_CES_VPS; 345 return WC_CES_VISCII_11; 346 #ifdef USE_UNICODE 347 case 'u': 348 if (*(p+1) == '7') 349 return WC_CES_UTF_7; 350 return WC_CES_UTF_8; 351 #endif 352 case 'a': 353 return WC_CES_US_ASCII; 354 case 'c': 355 return WC_CES_ISO_2022_CN; 356 case 'w': 357 p++; 358 if (*p >= '1' && *p <= '9') 359 return wc_codepage(atoi(p)); 360 break; 361 case 'r': 362 return WC_CES_RAW; 363 } 364 return 0; 365 } 366 367 wc_ces 368 wc_locale_to_ces(char *locale) 369 { 370 char *p = locale; 371 char buf[8]; 372 int n; 373 374 if (*p == 'C' && *(p+1) == '\0') 375 return WC_CES_US_ASCII; 376 #ifdef HAVE_LANGINFO_CODESET 377 { 378 char *cs = nl_langinfo(CODESET); 379 if (cs && strcmp(cs, "US-ASCII")) 380 return wc_charset_to_ces(cs); 381 } 382 #endif 383 for (n = 0; *p && *p != '.' && n < 7; p++) { 384 if ((unsigned char)*p > 0x20) 385 buf[n++] = tolower(*p); 386 } 387 buf[n] = 0; 388 if (*p == '.') { 389 p++; 390 if (! strcasecmp(p, "euc")) { 391 switch (buf[0]) { 392 case 'j': 393 WcLocale = WC_LOCALE_JA_JP; 394 break; 395 case 'k': 396 WcLocale = WC_LOCALE_KO_KR; 397 break; 398 case 'z': 399 if (!strcmp(buf, "zh_tw")) 400 WcLocale = WC_LOCALE_ZH_TW; 401 else if (!strcmp(buf, "zh_hk")) 402 WcLocale = WC_LOCALE_ZH_HK; 403 else 404 WcLocale = WC_LOCALE_ZH_CN; 405 break; 406 default: 407 WcLocale = 0; 408 break; 409 } 410 } 411 return wc_charset_to_ces(p); 412 } 413 414 if (!strcmp(buf, "japanese")) 415 return WC_CES_SHIFT_JIS; 416 if (!strcmp(buf, "zh_tw") || 417 !strcmp(buf, "zh_hk")) 418 return WC_CES_BIG5; 419 for (n = 0; lang_ces_table[n].lang; n++) { 420 if (!strncmp(buf, lang_ces_table[n].lang, 2)) 421 return lang_ces_table[n].ces; 422 } 423 return WC_CES_ISO_8859_1; 424 } 425 426 char * 427 wc_ces_to_charset(wc_ces ces) 428 { 429 if (ces == WC_CES_WTF) 430 return "WTF"; 431 return WcCesInfo[WC_CES_INDEX(ces)].name; 432 } 433 434 char * 435 wc_ces_to_charset_desc(wc_ces ces) 436 { 437 if (ces == WC_CES_WTF) 438 return "W3M Transfer Format"; 439 return WcCesInfo[WC_CES_INDEX(ces)].desc; 440 } 441 442 wc_ces 443 wc_guess_8bit_charset(wc_ces orig) 444 { 445 switch (orig) { 446 case WC_CES_ISO_2022_JP: 447 case WC_CES_ISO_2022_JP_2: 448 case WC_CES_ISO_2022_JP_3: 449 return WC_CES_EUC_JP; 450 case WC_CES_ISO_2022_KR: 451 return WC_CES_EUC_KR; 452 case WC_CES_ISO_2022_CN: 453 case WC_CES_HZ_GB_2312: 454 return WC_CES_EUC_CN; 455 case WC_CES_US_ASCII: 456 return WC_CES_ISO_8859_1; 457 } 458 return orig; 459 } 460 461 wc_bool 462 wc_check_ces(wc_ces ces) 463 { 464 size_t i = WC_CES_INDEX(ces); 465 466 return (i <= WC_CES_END && WcCesInfo[i].id == ces); 467 } 468 469 static int 470 wc_ces_list_cmp(const void *a, const void *b) 471 { 472 return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc); 473 } 474 475 static wc_ces_list *list = NULL; 476 477 wc_ces_list * 478 wc_get_ces_list(void) 479 { 480 wc_ces_info *info; 481 size_t n; 482 483 if (list) 484 return list; 485 for (info = WcCesInfo, n = 0; info->id; info++) { 486 if (info->name != NULL) 487 n++; 488 } 489 list = New_N(wc_ces_list, n + 1); 490 for (info = WcCesInfo, n = 0; info->id; info++) { 491 if (info->name != NULL) { 492 list[n].id = info->id; 493 list[n].name = info->name; 494 list[n].desc = info->desc; 495 n++; 496 } 497 } 498 list[n].id = 0; 499 list[n].name = NULL; 500 list[n].desc = NULL; 501 qsort(list, n, sizeof(wc_ces_list), wc_ces_list_cmp); 502 return list; 503 }