utf8.c (8096B)

#ifdef USE_UNICODE

#include "wc.h"
#include "ucs.h"
#include "utf8.h"
#include "wtf.h"

/*
 * Classification of every byte value as it can appear in a UTF-8 stream:
 *
 *   0     continuation byte (0x80-0xBF)
 *   1     printable ASCII (0x20-0x7E), a complete one-byte sequence
 *   2..6  lead byte of a sequence of that many bytes
 *         (0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF7, 0xF8-0xFB, 0xFC-0xFD)
 *   7     0xFE / 0xFF, never part of a sequence
 *   8     control characters (0x00-0x1F and 0x7F)
 */
wc_uint8 WC_UTF8_MAP[ 0x100 ] = {
    /* 0x00 - 0x1f */
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    /* 0x20 - 0x7f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8,

    /* 0x80 - 0xbf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7,
};

/* Scratch buffer for one encoded character: up to 6 bytes plus the NUL. */
static wc_uchar utf8_buf[7];

/*
 * Encode one code point as a NUL-terminated UTF-8 byte sequence.
 *
 * ucs   code point to encode
 * utf8  output buffer; must have room for 7 bytes (6 data bytes + NUL)
 *
 * Returns the number of data bytes written (1..6), not counting the NUL.
 * A value above WC_C_UCS4_END cannot be encoded: an empty string is stored
 * and 0 is returned.  Each WC_C_UTF8_Ln threshold is the first code point
 * that needs an n-byte form.
 */
size_t
wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8)
{
    if (ucs < WC_C_UTF8_L2) {
        utf8[0] = (wc_uchar)ucs;
        utf8[1] = 0;
        return 1;
    } else if (ucs < WC_C_UTF8_L3) {
        utf8[0] = (wc_uchar)((ucs >> 6) | 0xc0);
        utf8[1] = (wc_uchar)((ucs & 0x3f) | 0x80);
        utf8[2] = 0;
        return 2;
    } else if (ucs < WC_C_UTF8_L4) {
        utf8[0] = (wc_uchar)((ucs >> 12) | 0xe0);
        utf8[1] = (wc_uchar)(((ucs >> 6) & 0x3f) | 0x80);
        utf8[2] = (wc_uchar)((ucs & 0x3f) | 0x80);
        utf8[3] = 0;
        return 3;
    } else if (ucs < WC_C_UTF8_L5) {
        utf8[0] = (wc_uchar)((ucs >> 18) | 0xf0);
        utf8[1] = (wc_uchar)(((ucs >> 12) & 0x3f) | 0x80);
        utf8[2] = (wc_uchar)(((ucs >> 6) & 0x3f) | 0x80);
        utf8[3] = (wc_uchar)((ucs & 0x3f) | 0x80);
        utf8[4] = 0;
        return 4;
    } else if (ucs < WC_C_UTF8_L6) {
        utf8[0] = (wc_uchar)((ucs >> 24) | 0xf8);
        utf8[1] = (wc_uchar)(((ucs >> 18) & 0x3f) | 0x80);
        utf8[2] = (wc_uchar)(((ucs >> 12) & 0x3f) | 0x80);
        utf8[3] = (wc_uchar)(((ucs >> 6) & 0x3f) | 0x80);
        utf8[4] = (wc_uchar)((ucs & 0x3f) | 0x80);
        utf8[5] = 0;
        return 5;
    } else if (ucs <= WC_C_UCS4_END) {
        utf8[0] = (wc_uchar)((ucs >> 30) | 0xfc);
        utf8[1] = (wc_uchar)(((ucs >> 24) & 0x3f) | 0x80);
        utf8[2] = (wc_uchar)(((ucs >> 18) & 0x3f) | 0x80);
        utf8[3] = (wc_uchar)(((ucs >> 12) & 0x3f) | 0x80);
        utf8[4] = (wc_uchar)(((ucs >> 6) & 0x3f) | 0x80);
        utf8[5] = (wc_uchar)((ucs & 0x3f) | 0x80);
        utf8[6] = 0;
        return 6;
    } else {
        utf8[0] = 0;
        return 0;
    }
}

/* Encode ucs into the shared scratch buffer and append it to os. */
static void
utf8_cat_ucs(Str os, wc_uint32 ucs)
{
    wc_ucs_to_utf8(ucs, utf8_buf);
    Strcat_charp(os, (char *)utf8_buf);
}

/*
 * Return non-zero when utf8[1] .. utf8[ntrail] are all continuation bytes.
 * Scanning stops at the first offending byte, so a NUL-terminated buffer
 * holding a truncated sequence is never read past its terminator.
 */
static int
utf8_trail_ok(const wc_uchar *utf8, size_t ntrail)
{
    size_t i;

    for (i = 1; i <= ntrail; i++) {
        if (WC_UTF8_MAP[utf8[i]] != 0)
            return 0;
    }
    return 1;
}

/*
 * Decode the single UTF-8 sequence starting at utf8.
 *
 * The sequence length is taken from the class of the first byte in
 * WC_UTF8_MAP.  Returns the decoded code point, or WC_C_UCS4_ERROR when
 *   - the first byte is not class 1..6 (continuation byte, 0xFE/0xFF or a
 *     control character),
 *   - a trailing byte is not a continuation byte, or
 *   - the value is below the minimum for its length (overlong form).
 */
wc_uint32
wc_utf8_to_ucs(wc_uchar *utf8)
{
    wc_uint32 ucs;

    switch (WC_UTF8_MAP[utf8[0]]) {
    case 1:
        ucs = (wc_uint32)utf8[0];
        if (ucs >= WC_C_UTF8_L2)
            break;
        return ucs;
    case 2:
        if (!utf8_trail_ok(utf8, 1))
            break;
        ucs = ((wc_uint32)(utf8[0] & 0x1f) << 6)
            | (wc_uint32)(utf8[1] & 0x3f);
        if (ucs < WC_C_UTF8_L2)
            break;
        return ucs;
    case 3:
        if (!utf8_trail_ok(utf8, 2))
            break;
        ucs = ((wc_uint32)(utf8[0] & 0x0f) << 12)
            | ((wc_uint32)(utf8[1] & 0x3f) << 6)
            | (wc_uint32)(utf8[2] & 0x3f);
        if (ucs < WC_C_UTF8_L3)
            break;
        return ucs;
    case 4:
        if (!utf8_trail_ok(utf8, 3))
            break;
        ucs = ((wc_uint32)(utf8[0] & 0x07) << 18)
            | ((wc_uint32)(utf8[1] & 0x3f) << 12)
            | ((wc_uint32)(utf8[2] & 0x3f) << 6)
            | (wc_uint32)(utf8[3] & 0x3f);
        if (ucs < WC_C_UTF8_L4)
            break;
        return ucs;
    case 5:
        if (!utf8_trail_ok(utf8, 4))
            break;
        ucs = ((wc_uint32)(utf8[0] & 0x03) << 24)
            | ((wc_uint32)(utf8[1] & 0x3f) << 18)
            | ((wc_uint32)(utf8[2] & 0x3f) << 12)
            | ((wc_uint32)(utf8[3] & 0x3f) << 6)
            | (wc_uint32)(utf8[4] & 0x3f);
        if (ucs < WC_C_UTF8_L5)
            break;
        return ucs;
    case 6:
        if (!utf8_trail_ok(utf8, 5))
            break;
        ucs = ((wc_uint32)(utf8[0] & 0x01) << 30)
            | ((wc_uint32)(utf8[1] & 0x3f) << 24)
            | ((wc_uint32)(utf8[2] & 0x3f) << 18)
            | ((wc_uint32)(utf8[3] & 0x3f) << 12)
            | ((wc_uint32)(utf8[4] & 0x3f) << 6)
            | (wc_uint32)(utf8[5] & 0x3f);
        if (ucs < WC_C_UTF8_L6)
            break;
        return ucs;
    default:
        break;
    }
    return WC_C_UCS4_ERROR;
}

/*
 * Convert a UTF-8 string to the internal representation built by the
 * wtf_push_* helpers.
 *
 * is   input string
 * ces  not consulted by this function
 *
 * When every byte of the input is below 0x80 the input Str itself is
 * returned; otherwise a new Str is returned.  Within the converted part:
 *   - class-1 bytes and decoded code points go through wtf_push_ucs(),
 *     except WC_C_UCS2_BOM, which is dropped;
 *   - control characters are appended unchanged;
 *   - stray continuation bytes, 0xFE/0xFF, overlong forms, surrogate code
 *     points and truncated sequences go through wtf_push_unknown().
 *
 * A byte that interrupts a multi-byte sequence is not counted as part of
 * the broken sequence: only the bytes before it are pushed as unknown and
 * the byte itself is then handled as the start of fresh input, so a valid
 * character that follows a truncated one is still decoded.
 */
Str
wc_conv_from_utf8(Str is, wc_ces ces)
{
    Str os;
    wc_uchar *sp = (wc_uchar *)is->ptr;
    wc_uchar *ep = sp + is->length;
    wc_uchar *p;
    wc_uchar *q = NULL;         /* first byte of the sequence in progress */
    int state = WC_UTF8_NOSTATE;
    size_t next = 0;            /* continuation bytes still expected */
    wc_uint32 ucs;
    wc_status st;

    /* Skip the leading run of 7-bit bytes; it is copied through verbatim. */
    for (p = sp; p < ep && *p < 0x80; p++)
        ;
    if (p == ep)
        return is;
    os = Strnew_size(is->length * 4 / 3);
    if (p > sp)
        Strcat_charp_n(os, is->ptr, (int)(p - sp));

    st.tag = NULL;
    st.ntag = 0;
    for (; p < ep; p++) {
        if (state == WC_UTF8_NEXT) {
            if (WC_UTF8_MAP[*p] == 0) {
                if (--next)
                    continue;
                /* Sequence complete: q .. p inclusive. */
                state = WC_UTF8_NOSTATE;
                ucs = wc_utf8_to_ucs(q);
                if (ucs == WC_C_UCS4_ERROR ||
                    (ucs >= WC_C_UCS2_SURROGATE &&
                     ucs <= WC_C_UCS2_SURROGATE_END))
                    wtf_push_unknown(os, q, p - q + 1);
                else if (ucs != WC_C_UCS2_BOM)
                    wtf_push_ucs(os, ucs, &st);
                continue;
            }
            /*
             * Sequence cut short by a non-continuation byte: flush only
             * the broken prefix and fall through so that *p is examined
             * as a fresh byte.
             */
            wtf_push_unknown(os, q, p - q);
            state = WC_UTF8_NOSTATE;
        }

        next = WC_UTF8_MAP[*p];
        switch (next) {
        case 1:
            wtf_push_ucs(os, (wc_uint32)*p, &st);
            break;
        case 8:
            Strcat_char(os, (char)*p);
            break;
        case 0:
        case 7:
            wtf_push_unknown(os, p, 1);
            break;
        default:
            /* Lead byte: remember where the sequence starts. */
            q = p;
            next--;
            state = WC_UTF8_NEXT;
            break;
        }
    }
    /* Input ended in the middle of a sequence. */
    if (state == WC_UTF8_NEXT)
        wtf_push_unknown(os, q, p - q);
    return os;
}

/*
 * Append a language tag to os, or a cancel tag when ntag is 0 or
 * wc_ucs_get_tag() has no string for it.
 *
 * A tag is written as WC_C_LANGUAGE_TAG followed by one
 * (WC_C_LANGUAGE_TAG0 | byte) code point per byte of the tag string.
 * Returns the tag number now in effect (0 after a cancel tag).
 */
static int
wc_push_tag_to_utf8(Str os, int ntag)
{
    char *p = NULL;

    if (ntag) {
        p = wc_ucs_get_tag(ntag);
        if (p == NULL)
            ntag = 0;
    }
    if (ntag) {
        utf8_cat_ucs(os, WC_C_LANGUAGE_TAG);
        /* Go through wc_uchar so a negative char cannot sign-extend. */
        for (; *p; p++)
            utf8_cat_ucs(os, WC_C_LANGUAGE_TAG0 | (wc_uchar)*p);
    } else {
        utf8_cat_ucs(os, WC_C_CANCEL_TAG);
    }
    return ntag;
}

/* Emit a cancel tag if a language tag is currently in effect. */
static void
utf8_cancel_tag(Str os, wc_status *st)
{
    if (st->ntag)
        st->ntag = wc_push_tag_to_utf8(os, 0);
}

/*
 * Append character cc to os as UTF-8.
 *
 * st->ntag tracks the language tag in effect; every character set except
 * WC_CCS_UCS_TAG cancels an active tag before its output is written.
 * Character sets without a dedicated case are mapped through
 * wc_any_to_ucs() when WcOption.ucs_conv is set; if that is disabled or
 * fails they are treated as unknown characters, which produce
 * WC_REPLACE / WC_REPLACE_W unless WcOption.no_replace is set.
 */
void
wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st)
{
    while (1) {
        switch (WC_CCS_SET(cc.ccs)) {
        case WC_CCS_US_ASCII:
            utf8_cancel_tag(os, st);
            Strcat_char(os, (char)(cc.code & 0x7f));
            return;
        case WC_CCS_UCS2:
        case WC_CCS_UCS4:
            utf8_cancel_tag(os, st);
            utf8_cat_ucs(os, cc.code);
            return;
        case WC_CCS_UCS_TAG:
            if (WcOption.use_language_tag &&
                wc_ucs_tag_to_tag(cc.code) != st->ntag)
                st->ntag = wc_push_tag_to_utf8(os,
                                               wc_ucs_tag_to_tag(cc.code));
            utf8_cat_ucs(os, wc_ucs_tag_to_ucs(cc.code));
            return;
        case WC_CCS_ISO_8859_1:
            utf8_cancel_tag(os, st);
            utf8_cat_ucs(os, (cc.code | 0x80));
            return;
        case WC_CCS_UNKNOWN_W:
            if (!WcOption.no_replace) {
                utf8_cancel_tag(os, st);
                Strcat_charp(os, WC_REPLACE_W);
            }
            return;
        case WC_CCS_UNKNOWN:
            if (!WcOption.no_replace) {
                utf8_cancel_tag(os, st);
                Strcat_charp(os, WC_REPLACE);
            }
            return;
        default:
            /* Re-tag the character and go round the switch once more. */
            if (WcOption.ucs_conv &&
                (cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR)
                cc.ccs = WC_CCS_UCS2;
            else
                cc.ccs = WC_CCS_IS_WIDE(cc.ccs)
                    ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
            continue;
        }
    }
}

/* Finish an output stream: cancel a language tag that is still in effect. */
void
wc_push_to_utf8_end(Str os, wc_status *st)
{
    utf8_cancel_tag(os, st);
}

/*
 * Feed one input byte to the incremental UTF-8 decoder.
 *
 * c   next input byte
 * st  decoder state; st->state == -1 asks for a fresh start, which
 *     allocates a new output Str and clears st->tag / st->ntag
 *
 * Returns NULL while a multi-byte sequence is still incomplete.  Otherwise
 * returns the output Str and sets st->state back to -1.  The Str is empty
 * when the byte or sequence was discarded: stray continuation byte,
 * 0xFE/0xFF, overlong form, surrogate code point or WC_C_UCS2_BOM.
 *
 * A non-continuation byte arriving in the middle of a sequence discards
 * the partial sequence and is then handled as fresh input, so the
 * character it belongs to is not lost.
 *
 * The pending bytes and the output Str are kept in function-local statics,
 * so only one decoding stream can be in progress at a time.
 */
Str
wc_char_conv_from_utf8(wc_uchar c, wc_status *st)
{
    static Str os;
    static wc_uchar buf[6];
    static size_t nbuf, next;
    wc_uint32 ucs;

    if (st->state == -1) {
        st->state = WC_UTF8_NOSTATE;
        os = Strnew_size(8);
        st->tag = NULL;
        st->ntag = 0;
        nbuf = 0;
    }

    switch (st->state) {
    case WC_UTF8_NEXT:
        /*
         * The extra checks on next and nbuf keep buf[] in bounds even if
         * the statics do not match st (e.g. state set by hand).
         */
        if (WC_UTF8_MAP[c] == 0 && next > 0 && nbuf < sizeof(buf)) {
            buf[nbuf++] = c;
            if (--next)
                return NULL;
            ucs = wc_utf8_to_ucs(buf);
            if (ucs != WC_C_UCS4_ERROR &&
                !(ucs >= WC_C_UCS2_SURROGATE &&
                  ucs <= WC_C_UCS2_SURROGATE_END) &&
                ucs != WC_C_UCS2_BOM)
                wtf_push_ucs(os, ucs, st);
            break;
        }
        /* Sequence interrupted: drop it and treat c as fresh input. */
        nbuf = 0;
        st->state = WC_UTF8_NOSTATE;
        /* FALLTHROUGH */
    case WC_UTF8_NOSTATE:
        switch (next = WC_UTF8_MAP[c]) {
        case 1:
            wtf_push_ucs(os, (wc_uint32)c, st);
            break;
        case 8:
            Strcat_char(os, (char)c);
            break;
        case 0:
        case 7:
            break;
        default:
            /* Lead byte: start collecting a new sequence. */
            nbuf = 0;
            buf[nbuf++] = c;
            next--;
            st->state = WC_UTF8_NEXT;
            return NULL;
        }
        break;
    }
    st->state = -1;
    return os;
}

#endif