w3m

Unnamed repository; edit this file to name it for gitweb.
git clone https://logand.com/git/w3m.git/
Log | Files | Refs | README

utf8.c (8096B)


      1 
      2 #ifdef USE_UNICODE
      3 
      4 #include "wc.h"
      5 #include "ucs.h"
      6 #include "utf8.h"
      7 #include "wtf.h"
      8 
      9 wc_uint8 WC_UTF8_MAP[ 0x100 ] = {
     10    8, 8, 8, 8, 8, 8, 8, 8,  8, 8, 8, 8, 8, 8, 8, 8,
     11    8, 8, 8, 8, 8, 8, 8, 8,  8, 8, 8, 8, 8, 8, 8, 8,
     12    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     13    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     14    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     15    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     16    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
     17    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 8,
     18 
     19    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     20    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     21    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     22    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
     23    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
     24    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
     25    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
     26    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 7, 7,
     27 };
     28 
     29 static wc_uchar utf8_buf[7];
     30 
     31 size_t
     32 wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8)
     33 {
     34     if (ucs < WC_C_UTF8_L2) {
     35 	utf8[0] =   ucs;
     36 	utf8[1] = 0;
     37 	return 1;
     38     } else if (ucs < WC_C_UTF8_L3) {
     39 	utf8[0] =  (ucs >> 6)          | 0xc0;
     40 	utf8[1] =  (ucs        & 0x3f) | 0x80;
     41 	utf8[2] = 0;
     42 	return 2;
     43     } else if (ucs < WC_C_UTF8_L4) {
     44 	utf8[0] =  (ucs >> 12)         | 0xe0;
     45 	utf8[1] = ((ucs >> 6)  & 0x3f) | 0x80;
     46 	utf8[2] =  (ucs        & 0x3f) | 0x80;
     47 	utf8[3] = 0;
     48 	return 3;
     49     } else if (ucs < WC_C_UTF8_L5) {
     50 	utf8[0] =  (ucs >> 18)         | 0xf0;
     51 	utf8[1] = ((ucs >> 12) & 0x3f) | 0x80;
     52 	utf8[2] = ((ucs >> 6)  & 0x3f) | 0x80;
     53 	utf8[3] =  (ucs        & 0x3f) | 0x80;
     54 	utf8[4] = 0;
     55 	return 4;
     56     } else if (ucs < WC_C_UTF8_L6) {
     57 	utf8[0] =  (ucs >> 24)         | 0xf8;
     58 	utf8[1] = ((ucs >> 18) & 0x3f) | 0x80;
     59 	utf8[2] = ((ucs >> 12) & 0x3f) | 0x80;
     60 	utf8[3] = ((ucs >> 6)  & 0x3f) | 0x80;
     61 	utf8[4] =  (ucs        & 0x3f) | 0x80;
     62 	utf8[5] = 0;
     63 	return 5;
     64     } else if (ucs <= WC_C_UCS4_END) {
     65 	utf8[0] =  (ucs >> 30)         | 0xfc;
     66 	utf8[1] = ((ucs >> 24) & 0x3f) | 0x80;
     67 	utf8[2] = ((ucs >> 18) & 0x3f) | 0x80;
     68 	utf8[3] = ((ucs >> 12) & 0x3f) | 0x80;
     69 	utf8[4] = ((ucs >> 6)  & 0x3f) | 0x80;
     70 	utf8[5] =  (ucs        & 0x3f) | 0x80;
     71 	utf8[6] = 0;
     72 	return 6;
     73     } else {
     74 	utf8[0] = 0;
     75 	return 0;
     76     }
     77 }
     78 
     79 wc_uint32
     80 wc_utf8_to_ucs(wc_uchar *utf8)
     81 {
     82     wc_uint32 ucs;
     83 
     84     switch (WC_UTF8_MAP[utf8[0]]) {
     85     case 1:
     86 	ucs =  (wc_uint32) utf8[0];
     87 	if (ucs >= WC_C_UTF8_L2)
     88 	    break;
     89 	return ucs;
     90     case 2:
     91 	ucs = ((wc_uint32)(utf8[0] & 0x1f) << 6)
     92 	    |  (wc_uint32)(utf8[1] & 0x3f);
     93 	if (ucs < WC_C_UTF8_L2)
     94 	    break;
     95 	return ucs;
     96     case 3:
     97 	ucs = ((wc_uint32)(utf8[0] & 0x0f) << 12)
     98 	    | ((wc_uint32)(utf8[1] & 0x3f) << 6)
     99 	    |  (wc_uint32)(utf8[2] & 0x3f);
    100 	if (ucs < WC_C_UTF8_L3)
    101 	    break;
    102 	return ucs;
    103     case 4:
    104 	ucs = ((wc_uint32)(utf8[0] & 0x07) << 18)
    105 	    | ((wc_uint32)(utf8[1] & 0x3f) << 12)
    106 	    | ((wc_uint32)(utf8[2] & 0x3f) << 6)
    107 	    |  (wc_uint32)(utf8[3] & 0x3f);
    108 	if (ucs < WC_C_UTF8_L4)
    109 	    break;
    110 	return ucs;
    111     case 5:
    112 	ucs = ((wc_uint32)(utf8[0] & 0x03) << 24)
    113 	    | ((wc_uint32)(utf8[1] & 0x3f) << 18)
    114 	    | ((wc_uint32)(utf8[2] & 0x3f) << 12)
    115 	    | ((wc_uint32)(utf8[3] & 0x3f) << 6)
    116 	    |  (wc_uint32)(utf8[4] & 0x3f);
    117 	if (ucs < WC_C_UTF8_L5)
    118 	    break;
    119 	return ucs;
    120     case 6:
    121 	ucs = ((wc_uint32)(utf8[0] & 0x01) << 30)
    122 	    | ((wc_uint32)(utf8[1] & 0x3f) << 24)
    123 	    | ((wc_uint32)(utf8[2] & 0x3f) << 18)
    124 	    | ((wc_uint32)(utf8[3] & 0x3f) << 12)
    125 	    | ((wc_uint32)(utf8[4] & 0x3f) << 6)
    126 	    |  (wc_uint32)(utf8[5] & 0x3f);
    127 	if (ucs < WC_C_UTF8_L6)
    128 	    break;
    129 	return ucs;
    130     default:
    131 	break;
    132     }
    133     return WC_C_UCS4_ERROR;
    134 }
    135 
    136 Str
    137 wc_conv_from_utf8(Str is, wc_ces ces)
    138 {
    139     Str os;
    140     wc_uchar *sp = (wc_uchar *)is->ptr;
    141     wc_uchar *ep = sp + is->length;
    142     wc_uchar *p;
    143     wc_uchar *q = NULL;
    144     int state = WC_UTF8_NOSTATE;
    145     size_t next = 0;
    146     wc_uint32 ucs;
    147     wc_status st;
    148 
    149     for (p = sp; p < ep && *p < 0x80; p++)
    150 	;
    151     if (p == ep)
    152 	return is;
    153     os = Strnew_size(is->length * 4 / 3);
    154     if (p > sp)
    155 	Strcat_charp_n(os, is->ptr, (int)(p - sp));
    156 
    157     st.tag = NULL;
    158     st.ntag = 0;
    159     for (; p < ep; p++) {
    160 	switch (state) {
    161 	case WC_UTF8_NOSTATE:
    162 	    next = WC_UTF8_MAP[*p];
    163 	    switch (next) {
    164 	    case 1:
    165 		wtf_push_ucs(os, (wc_uint32)*p, &st);
    166 		break;
    167 	    case 8:
    168 		Strcat_char(os, (char)*p);
    169 		break;
    170 	    case 0:
    171 	    case 7:
    172 		wtf_push_unknown(os, p, 1);
    173 		break;
    174 	    default:
    175 		q = p;
    176 		next--;
    177 		state = WC_UTF8_NEXT;
    178 		break;
    179 	    }
    180 	    break;
    181 	case WC_UTF8_NEXT:
    182 	    if (WC_UTF8_MAP[*p]) {
    183 		wtf_push_unknown(os, q, p - q + 1);
    184 		state = WC_UTF8_NOSTATE;
    185 		break;
    186 	    }
    187 	    if (--next)
    188 		break;
    189 	    state = WC_UTF8_NOSTATE;
    190 	    ucs = wc_utf8_to_ucs(q);
    191 	    if (ucs == WC_C_UCS4_ERROR ||
    192 		(ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
    193 		wtf_push_unknown(os, q, p - q + 1);
    194 	    else if (ucs != WC_C_UCS2_BOM)
    195 		wtf_push_ucs(os, ucs, &st);
    196 	    break;
    197 	}
    198     }
    199     switch (state) {
    200     case WC_UTF8_NEXT:
    201 	wtf_push_unknown(os, q, p - q);
    202 	break;
    203     }
    204     return os;
    205 }
    206 
    207 static int
    208 wc_push_tag_to_utf8(Str os, int ntag)
    209 {
    210     char *p;
    211 
    212     if (ntag) {
    213 	p = wc_ucs_get_tag(ntag);
    214 	if (p == NULL)
    215 	    ntag = 0;
    216     }
    217     if (ntag) {
    218 	wc_ucs_to_utf8(WC_C_LANGUAGE_TAG, utf8_buf);
    219 	Strcat_charp(os, (char *)utf8_buf);
    220 	for (; *p; p++) {
    221 	    wc_ucs_to_utf8(WC_C_LANGUAGE_TAG0 | *p, utf8_buf);
    222 	    Strcat_charp(os, (char *)utf8_buf);
    223 	}
    224     } else {
    225 	wc_ucs_to_utf8(WC_C_CANCEL_TAG, utf8_buf);
    226 	Strcat_charp(os, (char *)utf8_buf);
    227     }
    228     return ntag;
    229 }
    230 
    231 void
    232 wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st)
    233 {
    234   while (1) {
    235     switch (WC_CCS_SET(cc.ccs)) {
    236     case WC_CCS_US_ASCII:
    237 	if (st->ntag)
    238 	    st->ntag = wc_push_tag_to_utf8(os, 0);
    239 	Strcat_char(os, (char)(cc.code & 0x7f));
    240 	return;
    241     case WC_CCS_UCS2:
    242     case WC_CCS_UCS4:
    243 	if (st->ntag)
    244 	    st->ntag = wc_push_tag_to_utf8(os, 0);
    245 	wc_ucs_to_utf8(cc.code, utf8_buf);
    246 	Strcat_charp(os, (char *)utf8_buf);
    247 	return;
    248     case WC_CCS_UCS_TAG:
    249 	if (WcOption.use_language_tag && wc_ucs_tag_to_tag(cc.code) != st->ntag)
    250 	    st->ntag = wc_push_tag_to_utf8(os, wc_ucs_tag_to_tag(cc.code));
    251 	wc_ucs_to_utf8(wc_ucs_tag_to_ucs(cc.code), utf8_buf);
    252 	Strcat_charp(os, (char *)utf8_buf);
    253 	return;
    254     case WC_CCS_ISO_8859_1:
    255 	if (st->ntag)
    256 	    st->ntag = wc_push_tag_to_utf8(os, 0);
    257 	wc_ucs_to_utf8((cc.code | 0x80), utf8_buf);
    258 	Strcat_charp(os, (char *)utf8_buf);
    259 	return;
    260     case WC_CCS_UNKNOWN_W:
    261 	if (!WcOption.no_replace) {
    262 	    if (st->ntag)
    263 	        st->ntag = wc_push_tag_to_utf8(os, 0);
    264 	    Strcat_charp(os, WC_REPLACE_W);
    265 	}
    266 	return;
    267     case WC_CCS_UNKNOWN:
    268 	if (!WcOption.no_replace) {
    269 	    if (st->ntag)
    270 	        st->ntag = wc_push_tag_to_utf8(os, 0);
    271 	    Strcat_charp(os, WC_REPLACE);
    272 	}
    273 	return;
    274     default:
    275 	if (WcOption.ucs_conv &&
    276 		(cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR)
    277 	    cc.ccs = WC_CCS_UCS2;
    278 	else
    279 	    cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
    280 	continue;
    281     }
    282   }
    283 }
    284 
    285 void
    286 wc_push_to_utf8_end(Str os, wc_status *st)
    287 {
    288     if (st->ntag)
    289 	st->ntag = wc_push_tag_to_utf8(os, 0);
    290     return;
    291 }
    292 
    293 Str
    294 wc_char_conv_from_utf8(wc_uchar c, wc_status *st)
    295 {
    296     static Str os;
    297     static wc_uchar buf[6];
    298     static size_t nbuf, next;
    299     wc_uint32 ucs;
    300 
    301     if (st->state == -1) {
    302 	st->state = WC_UTF8_NOSTATE;
    303 	os = Strnew_size(8);
    304 	st->tag = NULL;
    305 	st->ntag = 0;
    306 	nbuf = 0;
    307     }
    308 
    309     switch (st->state) {
    310     case WC_UTF8_NOSTATE:
    311 	switch (next = WC_UTF8_MAP[c]) {
    312 	case 1:
    313 	    wtf_push_ucs(os, (wc_uint32)c, st);
    314 	    break;
    315 	case 8:
    316 	    Strcat_char(os, (char)c);
    317 	    break;
    318 	case 0:
    319 	case 7:
    320 	    break;
    321 	default:
    322 	    buf[nbuf++] = c;
    323 	    next--;
    324 	    st->state = WC_UTF8_NEXT;
    325 	    return NULL;
    326 	}
    327 	break;
    328     case WC_UTF8_NEXT:
    329 	if (WC_UTF8_MAP[c])
    330 	    break;
    331 	buf[nbuf++] = c;
    332 	if (--next)
    333 	    return NULL;
    334 	ucs = wc_utf8_to_ucs(buf);
    335 	if (ucs == WC_C_UCS4_ERROR ||
    336 	    (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END))
    337 	    break;
    338 	if (ucs != WC_C_UCS2_BOM)
    339 	    wtf_push_ucs(os, ucs, st);
    340 	break;
    341     }
    342     st->state = -1;
    343     return os;
    344 }
    345 
    346 #endif