mk_gb18030_ucs_map.pl (2768B)
# mk_gb18030_ucs_map.pl
#
# Builds "<name>_ucs.map", a C fragment of wc_map3 range tables, from the
# character-mapping files listed after __END__.
#
# Every DATA line has the form "<name> <mapping-file> <description>".
# A mapping file holds "<code> <ucs>" pairs in hex; "#" starts a comment.
#
# The entry named "gbk" is only loaded: its UCS coverage is remembered so
# that the following entry can tell which of its own mappings are
# extensions beyond it.  For every other entry three tables are written,
# each row being { first, last, value-for-first }:
#
#   gbk_ext_ucs_map   double-byte code run  -> first UCS value of the run
#   ucs_gbk_ext_map   UCS run               -> first double-byte code
#   ucs_<name>_map    run of UCS values with no mapping in the file
#                     -> running index of the first value of the run

use strict;
use warnings;

# Ranges scanned for the "no mapping in the file" table: U+0080..U+FFFF
# without the surrogate block U+D800..U+DFFF.  The running index continues
# across the gap, but a run never does.
my @UNMAPPED_SCAN_BLOCKS = ([0x0080, 0xD7FF], [0xE000, 0xFFFF]);

# Write one wc_map3 table to $out.
#   $storage   text placed before "wc_map3" ('' or 'static ')
#   $ident     C identifier of the array; the size macro is N_<ident>
#   $value_of  hash ref: first key of a run -> value printed in column 3
#   $end_of    hash ref: first key of a run -> last key of that run
sub _print_map3_table {
    my ($out, $storage, $ident, $value_of, $end_of) = @_;

    my @starts = sort { $a <=> $b } keys %{$value_of};
    my $count  = @starts;

    print {$out} <<"EOF";

#define N_${ident} $count

${storage}wc_map3 ${ident}[ N_${ident} ] = {
EOF
    for my $start (@starts) {
        printf {$out} " { 0x%.4X, 0x%.4X, 0x%.4X },\n",
            $start, $end_of->{$start}, $value_of->{$start};
    }
    print {$out} <<"EOF";
};
EOF
    return;
}

# Read the list of encodings from the DATA section, keeping file order.
my @names;
my %map_file_of;
my %description_of;
while (my $line = <DATA>) {
    # chomp, not chop: the last DATA line may lack a trailing newline and
    # chop would then eat a real character.
    chomp $line;
    $line =~ s/\s*$//;
    my ($name, $map_file, $description) = split ' ', $line, 3;
    next unless defined $description;    # need all three fields
    push @names, $name;
    $map_file_of{$name}    = $map_file;
    $description_of{$name} = $description;
}

# UCS -> code of the "gbk" entry, filled when that entry is processed.
my %gbk_from_ucs;

for my $name (@names) {
    my $code = $description_of{$name};
    my $map  = $map_file_of{$name};

    print "$name\t$map\t$code\n";

    # Load the mapping file.
    my %to_ucs;      # code -> UCS, only for UCS values the gbk entry lacks
    my %from_ucs;    # UCS  -> code, every mapping in the file
    open my $map_fh, '<', $map
        or die "cannot open mapping file '$map': $!\n";
    while (my $line = <$map_fh>) {
        next if $line =~ /^#/;
        $line =~ s/#.*//;
        my ($code_hex, $ucs_hex) = split ' ', $line;
        # Skip blank lines and lines that carry a code but no UCS column.
        next unless defined $ucs_hex;
        my $mb  = hex $code_hex;
        my $ucs = hex $ucs_hex;
        $from_ucs{$ucs} = $mb;
        $to_ucs{$mb} = $ucs unless $gbk_from_ucs{$ucs};
    }
    close $map_fh;

    if ($name eq "gbk") {
        %gbk_from_ucs = %from_ucs;
        next;
    }

    # Double-byte extension runs: walk every code in 0x8140..0xFEFE (trail
    # byte 0x7F does not exist) and group codes whose UCS values are
    # consecutive.
    my (%to_ucs2, %from_ucs2, %gbk_end, %ucs2_end);
    my ($run_code, $run_ucs);      # first code / UCS of the open run
    my ($prev_code, $prev_ucs);    # last code visited / last UCS seen
    my $in_run = 0;
    for my $lead (0x81 .. 0xFE) {
        for my $trail (0x40 .. 0x7E, 0x80 .. 0xFE) {
            my $mb  = ($lead << 8) + $trail;
            my $ucs = $to_ucs{$mb};

            # An open run ends when this code is unmapped or its UCS value
            # does not directly follow the previous one.
            if ($in_run && !($ucs && $ucs == $prev_ucs + 1)) {
                $ucs2_end{$run_ucs} = $prev_ucs;
                $gbk_end{$run_code} = $prev_code;
                $in_run = 0;
            }
            if ($ucs) {
                if (!$in_run) {
                    $to_ucs2{$mb}    = $ucs;
                    $from_ucs2{$ucs} = $mb;
                    ($run_code, $run_ucs) = ($mb, $ucs);
                    $in_run = 1;
                }
                $prev_ucs = $ucs;
            }
            $prev_code = $mb;
        }
    }
    if ($in_run) {
        $ucs2_end{$run_ucs} = $prev_ucs;
        $gbk_end{$run_code} = 0xFEFE;
    }

    # Runs of UCS values that have no mapping in the file.  $index counts
    # every such value, so each run records the index of its first member.
    my (%from_ucs4, %ucs4_end);
    my $index = 0;
    for my $block (@UNMAPPED_SCAN_BLOCKS) {
        my ($first, $last) = @{$block};
        my $run_start;
        for my $ucs ($first .. $last) {
            if ($from_ucs{$ucs}) {
                if (defined $run_start) {
                    $ucs4_end{$run_start} = $ucs - 1;
                    undef $run_start;
                }
                next;
            }
            if (!defined $run_start) {
                $run_start = $ucs;
                $from_ucs4{$ucs} = $index;
            }
            $index++;
        }
        # A run still open at the end of a block ends ON the block's last
        # value.  (The old code stored 0xD7FE for a run reaching U+D7FF,
        # dropping U+D7FF from the range although it had been counted.)
        $ucs4_end{$run_start} = $last if defined $run_start;
    }

    my $out_file = "${name}_ucs.map";
    open my $out, '>', $out_file
        or die "cannot create '$out_file': $!\n";

    # Disabled longer header, kept for reference:
    #   print OUT <<EOF;
    # /*
    #   These conversion tables between $code and
    #   Unicode were made from
    #
    #   ftp://ftp.unicode.org/Public/MAPPINGS/$map.
    # */
    print {$out} <<"EOF";
/* $code */
EOF

    _print_map3_table($out, '',        'gbk_ext_ucs_map',  \%to_ucs2,   \%gbk_end);
    _print_map3_table($out, 'static ', 'ucs_gbk_ext_map',  \%from_ucs2, \%ucs2_end);
    _print_map3_table($out, 'static ', "ucs_${name}_map",  \%from_ucs4, \%ucs4_end);

    # Buffered write errors only surface at close.
    close $out
        or die "error writing '$out_file': $!\n";
}

__END__
gbk      VENDORS/MICSFT/WINDOWS/CP936.TXT  GBK (Chinese)
gb18030  GBK.TXT                           GB18030 (Chinese)