#!/usr/bin/perl
#
# Convert font from JISX0208.1983 to Unicode encoding
# May 2004 --pkv
# See http://www.tzone.org/~vandry/howto/bigger-fonts-in-xterm.html
#
# Reads the font text from stdin (or from files named on the command
# line) and writes the converted font to stdout.  Lines of the form
# "ENCODING <decimal JIS code>" are rewritten to carry the Unicode code
# point; the FONT, CHARSET_REGISTRY and CHARSET_ENCODING lines are
# rewritten from JISX0208.1983-0 to ISO10646-1.  Every other line is
# passed through untouched.
#

use strict;
use warnings;

###
### Build a mapping from JIS codes to Unicode using characters not
### found in Kanjidic
###

my $jis_to_unicode = {
    0x2121 => 0x3000, # wide space
    0x2122 => 0x3001, # japanese comma
    0x2123 => 0x3002, # japanese dot
    0x2124 => 0xff0c, # wide comma
    0x2125 => 0xff0e, # wide dot
    0x2126 => 0x30fb, # wide center dot
    0x2127 => 0xff1a, # wide colon
    0x2128 => 0xff1b, # wide semicolon
    0x2129 => 0xff1f, # wide question mark
    0x212a => 0xff01, # wide bang
    0x212b => 0x309b, # voice diacritic
    0x212c => 0x309c, # circle diacritic
    0x212d => 0x00b4, # acute accent
    0x212e => 0xff40, # grave accent
    0x212f => 0x00a8, # umlaut
    0x2130 => 0xff3e, # circumflex
    0x2131 => 0xffe3, # overline
    0x2132 => 0xff3f, # underline
    0x2133 => 0x30fd, # repetition mark in katakana
    0x2134 => 0x30fe, # voiced repetition mark in katakana
    0x2135 => 0x309d, # repetition mark in hiragana
    0x2136 => 0x309e, # voiced repetition mark in hiragana
    0x2137 => 0x3003, # ditto mark
    0x2138 => 0x4edd, # "as above" mark
    0x2139 => 0x3005, # repetition of kanji
    0x213a => 0x3006, # end or closure mark
    0x213b => 0x3007, # circle
    0x213c => 0x30fc, # long sound
    0x213d => 0x2015, # horizontal bar
    0x213e => 0x2010, # short horizontal bar
    0x213f => 0xff0f, # wide slash
    0x2140 => 0xff3c, # wide backslash
    0x2141 => 0x301c, # wide tilde
    0x2142 => 0x2016, # double vertical bar
    0x2143 => 0xff5c, # single vertical bar
    0x2144 => 0x2026, # three dots
    0x2145 => 0x2025, # two dots
    0x2146 => 0x2018, # open single quote
    0x2147 => 0x2019, # close single quote
    0x2148 => 0x201c, # open double quote
    0x2149 => 0x201d, # close double quote
    0x214a => 0xff08, # wide left parenthesis
    0x214b => 0xff09, # wide right parenthesis
    0x214c => 0x3014, # wide left bent square bracket
    0x214d => 0x3015, # wide right bent square bracket
    0x214e => 0xff3b, # wide left square bracket
    0x214f => 0xff3d, # wide right square bracket
    0x2150 => 0xff5b, # wide left curly bracket
    0x2151 => 0xff5d, # wide right curly bracket
    0x2152 => 0x3008, # wide less than
    0x2153 => 0x3009, # wide greater than
    0x2154 => 0x300a, # wide much less than
    0x2155 => 0x300b, # wide much greater than
    0x2156 => 0x300c, # japanese open single quote
    0x2157 => 0x300d, # japanese close single quote
    0x2158 => 0x300e, # japanese open double quote
    0x2159 => 0x300f, # japanese close double quote
    0x215a => 0x3010, # bold left square bracket
    0x215b => 0x3011, # bold right square bracket
    0x215c => 0xff0b, # wide plus
    0x215d => 0x2212, # wide minus
    0x215e => 0x00b1, # plus or minus
    0x215f => 0x00d7, # cross multiplication
    0x2160 => 0x00f7, # divide
    0x2161 => 0xff1d, # wide equals
    0x2162 => 0x2260, # wide not equal
    0x2163 => 0xff1c, # wide smashed less than
    0x2164 => 0xff1e, # wide smashed greater than
    0x2165 => 0x2266, # less than or equal to
    0x2166 => 0x2267, # greater than or equal to
    0x2167 => 0x221e, # infinity
    0x2168 => 0x2234, # therefore
    0x2169 => 0x2642, # male
    0x216a => 0x2640, # female
    0x216b => 0x00b0, # degree
    0x216c => 0x2032, # minute
    0x216d => 0x2033, # second
    0x216e => 0x2103, # degrees celsius
    0x216f => 0xffe5, # Yen
    0x2170 => 0xff04, # wide dollar
    0x2171 => 0x00a2, # cents
    0x2172 => 0x00a3, # pound
    0x2173 => 0xff05, # wide percent
    0x2174 => 0xff03, # wide hash
    0x2175 => 0xff06, # wide ampersand
    0x2176 => 0xff0a, # wide star
    0x2177 => 0xff20, # wide @
    0x2178 => 0x00a7, # section mark
    0x2179 => 0x2606, # open star
    0x217a => 0x2605, # closed star
    0x217b => 0x25cb, # open circle
    0x217c => 0x25cf, # closed circle
    0x217d => 0x25ce, # double circle
    0x217e => 0x25c7, # open rotated square
    0x2221 => 0x25c6, # closed rotated square
    0x2222 => 0x25a1, # open box
    0x2223 => 0x25a0, # closed box
    0x2224 => 0x25b3, # open triangle
    0x2225 => 0x25b2, # closed triangle
    0x2226 => 0x25bd, # open upside down triangle
    0x2227 => 0x25bc, # closed upside down triangle
    0x2228 => 0x203b, # fancy thing
    0x2229 => 0x3012, # post
    0x222a => 0x2192, # right arrow
    0x222b => 0x2190, # left arrow
    0x222c => 0x2191, # up arrow
    0x222d => 0x2193, # down arrow
    0x222e => 0x3013, # thick equals
    0x223a => 0x2208, # element of
    0x223b => 0x220b, # backwards element of
    0x223c => 0x2286, # subset of
    0x223d => 0x2287, # superset of
    0x223e => 0x2282, # proper subset of
    0x223f => 0x2283, # proper superset of
    0x2240 => 0x222a, # union
    0x2241 => 0x2229, # intersection
    0x224a => 0x2227, # hat
    0x224b => 0x2228, # vee
    0x224c => 0x00ac, # nook
    0x224d => 0x21d2, # implies
    0x224e => 0x21d4, # iff
    0x224f => 0x2200, # forall
    0x2250 => 0x2203, # exists
    0x225c => 0x2220, # angle
    0x225d => 0x22a5, # perpendicular
    0x225e => 0x2312, # top parenthesis
    0x225f => 0x2202, # partial differentiation
    0x2260 => 0x2207, # nabla
    0x2261 => 0x2261, # definition
    0x2262 => 0x2252, # funky equals
    0x2263 => 0x226a, # tight much less than
    0x2264 => 0x226b, # tight much greater than
    0x2265 => 0x221a, # root
    0x2266 => 0x223d, # chain link
    0x2267 => 0x221d, # open infinity
    0x2268 => 0x2235, # upside down therefore
    0x2269 => 0x222b, # integral
    0x226a => 0x222c, # double integral
    0x2272 => 0x212b, # capital A with circle above
    0x2273 => 0x2030, # perthousand
    0x2274 => 0x266f, # sharp
    0x2275 => 0x266d, # flat
    0x2276 => 0x266a, # note
    0x2277 => 0x2020, # dagger
    0x2278 => 0x2021, # double dagger
    0x2279 => 0x00b6, # paragraph
    0x227e => 0x25ef, # big open circle
    0x2727 => 0x0401, # capital IO
    0x2757 => 0x0451, # lowercase io
    0x2821 => 0x2500, # box drawing: -
    0x2822 => 0x2502, # box drawing: |
    0x2823 => 0x250c, # box drawing: /-
    0x2824 => 0x2510, # box drawing: -\
    0x2825 => 0x2518, # box drawing: -/
    0x2826 => 0x2514, # box drawing: \-
    0x2827 => 0x251c, # box drawing: |-
    0x2828 => 0x252c, # box drawing: T
    0x2829 => 0x2524, # box drawing: -|
    0x282a => 0x2534, # box drawing: _|_
    0x282b => 0x253c, # box drawing: +
    0x282c => 0x2501, # box drawing: bold -
    0x282d => 0x2503, # box drawing: bold |
    0x282e => 0x250f, # box drawing: bold /-
    0x282f => 0x2513, # box drawing: bold -\
    0x2830 => 0x251b, # box drawing: bold -/
    0x2831 => 0x2517, # box drawing: bold \-
    0x2832 => 0x2523, # box drawing: bold |-
    0x2833 => 0x2533, # box drawing: bold T
    0x2834 => 0x252b, # box drawing: bold -|
    0x2835 => 0x253b, # box drawing: bold _|_
    0x2836 => 0x254b, # box drawing: bold +
    0x2837 => 0x2520, # box drawing: bold left |-
    0x2838 => 0x252f, # box drawing: bold top T
    0x2839 => 0x2528, # box drawing: bold right -|
    0x283a => 0x2537, # box drawing: bold bottom _|_
    0x283b => 0x253f, # box drawing: bold h +
    0x283c => 0x251d, # box drawing: bold right |-
    0x283d => 0x2530, # box drawing: bold bottom T
    0x283e => 0x2525, # box drawing: bold left -|
    0x283f => 0x2538, # box drawing: bold top _|_
    0x2840 => 0x2542, # box drawing: bold v +
};

# wide ASCII: digits, capital letters, lowercase letters
for my $offset (0..9, 17..42, 49..74) {
    $jis_to_unicode->{$offset + 0x2330} = $offset + 0xff10;
}

# Hiragana
for my $offset (0..82) {
    $jis_to_unicode->{$offset + 0x2421} = $offset + 0x3041;
}

# Katakana
for my $offset (0..85) {
    $jis_to_unicode->{$offset + 0x2521} = $offset + 0x30a1;
}

# Greek letters 1
for my $offset (0..16) {
    $jis_to_unicode->{$offset + 0x2621} = $offset + 0x391;
    $jis_to_unicode->{$offset + 0x2641} = $offset + 0x3b1;
}

# Greek letters 2 (the Unicode side is shifted by one relative to the
# first run, so one Unicode code point is skipped)
for my $offset (17..23) {
    $jis_to_unicode->{$offset + 0x2621} = $offset + 0x392;
    $jis_to_unicode->{$offset + 0x2641} = $offset + 0x3b2;
}

# Cyrillic 1
for my $offset (0..5) {
    $jis_to_unicode->{$offset + 0x2721} = $offset + 0x410;
    $jis_to_unicode->{$offset + 0x2751} = $offset + 0x430;
}

# Cyrillic 2 (offset 6, i.e. 0x2727 / 0x2757, is listed explicitly in
# the table above)
for my $offset (7..32) {
    $jis_to_unicode->{$offset + 0x2721} = $offset + 0x40f;
    $jis_to_unicode->{$offset + 0x2751} = $offset + 0x42f;
}

###
### Supplement the mapping using Kanjidic
###

my $KANJIDIC = "/usr/share/edict/kanjidic";

open(my $kanjidic_fh, "<", $KANJIDIC)
    or die "cannot open kanjidic $KANJIDIC: $!";

# Fork a child whose stdout is connected to $ucs4_fh.  open() returns
# undef when the fork fails, which must not be mistaken for the 0 that
# identifies the child.
my $child_pid = open(my $ucs4_fh, "-|");
defined($child_pid)
    or die "cannot fork kanjidic converter: $!";

if ($child_pid == 0) {
    # Child: feed kanjidic to the pipeline on stdin.  The pipeline keeps
    # the first two space-separated fields of each line and converts
    # them from EUC-JP to UCS-4.
    open(STDIN, "<&", $kanjidic_fh)
        or die "cannot redirect stdin from kanjidic: $!";
    close($kanjidic_fh);
    exec('sh', '-c', 'cut -d" " -f1,2 | iconv -f EUC-JP -t UCS-4')
        or die "cannot exec sh: $!";
}
close($kanjidic_fh);
binmode($ucs4_fh);

# The pipe delivers 4-byte code units, decoded below as big-endian
# 32-bit integers.  A read may end in the middle of a unit, so the 0-3
# trailing bytes are carried over to the next read.
my $carry = '';
my $chunk;

# Line parser state:
#   'char' - waiting for the first character of a line (the kanji)
#   'sep'  - expecting the single space after the kanji
#   'code' - accumulating the hexadecimal JIS code
#   'skip' - ignoring the rest of a line that did not parse
my $state = 'char';
my $char;
my $value;

while (1) {
    my $got = read($ucs4_fh, $chunk, 4096);
    defined($got)
        or die "error reading converted kanjidic: $!";
    last if $got == 0;

    $chunk = $carry . $chunk;
    my $whole = length($chunk) - (length($chunk) % 4);
    $carry = substr($chunk, $whole);

    for my $code (unpack("N*", substr($chunk, 0, $whole))) {
        my $is_eol = ($code == 10) || ($code == 13);

        if ($state eq 'char') {
            # Blank lines and the second half of CR/LF are ignored.
            if (!$is_eol) {
                $char  = $code;
                $state = 'sep';
            }
        }
        elsif ($state eq 'sep') {
            $value = 0;
            if ($code == 32) {
                $state = 'code';
            }
            elsif ($is_eol) {
                # The line ended right after the character; start over
                # on the next line instead of skipping it.
                $state = 'char';
            }
            else {
                $state = 'skip';
            }
        }
        elsif ($state eq 'code') {
            if ($is_eol) {
                $jis_to_unicode->{$value} = $char;
                $state = 'char';
            }
            elsif ($code < 128 && chr($code) =~ /\A[0-9A-Fa-f]\z/) {
                $value = ($value << 4) + hex(chr($code));
            }
            else {
                $state = 'skip';
            }
        }
        elsif ($state eq 'skip') {
            $state = 'char' if $is_eol;
        }
    }
}

# close() on a pipe also reports the exit status of the child; the
# mapping built so far is still used, so only warn.
close($ucs4_fh)
    or warn "kanjidic conversion pipeline did not finish cleanly"
          . ($! ? ": $!" : " (exit status $?)") . "\n";

###
### Read the font from stdin and dump it to stdout translating
### using what we have just built
###

while (my $line = <>) {
    if ($line =~ /(^ENCODING\s+)(\d+)(\s*)$/s) {
        my ($keyword, $jis_code, $trailer) = ($1, $2, $3);
        if (defined($jis_to_unicode->{$jis_code})) {
            $line = $keyword . $jis_to_unicode->{$jis_code} . $trailer;
        }
        else {
            printf STDERR "unknown JIS character 0x%04x left alone\n", $jis_code;
        }
    }
    elsif ($line =~ /(^FONT\s.*)-JISX0208.1983-0(\s*)$/s) {
        $line = $1 . "-ISO10646-1" . $2;
    }
    elsif ($line =~ /(^CHARSET_REGISTRY\s+")JISX0208.1983("\s*)$/s) {
        $line = $1 . "ISO10646" . $2;
    }
    elsif ($line =~ /(^CHARSET_ENCODING\s+")0("\s*)$/s) {
        $line = $1 . "1" . $2;
    }
    print $line;
}