1#!/usr/bin/perl 2# 3# Produce a codepage matching table. For each 8-bit character, list 4# a primary and an alternate match (the latter used for case-insensitive 5# matching.) 6# 7# Usage: 8# cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp 9# 10# Note: for the format of the UnicodeData file, see: 11# http://www.unicode.org/Public/UNIDATA/UCD.html 12# 13 14($ucd, $cpco, $cpfs, $cpout) = @ARGV; 15 16if (!defined($cpout)) { 17 die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n"; 18} 19 20%ucase = (); 21%lcase = (); 22%tcase = (); 23%decomp = (); 24 25open(UCD, '<', $ucd) 26 or die "$0: could not open unicode data: $ucd: $!\n"; 27while (defined($line = <UCD>)) { 28 chomp $line; 29 @f = split(/;/, $line); 30 $n = hex $f[0]; 31 $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n; 32 $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n; 33 $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n; 34 if ($f[5] =~ /^[0-9A-F\s]+$/) { 35 # This character has a canonical decomposition. 36 # The regular expression rejects angle brackets, so other 37 # decompositions aren't permitted. 38 $decomp{$n} = []; 39 foreach my $dch (split(' ', $f[5])) { 40 push(@{$decomp{$n}}, hex $dch); 41 } 42 } 43} 44close(UCD); 45 46# 47# Filesystem and console codepages. The filesystem codepage is used 48# for FAT shortnames, whereas the console codepage is whatever is used 49# on the screen and keyboard. 50# 51@xtab = (undef) x 256; 52%tabx = (); 53open(CPFS, '<', $cpfs) 54 or die "$0: could not open fs codepage: $cpfs: $!\n"; 55while (defined($line = <CPFS>)) { 56 $line =~ s/\s*(\#.*|)$//; 57 @f = split(/\s+/, $line); 58 next if (scalar @f != 2); 59 next if (hex $f[0] > 255); 60 $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode 61 $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage 62} 63close(CPFS); 64 65@ytab = (undef) x 256; 66%taby = (); 67open(CPCO, '<', $cpco) 68 or die "$0: could not open console codepage: $cpco: $!\n"; 69while (defined($line = <CPCO>)) { 70 $line =~ s/\s*(\#.*|)$//; 71 @f = split(/\s+/, $line); 72 next if (scalar @f != 2); 73 next if (hex $f[0] > 255); 74 $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode 75 $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage 76} 77close(CPCO); 78 79open(CPOUT, '>', $cpout) 80 or die "$0: could not open output file: $cpout: $!\n"; 81# 82# Magic number, in anticipation of being able to load these 83# files dynamically... 84# 85print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1); 86 87# Header fields available for future use... 88print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0); 89 90# 91# Self (shortname) uppercase table. 92# This depends both on the console codepage and the filesystem codepage; 93# the logical transcoding operation is: 94# 95# $tabx{$ucase{$ytab[$i]}} 96# 97# ... where @ytab is console codepage -> Unicode and 98# %tabx is Unicode -> filesystem codepage. 99# 100@uctab = (undef) x 256; 101for ($i = 0; $i < 256; $i++) { 102 $uuc = $ucase{$ytab[$i]}; # Unicode upper case 103 if (defined($tabx{$uuc})) { 104 # Straight-forward conversion 105 $u = $tabx{$uuc}; 106 } elsif (defined($tabx{${$decomp{$uuc}}[0]})) { 107 # Upper case equivalent stripped of accents 108 $u = $tabx{${$decomp{$uuc}}[0]}; 109 } else { 110 # No equivalent at all found. Assume it is a lower-case-only 111 # character, like greek alpha in CP437. 112 $u = $i; 113 } 114 $uctab[$i] = $u; 115 print CPOUT pack("C", $u); 116} 117 118# 119# Self (shortname) lowercase table. 120# This depends both on the console codepage and the filesystem codepage; 121# the logical transcoding operation is: 122# 123# $taby{$lcase{$xtab[$i]}} 124# 125# ... where @ytab is console codepage -> Unicode and 126# %tabx is Unicode -> filesystem codepage. 127# 128@lctab = (undef) x 256; 129for ($i = 0; $i < 256; $i++) { 130 $llc = $lcase{$xtab[$i]}; # Unicode lower case 131 if (defined($l = $taby{$llc}) && $uctab[$l] == $i) { 132 # Straight-forward conversion 133 } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) { 134 # Lower case equivalent stripped of accents 135 } else { 136 # No equivalent at all found. Find *anything* that matches the 137 # bijection criterion... 138 for ($l = 0; $l < 256; $l++) { 139 last if ($uctab[$l] == $i); 140 } 141 $l = $i if ($l == 256); # If nothing, we're screwed anyway... 142 } 143 $lctab[$i] = $l; 144 print CPOUT pack("C", $l); 145} 146 147# 148# Unicode (longname) matching table. 149# This only depends on the console codepage. 150# 151$pp0 = ''; $pp1 = ''; 152for ($i = 0; $i < 256; $i++) { 153 if (!defined($ytab[$i])) { 154 $p0 = $p1 = 0xffff; 155 } else { 156 $p0 = $ytab[$i]; 157 if ($ucase{$p0} != $p0) { 158 $p1 = $ucase{$p0}; 159 } elsif ($lcase{$p0} != $p0) { 160 $p1 = $lcase{$p0}; 161 } elsif ($tcase{$p0} != $p0) { 162 $p1 = $tcase{$p0}; 163 } else { 164 $p1 = $p0; 165 } 166 } 167 # Only the BMP is supported... 168 $p0 = 0xffff if ($p0 > 0xffff); 169 $p1 = 0xffff if ($p1 > 0xffff); 170 $pp0 .= pack("v", $p0); 171 $pp1 .= pack("v", $p1); 172} 173print CPOUT $pp0, $pp1; 174close (CPOUT); 175 176 177