#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# Each subset file is scanned line by line.  After a trailing "#" comment
# and trailing blanks are stripped, a line is used only if it consists of
# exactly two whitespace-separated fields; the second field is read as a
# hexadecimal code point.  All other lines are ignored.
#
# UnicodeData.txt is read once from standard input and held in memory, so
# the input may be a pipe as well as a regular file.
#

use strict;
use warnings;

my %need_these = ();

# Mark as needed all the characters mentioned in the relevant files
foreach my $file (@ARGV) {
    open(my $fh, '<', $file) or die "$0: cannot open '$file': $!\n";
    while (defined(my $line = <$fh>)) {
        $line =~ s/\s*(\#.*|)$//;    # Remove comments and final blanks
        my @f = split(/\s+/, $line);
        next if (scalar @f != 2);
        # hex() is deliberately lenient here (it accepts an optional "0x"
        # prefix); under "use warnings" a malformed value is reported on
        # STDERR instead of being swallowed silently.
        $need_these{hex $f[1]}++;
    }
    close($fh);
}

# Read the whole database once.  It is scanned twice below, and rewinding
# STDIN with seek() does not work when the input is a pipe.
my @ucd_lines = <STDIN>;

# Also mark as needed any case variants of those
# (Note: this doesn't necessarily provide the full transitive closure,
# but we shouldn't need it.)
foreach my $line (@ucd_lines) {
    # Strip the line terminator first: otherwise the last field (the
    # titlecase mapping) is "\n" when empty, and hex("\n") == 0 would
    # wrongly mark U+0000 as needed.
    (my $record = $line) =~ s/\r?\n\z//;
    my @f = split(/;/, $record);
    next unless (defined $f[0] && $f[0] =~ /^([0-9a-f]+)$/i);
    next unless ($need_these{hex $f[0]});

    # Fields 12, 13 and 14 are the simple uppercase, lowercase and
    # titlecase mappings; each is either empty or a single code point.
    foreach my $mapping (@f[12 .. 14]) {
        $need_these{hex $mapping}++
            if (defined $mapping && $mapping =~ /\A[0-9a-f]+\z/i);
    }
}

# Finally, write out the subset
foreach my $line (@ucd_lines) {
    my ($v, $rest) = split(/;/, $line, 2);
    next unless (defined $v);
    $rest = '' unless (defined $rest);

    my ($first, $last);
    if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
        # This isn't actually the format... fix that if it ever matters
        $first = hex $1;
        $last  = hex $2;
    } elsif ($v =~ /^([0-9a-f]+)$/i) {
        $first = $last = hex $1;
    } else {
        next;
    }

    # $rest still carries the original line terminator, so none is added.
    foreach my $r ($first .. $last) {
        printf "%04X;%s", $r, $rest if ($need_these{$r});
    }
}

# Buffered write errors (e.g. disk full) only surface when the handle is
# closed, so check it explicitly.
close(STDOUT) or die "$0: error writing output: $!\n";