• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# Each subset file is scanned for lines which, once a trailing "#" comment
# and trailing blanks are removed, consist of exactly two
# whitespace-separated fields.  The second field is read as a hexadecimal
# code point that must be present in the output.
#

use strict;
use warnings;

# Run the script body only when executed directly, so that the helper
# subroutines below can also be loaded and exercised on their own.
main() unless caller;

# Entry point: collect the needed code points from the files named on the
# command line, extend the set with case variants found in the UnicodeData
# records on STDIN, and print the matching records to STDOUT.
sub main {
    my %need_these;

    # Mark as needed all the characters mentioned in the relevant files
    foreach my $file (@ARGV) {
        mark_from_subset_file(\%need_these, $file);
    }

    # The data is walked twice (once to find case variants, once to write
    # the subset).  Read it into memory a single time rather than rewinding
    # STDIN: seek() fails on a pipe, which would silently yield no output.
    my @ucd_lines = <STDIN>;

    mark_case_variants(\%need_these, \@ucd_lines);

    # Finally, write out the subset
    my @subset = subset_lines(\%need_these, \@ucd_lines);
    print @subset;

    return;
}

# mark_from_subset_file(\%need, $file)
#
# Read $file and increment $need->{code point} for every line that has
# exactly two whitespace-separated fields after comment removal.  The code
# point is the leading run of hex digits (optionally prefixed by "0x") of
# the second field; fields that do not start with a hex digit are skipped
# instead of being counted as U+0000.
# Dies with the file name and system error if the file cannot be opened.
sub mark_from_subset_file {
    my ($need, $file) = @_;

    open(my $fh, '<', $file)
        or die "$0: cannot open '$file': $!\n";

    while (defined(my $line = <$fh>)) {
        $line =~ s/\s*(?:\#.*)?$//;    # Remove comments and final blanks
        my @f = split(/\s+/, $line);
        next if (scalar @f != 2);
        next unless ($f[1] =~ /^(?:0x)?([0-9a-f]+)/i);
        $need->{hex $1}++;
    }

    close($fh);
    return;
}

# mark_case_variants(\%need, \@lines)
#
# For every UnicodeData record in @lines whose code point is already marked
# in %need, also mark its simple uppercase, lowercase and titlecase
# mappings (fields 12, 13 and 14).
# (Note: this doesn't necessarily provide the full transitive closure,
# but we shouldn't need it.)
sub mark_case_variants {
    my ($need, $lines) = @_;

    foreach my $line (@$lines) {
        # Strip the line terminator first: otherwise an empty final field
        # is seen as "\n", which is not the empty string and whose hex
        # value is 0, so U+0000 would be marked by mistake.
        (my $record = $line) =~ s/[\r\n]+$//;

        # LIMIT -1 keeps trailing empty fields, so the field count does not
        # depend on which mappings happen to be present.
        my @f = split(/;/, $record, -1);
        next unless (defined $f[0] && $f[0] =~ /^[0-9a-f]+$/i);
        next unless ($need->{hex $f[0]});

        foreach my $idx (12, 13, 14) {
            my $variant = $f[$idx];
            next unless (defined $variant && $variant =~ /^[0-9a-f]+$/i);
            $need->{hex $variant}++;
        }
    }

    return;
}

# subset_lines(\%need, \@lines)
#
# Return the list of output lines: one "XXXX;rest-of-record" line for each
# code point marked in %need that is covered by a record in @lines.  Lines
# that do not start with a hex code point (or a "first-last" range) followed
# by ";" are ignored.
sub subset_lines {
    my ($need, $lines) = @_;
    my @out;

    foreach my $line (@$lines) {
        my ($v, $rest) = split(/;/, $line, 2);

        # A line without any ";" is not a record; emitting it would
        # produce an unterminated output line.
        next unless (defined $v && defined $rest);

        my ($r1, $r2);
        if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
            # This isn't actually the format... fix that if it ever matters
            $r1 = hex $1;
            $r2 = hex $2;
        } elsif ($v =~ /^([0-9a-f]+)$/i) {
            $r1 = $r2 = hex $1;
        } else {
            next;
        }

        for (my $r = $r1; $r <= $r2; $r++) {
            push @out, sprintf("%04X;%s", $r, $rest) if ($need->{$r});
        }
    }

    return @out;
}