1#!/usr/bin/perl 2 3# 4# Copyright (C) 2017 and later: Unicode, Inc. and others. 5# License & terms of use: http://www.unicode.org/copyright.html 6# 7 8use strict; 9use Unicode::UCD 'charinfo'; 10use Unicode::Normalize; 11use utf8; 12use open ':utf8'; 13 14my $printout = 0; 15my $braces = 0; 16my $colls = 0; 17my $aliased = 0; 18my $newName = ""; 19my $filename; 20my $suffix; 21my $locale; 22 23NEW_FILE: 24foreach my $arg (@ARGV) { 25 if($newName =~ /^$/) { 26 $locale = $arg; 27 $locale =~ s#^.*/##g; 28 $locale =~ s/\.txt//; 29 } else { 30 $newName = ""; 31 } 32 my $command = "/home/weiv/build/current/bin/uconv -x hex-any/Java -f utf8 -t utf8 $arg"; 33 print $command."\n"; 34 my @bundle = `$command`; 35 foreach $_ (@bundle) { 36 #while(<>) { 37 #print $ARGV if eof; 38 if(/^\/\//) { 39 next; 40 } 41 if(/collations/) { 42 print "found Collations\n"; 43 $colls = 1; 44 if(/alias/) { 45 print "collations are aliased\n"; 46 $aliased = 1; 47 } 48 } 49 if($aliased) { 50 print "processing aliased data: $_\n"; 51 if(/\{/) { 52 print "Braces opened\n"; 53 $braces = 1; 54 } 55 if($braces && /\"(.*)\"/) { 56 $newName = $1; 57 print "Aliasing to $newName\n"; 58 } 59 if($braces && /\}/) { 60 $braces = 0; 61 print "Braces closed\n"; 62 $aliased = 0; 63 print "Switching from $filename to $newName\n"; 64 $arg =~ s/$locale\.txt$/$newName\.txt/; 65 print "$arg\n"; 66 redo NEW_FILE; 67 } 68 69 } 70 if(/standard|phonebook|traditional|pinyin|stroke|direct/ && $colls) { 71 print "found $& collation\n"; 72 $suffix = "_".uc($&); 73 if(/standard/) { 74 $suffix = ""; 75 } 76 } 77 if(/Sequence/ && $colls) { 78 #binmode ARGV, ":utf8"; 79 $printout = 1; 80 #$filename = $ARGV; 81 $filename = $locale; 82 if($suffix) { 83 $filename .= "_".$suffix; 84 } 85 $filename .= "_collation.html"; 86 print "filename is $filename\n"; 87 #open(OUT, ">:utf8", "$filename"); 88 open(OUT, ">$filename"); 89 printHeading($arg); 90 #next; 91 } 92 my $line = $_; 93 if($line =~ /\{/ && $printout) { 94 $braces++; 95 } 96 if($printout) { 97 print OUT processLine($line); 98 print OUT "\n"; 99 } 100 if( $line =~ /\}/ && $printout) { 101 $braces--; 102 if($braces == 0) { 103 $printout = 0; 104 printFooting(); 105 close(OUT); 106 } 107 } 108 } 109} 110 111sub processLine { 112 my $line = shift; 113 $_ = $line; 114 my $i = 0; 115 my $j = 0; 116 my $result; 117# remove comments 118 s#//.*$##g; 119# remove "Sequence" if present 120 s/Sequence\s*//; 121# remove leading brace if present 122 s/^\s*{//; 123# remove trailing brace if present 124 s/}\s*$//; 125# remove trailing quote 126 s/"\s*$//; 127#remove lead quote 128 s/^\s*"//; 129#separate options 130 s/(\[.*\])/\n\1/g; 131#separate resets 132 s/\s*\&\s*/\n\& /g; 133#separate strengths and insert spaces 134 s/\s*(<{1,4})\s*/\n\1 /g; 135#separate equals and insert spaces 136 s/\s*=\s*/\n= /g; 137 138# break into individual reset/strength/setting lines 139 my @lines = split(/\n/); 140 141 my $line; 142 my $name; 143 my $spanEnd = ""; 144 my $result = ""; 145 my $names = ""; 146 my $codes = ""; 147 my $lrm = ""; 148 149 foreach $line (@lines) { 150 # skip empty lines 151 if($line =~ /^$/) { 152 next; 153 } 154 $spanEnd = ""; 155 $name = ""; 156 $lrm = ""; 157 $line = NFC($line); 158 # for resets and strengths we will get name for elements 159 if($line =~ /<{1,4} |= |& \[.*\]|& /) { 160 $name = "<span title=\""; 161 $names = ""; 162 $codes = ""; 163 my $start = $&; 164 my $rest = $'; 165 for ($j = 0; $j < length($rest); $j++) { 166 my $char = substr($rest, $j, 1); 167 my $charVal = ord($char); 168 # some of elements are part of the syntax, so they are 169 # entered without translation to the name 170 if($charVal == 0x002F || $charVal == 0x007C) { 171 $name .= $codes.$names." $char "; 172 $codes = ""; 173 $names = ""; 174 } elsif($charVal == 0x0027) { #quote requires more processing 175 #$name .= "'"; 176 } else { 177 my $charinfo = charinfo($charVal); 178 $codes .= $charinfo->{'code'}." "; 179 $names .= "{".$charinfo->{'name'}."} "; 180 if($charinfo->{'bidi'} eq "R" || $charinfo->{'bidi'} eq "AL") { 181 $lrm = "‎"; 182 } 183 #$name .= $charinfo->{'code'}." {".$charinfo->{'name'}."} "; 184 } 185 } 186 $name .= $codes.$names."\" >"; 187 $spanEnd = "</span>"; 188 } 189 #print $name."\n"; 190 if($line =~ /^<<<</) { 191 $line = " $line"; 192 } elsif($line =~ /^<<</) { 193 $line = " $line"; 194 } elsif($line =~ /^<</) { 195 $line = " $line"; 196 } elsif($line =~ /^</) { 197 $line = " $line"; 198 } elsif($line =~ /^=/) { 199 $line = " $line"; 200 } 201 # insert spaces around vertical bars (fix prefixes) 202 203 # insert spaces around slashes (fix expansions) 204 $line =~ s#/# / #g; 205 # replace & 206 $line =~ s/\&/&/g; 207 # replace spaces 208 $line =~ s/ / /g; 209 # replace < 210 $line =~ s/</</g; 211 # replace > 212 $line =~ s/>/>/g; 213 214 #$lines[$i] = $name.$lrm.$line."</span><br>"; 215 #$i++; 216 $result .= $name.$lrm.$line.$spanEnd."<br>\n"; 217 } 218 219 #$_ = join("\n", @lines); 220 return $result; 221 222} 223 224sub printHeading { 225my $filename = shift; 226$filename =~ s/\.txt//; 227print OUT <<"EndOfHeading"; 228<html> 229<head> 230<meta http-equiv="content-type" content="text/html; charset=utf-8"> 231</head> 232# Collation data resource bundle generated for locale: $filename<br> 233# For platform icu reference platform UCA<br><br> 234 235 236$filename {<br> 237 CollationElements {<br> 238 Sequence {<br> 239EndOfHeading 240} 241 242sub printFooting { 243print OUT <<"EndOfFooting"; 244 }<br> 245 }<br> 246}<br> 247 248</pre> 249</html> 250EndOfFooting 251} 252