1#! /usr/bin/perl -w 2# -*- Perl -*- 3# 4# afblue.pl 5# 6# Process a blue zone character data file. 7# 8# Copyright (C) 2013-2020 by 9# David Turner, Robert Wilhelm, and Werner Lemberg. 10# 11# This file is part of the FreeType project, and may only be used, 12# modified, and distributed under the terms of the FreeType project 13# license, LICENSE.TXT. By continuing to use, modify, or distribute 14# this file you indicate that you have read the license and 15# understand and accept it fully. 16 17use strict; 18use warnings; 19use English '-no_match_vars'; 20use open ':std', ':encoding(UTF-8)'; 21 22 23my $prog = $PROGRAM_NAME; 24$prog =~ s| .* / ||x; # Remove path. 25 26die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0; 27 28 29my $datafile = $ARGV[0]; 30 31my %diversions; # The extracted and massaged data from `datafile'. 32my @else_stack; # Booleans to track else-clauses. 33my @name_stack; # Stack of integers used for names of aux. variables. 34 35my $curr_enum; # Name of the current enumeration. 36my $curr_array; # Name of the current array. 37my $curr_max; # Name of the current maximum value. 38 39my $curr_enum_element; # Name of the current enumeration element. 40my $curr_offset; # The offset relative to current aux. variable. 41my $curr_elem_size; # The number of non-space characters in the current string or 42 # the number of elements in the current block. 43 44my $have_sections = 0; # Boolean; set if start of a section has been seen. 45my $have_strings; # Boolean; set if current section contains strings. 46my $have_blocks; # Boolean; set if current section contains blocks. 47 48my $have_enum_element; # Boolean; set if we have an enumeration element. 49my $in_string; # Boolean; set if a string has been parsed. 50 51my $num_sections = 0; # Number of sections seen so far. 52 53my $last_aux; # Name of last auxiliary variable. 54 55 56# Regular expressions. 57 58# [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n' 59my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x; 60 61# [<ws>] <enum_element_name> [<ws>] '\n' 62my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x; 63 64# '#' <preprocessor directive> '\n' 65my $preprocessor_re = qr/ ^ \# /x; 66 67# [<ws>] '/' '/' <comment> '\n' 68my $comment_re = qr| ^ \s* // |x; 69 70# empty line 71my $whitespace_only_re = qr/ ^ \s* $ /x; 72 73# [<ws>] '"' <string> '"' [<ws>] '\n' (<string> doesn't contain newlines) 74my $string_re = qr/ ^ \s* 75 " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) " 76 \s* $ /x; 77 78# [<ws>] '{' <block> '}' [<ws>] '\n' (<block> can contain newlines) 79my $block_start_re = qr/ ^ \s* \{ /x; 80 81# We need the capturing group for `split' to make it return the separator 82# tokens (i.e., the opening and closing brace) also. 83my $brace_re = qr/ ( [{}] ) /x; 84 85 86sub Warn 87{ 88 my $message = shift; 89 warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n"; 90} 91 92 93sub Die 94{ 95 my $message = shift; 96 die "$datafile:$INPUT_LINE_NUMBER: error: $message\n"; 97} 98 99 100my $warned_before = 0; 101 102sub warn_before 103{ 104 Warn("data before first section gets ignored") unless $warned_before; 105 $warned_before = 1; 106} 107 108 109sub strip_newline 110{ 111 chomp; 112 s/ \x0D $ //x; 113} 114 115 116sub end_curr_string 117{ 118 # Append final null byte to string. 119 if ($have_strings) 120 { 121 push @{$diversions{$curr_array}}, " '\\0',\n" if $in_string; 122 123 $curr_offset++; 124 $in_string = 0; 125 } 126} 127 128 129sub update_max_elem_size 130{ 131 if ($curr_elem_size) 132 { 133 my $max = pop @{$diversions{$curr_max}}; 134 $max = $curr_elem_size if $curr_elem_size > $max; 135 push @{$diversions{$curr_max}}, $max; 136 } 137} 138 139 140sub convert_non_ascii_char 141{ 142 # A UTF-8 character outside of the printable ASCII range, with possibly a 143 # leading backslash character. 144 my $s = shift; 145 146 # Here we count characters, not bytes. 147 $curr_elem_size += length $s; 148 149 utf8::encode($s); 150 $s = uc unpack 'H*', $s; 151 152 $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg; 153 154 return $s; 155} 156 157 158sub convert_ascii_chars 159{ 160 # A series of ASCII characters in the printable range. 161 my $s = shift; 162 163 # We reduce multiple space characters to a single one. 164 $s =~ s/ +/ /g; 165 166 # Count all non-space characters. Note that `()' applies a list context 167 # to the capture that is used to count the elements. 168 $curr_elem_size += () = $s =~ /[^ ]/g; 169 170 $curr_offset += $s =~ s/\G(.)/'$1', /g; 171 172 return $s; 173} 174 175 176sub convert_literal 177{ 178 my $s = shift; 179 my $orig = $s; 180 181 # ASCII printables and space 182 my $safe_re = '\x20-\x7E'; 183 # ASCII printables and space, no backslash 184 my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E'; 185 186 $s =~ s{ 187 (?: \\? ( [^$safe_re] ) 188 | ( (?: [$safe_no_backslash_re] 189 | \\ [$safe_re] )+ ) ) 190 } 191 { 192 defined($1) ? convert_non_ascii_char($1) 193 : convert_ascii_chars($2) 194 }egx; 195 196 # We assume that `$orig' doesn't contain `*/' 197 return $s . " /* $orig */"; 198} 199 200 201sub aux_name 202{ 203 return "af_blue_" . $num_sections. "_" . join('_', @name_stack); 204} 205 206 207sub aux_name_next 208{ 209 $name_stack[$#name_stack]++; 210 my $name = aux_name(); 211 $name_stack[$#name_stack]--; 212 213 return $name; 214} 215 216 217sub enum_val_string 218{ 219 # Build string that holds code to save the current offset in an 220 # enumeration element. 221 my $aux = shift; 222 223 my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" ) 224 ? "" 225 : "$last_aux + "; 226 227 return " $aux = $add$curr_offset,\n"; 228} 229 230 231 232# Process data file. 233 234open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n"; 235 236while (<DATA>) 237{ 238 strip_newline(); 239 240 next if /$comment_re/; 241 next if /$whitespace_only_re/; 242 243 if (/$section_re/) 244 { 245 Warn("previous section is empty") if ($have_sections 246 && !$have_strings 247 && !$have_blocks); 248 249 end_curr_string(); 250 update_max_elem_size(); 251 252 # Save captured groups from `section_re'. 253 $curr_enum = $1; 254 $curr_array = $2; 255 $curr_max = $3; 256 257 $curr_enum_element = ""; 258 $curr_offset = 0; 259 260 Warn("overwriting already defined enumeration \`$curr_enum'") 261 if exists($diversions{$curr_enum}); 262 Warn("overwriting already defined array \`$curr_array'") 263 if exists($diversions{$curr_array}); 264 Warn("overwriting already defined maximum value \`$curr_max'") 265 if exists($diversions{$curr_max}); 266 267 $diversions{$curr_enum} = []; 268 $diversions{$curr_array} = []; 269 $diversions{$curr_max} = []; 270 271 push @{$diversions{$curr_max}}, 0; 272 273 @name_stack = (); 274 push @name_stack, 0; 275 276 $have_sections = 1; 277 $have_strings = 0; 278 $have_blocks = 0; 279 280 $have_enum_element = 0; 281 $in_string = 0; 282 283 $num_sections++; 284 $curr_elem_size = 0; 285 286 $last_aux = aux_name(); 287 288 next; 289 } 290 291 if (/$preprocessor_re/) 292 { 293 if ($have_sections) 294 { 295 # Having preprocessor conditionals complicates the computation of 296 # correct offset values. We have to introduce auxiliary enumeration 297 # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store 298 # offsets to be used in conditional clauses. `<s>' is the number of 299 # sections seen so far, `<n1>' is the number of `#if' and `#endif' 300 # conditionals seen so far in the topmost level, `<n2>' the number of 301 # `#if' and `#endif' conditionals seen so far one level deeper, etc. 302 # As a consequence, uneven values are used within a clause, and even 303 # values after a clause, since the C standard doesn't allow the 304 # redefinition of an enumeration value. For example, the name 305 # `af_blue_5_1_6' is used to construct enumeration values in the fifth 306 # section after the third (second-level) if-clause within the first 307 # (top-level) if-clause. After the first top-level clause has 308 # finished, `af_blue_5_2' is used. The current offset is then 309 # relative to the value stored in the current auxiliary element. 310 311 if (/ ^ \# \s* if /x) 312 { 313 push @else_stack, 0; 314 315 $name_stack[$#name_stack]++; 316 317 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 318 $last_aux = aux_name(); 319 320 push @name_stack, 0; 321 322 $curr_offset = 0; 323 } 324 elsif (/ ^ \# \s* elif /x) 325 { 326 Die("unbalanced #elif") unless @else_stack; 327 328 pop @name_stack; 329 330 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 331 $last_aux = aux_name(); 332 333 push @name_stack, 0; 334 335 $curr_offset = 0; 336 } 337 elsif (/ ^ \# \s* else /x) 338 { 339 my $prev_else = pop @else_stack; 340 Die("unbalanced #else") unless defined($prev_else); 341 Die("#else already seen") if $prev_else; 342 push @else_stack, 1; 343 344 pop @name_stack; 345 346 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()); 347 $last_aux = aux_name(); 348 349 push @name_stack, 0; 350 351 $curr_offset = 0; 352 } 353 elsif (/ ^ (\# \s*) endif /x) 354 { 355 my $prev_else = pop @else_stack; 356 Die("unbalanced #endif") unless defined($prev_else); 357 358 pop @name_stack; 359 360 # If there is no else-clause for an if-clause, we add one. This is 361 # necessary to have correct offsets. 362 if (!$prev_else) 363 { 364 # Use amount of whitespace from `endif'. 365 push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next()) 366 . $1 . "else\n"; 367 $last_aux = aux_name(); 368 369 $curr_offset = 0; 370 } 371 372 $name_stack[$#name_stack]++; 373 374 push @{$diversions{$curr_enum}}, enum_val_string(aux_name()); 375 $last_aux = aux_name(); 376 377 $curr_offset = 0; 378 } 379 380 # Handle (probably continued) preprocessor lines. 381 CONTINUED_LOOP: 382 { 383 do 384 { 385 strip_newline(); 386 387 push @{$diversions{$curr_enum}}, $ARG . "\n"; 388 push @{$diversions{$curr_array}}, $ARG . "\n"; 389 390 last CONTINUED_LOOP unless / \\ $ /x; 391 392 } while (<DATA>); 393 } 394 } 395 else 396 { 397 warn_before(); 398 } 399 400 next; 401 } 402 403 if (/$enum_element_re/) 404 { 405 end_curr_string(); 406 update_max_elem_size(); 407 408 $curr_enum_element = $1; 409 $have_enum_element = 1; 410 $curr_elem_size = 0; 411 412 next; 413 } 414 415 if (/$string_re/) 416 { 417 if ($have_sections) 418 { 419 Die("strings and blocks can't be mixed in a section") if $have_blocks; 420 421 # Save captured group from `string_re'. 422 my $string = $1; 423 424 if ($have_enum_element) 425 { 426 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 427 $have_enum_element = 0; 428 } 429 430 $string = convert_literal($string); 431 432 push @{$diversions{$curr_array}}, " $string\n"; 433 434 $have_strings = 1; 435 $in_string = 1; 436 } 437 else 438 { 439 warn_before(); 440 } 441 442 next; 443 } 444 445 if (/$block_start_re/) 446 { 447 if ($have_sections) 448 { 449 Die("strings and blocks can't be mixed in a section") if $have_strings; 450 451 my $depth = 0; 452 my $block = ""; 453 my $block_end = 0; 454 455 # Count braces while getting the block. 456 BRACE_LOOP: 457 { 458 do 459 { 460 strip_newline(); 461 462 foreach my $substring (split(/$brace_re/)) 463 { 464 if ($block_end) 465 { 466 Die("invalid data after last matching closing brace") 467 if $substring !~ /$whitespace_only_re/; 468 } 469 470 $block .= $substring; 471 472 if ($substring eq '{') 473 { 474 $depth++; 475 } 476 elsif ($substring eq '}') 477 { 478 $depth--; 479 480 $block_end = 1 if $depth == 0; 481 } 482 } 483 484 # If we are here, we have run out of substrings, so get next line 485 # or exit. 486 last BRACE_LOOP if $block_end; 487 488 $block .= "\n"; 489 490 } while (<DATA>); 491 } 492 493 if ($have_enum_element) 494 { 495 push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element); 496 $have_enum_element = 0; 497 } 498 499 push @{$diversions{$curr_array}}, $block . ",\n"; 500 501 $curr_offset++; 502 $curr_elem_size++; 503 504 $have_blocks = 1; 505 } 506 else 507 { 508 warn_before(); 509 } 510 511 next; 512 } 513 514 # Garbage. We weren't able to parse the data. 515 Die("syntax error"); 516} 517 518# Finalize data. 519end_curr_string(); 520update_max_elem_size(); 521 522 523# Filter stdin to stdout, replacing `@...@' templates. 524 525sub emit_diversion 526{ 527 my $diversion_name = shift; 528 return (exists($diversions{$1})) ? "@{$diversions{$1}}" 529 : "@" . $diversion_name . "@"; 530} 531 532 533$LIST_SEPARATOR = ''; 534 535my $s1 = "This file has been generated by the Perl script \`$prog',"; 536my $s1len = length $s1; 537my $s2 = "using data from file \`$datafile'."; 538my $s2len = length $s2; 539my $slen = ($s1len > $s2len) ? $s1len : $s2len; 540 541print "/* " . $s1 . " " x ($slen - $s1len) . " */\n" 542 . "/* " . $s2 . " " x ($slen - $s2len) . " */\n" 543 . "\n"; 544 545while (<STDIN>) 546{ 547 s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx; 548 print; 549} 550 551# EOF 552