#!/usr/bin/env python

# Copyright JS Foundation and other contributors, http://js.foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import argparse
import bisect
import csv
import itertools
import os
import warnings

from gen_c_source import LICENSE, format_code
from settings import PROJECT_DIR


RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h')


# common code generation


class UniCodeSource(object):
    def __init__(self, filepath):
        self.__filepath = filepath
        self.__header = [LICENSE, ""]
        self.__data = []

    def complete_header(self, completion):
        self.__header.append(completion)
        self.__header.append("")  # for an extra empty line

    def add_table(self, table, table_name, table_type, table_descr):
        self.__data.append(table_descr)
        self.__data.append("static const %s lit_%s[] JERRY_ATTR_CONST_DATA =" % (table_type, table_name))
        self.__data.append("{")
        self.__data.append(format_code(table, 1))
        self.__data.append("};")
        self.__data.append("")  # for an extra empty line

    def generate(self):
        with open(self.__filepath, 'w') as generated_source:
            generated_source.write("\n".join(self.__header))
            generated_source.write("\n".join(self.__data))
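
# Sketch of the output produced by UniCodeSource.add_table() above (values are
# illustrative; the exact number formatting and line wrapping come from
# gen_c_source.format_code):
#
#   /* ...table description... */
#   static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
#   {
#     0x00aa, 0x00b5, ...
#   };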


class UnicodeCategorizer(object):
    def __init__(self):
        # Unicode general categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
        #                             Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
        #
        # letter:                  Lu Ll Lt Lm Lo Nl
        # non-letter-ident-part:
        #   digit:                 Nd
        #   punctuation mark:      Mn Mc
        #   connector punctuation: Pc
        # separators:              Zs
        self._unicode_categories = {
            'letters_category': ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
            'non_letters_category': ["Nd", "Mn", "Mc", "Pc"],
            'separators_category': ["Zs"]
        }

        self._categories = {
            'letters': [],
            'non_letters': [],
            'separators': []
        }

    def _store_by_category(self, unicode_id, category):
        """
        Store the given unicode_id in the category list it belongs to.
        """
        for target_category in self._categories:
            if category in self._unicode_categories[target_category + '_category']:
                self._categories[target_category].append(unicode_id)

    def read_categories(self, unicode_data_file):
        """
        Read the corresponding unicode values and store them in category lists.

        :return: Lists of letters, non-letters and separators.
        """

        range_start_id = 0

        with open(unicode_data_file) as unicode_data:
            for line in csv.reader(unicode_data, delimiter=';'):
                unicode_id = int(line[0], 16)

                # Skip supplementary planes and ASCII chars
                if unicode_id >= 0x10000 or unicode_id < 128:
                    continue

                category = line[2]

                if range_start_id != 0:
                    # A previous "First" entry opened a range and this "Last"
                    # entry closes it: store every code point in between.
                    while range_start_id <= unicode_id:
                        self._store_by_category(range_start_id, category)
                        range_start_id += 1
                    range_start_id = 0
                    continue

                if line[1].startswith('<'):
                    # Save the start position of the range
                    range_start_id = unicode_id

                self._store_by_category(unicode_id, category)

        # This separator char is handled separately
        separators = self._categories['separators']
        non_breaking_space = 0x00A0
        if non_breaking_space in separators:
            separators.remove(non_breaking_space)

        # These separator chars are not in the unicode data file or not in the Zs category
        mongolian_vowel_separator = 0x180E
        medium_mathematical_space = 0x205F
        zero_width_space = 0x200B

        if mongolian_vowel_separator not in separators:
            bisect.insort(separators, mongolian_vowel_separator)
        if medium_mathematical_space not in separators:
            bisect.insort(separators, medium_mathematical_space)
        if zero_width_space not in separators:
            bisect.insort(separators, zero_width_space)

        # https://www.ecma-international.org/ecma-262/5.1/#sec-7.1 format-control characters
        non_letters = self._categories['non_letters']
        zero_width_non_joiner = 0x200C
        zero_width_joiner = 0x200D

        bisect.insort(non_letters, zero_width_non_joiner)
        bisect.insort(non_letters, zero_width_joiner)

        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']


def group_ranges(i):
    """
    Convert an increasing list of integers into a list of ranges.

    :return: Generator of (first, last) range tuples.
    """
    for _, group in itertools.groupby(enumerate(i), lambda q: (q[1] - q[0])):
        group = list(group)
        yield group[0][1], group[-1][1]


def split_list(category_list):
    """
    Split a list of ranges into interval and single char lists.

    :return: List of interval starting points, interval lengths and single chars.
    """

    interval_sps = []
    interval_lengths = []
    chars = []

    for element in category_list:
        interval_length = element[1] - element[0]
        if interval_length == 0:
            chars.append(element[0])
        elif interval_length > 255:
            # Interval lengths are emitted into uint8_t tables, so longer runs
            # are split into chunks of at most 255.
            for i in range(element[0], element[1], 256):
                length = 255 if (element[1] - i > 255) else (element[1] - i)
                interval_sps.append(i)
                interval_lengths.append(length)
        else:
            interval_sps.append(element[0])
            interval_lengths.append(element[1] - element[0])

    return interval_sps, interval_lengths, chars
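
# Rough example of the two helpers above (values are illustrative):
#
#   list(group_ranges([0x41, 0x42, 0x43, 0x61]))  ->  [(0x41, 0x43), (0x61, 0x61)]
#   split_list([(0x41, 0x43), (0x61, 0x61)])      ->  ([0x41], [2], [0x61])
#
# i.e. runs of consecutive code points become (starting point, length) pairs,
# while isolated code points end up in the single-char list.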


def generate_ranges(script_args):
    categorizer = UnicodeCategorizer()
    letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)

    letter_tables = split_list(list(group_ranges(letters)))
    non_letter_tables = split_list(list(group_ranges(non_letters)))
    separator_tables = split_list(list(group_ranges(separators)))

    c_source = UniCodeSource(RANGES_C_SOURCE)

    header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
                         " * from %s. Do not edit! */" % os.path.basename(script_args.unicode_data),
                         ""]

    c_source.complete_header("\n".join(header_completion))

    c_source.add_table(letter_tables[0],
                       "unicode_letter_interval_sps",
                       "uint16_t",
                       ("/**\n"
                        " * Character interval starting points for the unicode letters.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
                        " */"))

    c_source.add_table(letter_tables[1],
                       "unicode_letter_interval_lengths",
                       "uint8_t",
                       ("/**\n"
                        " * Character interval lengths for the unicode letters.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
                        " */"))

    c_source.add_table(letter_tables[2],
                       "unicode_letter_chars",
                       "uint16_t",
                       ("/**\n"
                        " * Those unicode letter characters that are not inside any of\n"
                        " * the intervals specified in the lit_unicode_letter_interval_sps array.\n"
                        " *\n"
                        " * The characters are from the following Unicode categories:\n"
                        " * Lu, Ll, Lt, Lm, Lo, Nl\n"
                        " */"))

    c_source.add_table(non_letter_tables[0],
                       "unicode_non_letter_ident_part_interval_sps",
                       "uint16_t",
                       ("/**\n"
                        " * Character interval starting points for non-letter characters\n"
                        " * that can be used as a non-first character of an identifier.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Nd, Mn, Mc, Pc\n"
                        " */"))

    c_source.add_table(non_letter_tables[1],
                       "unicode_non_letter_ident_part_interval_lengths",
                       "uint8_t",
                       ("/**\n"
                        " * Character interval lengths for non-letter characters\n"
                        " * that can be used as a non-first character of an identifier.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Nd, Mn, Mc, Pc\n"
                        " */"))

    c_source.add_table(non_letter_tables[2],
                       "unicode_non_letter_ident_part_chars",
                       "uint16_t",
                       ("/**\n"
                        " * Those non-letter characters that can be used as a non-first\n"
                        " * character of an identifier and are not included in any of the intervals\n"
                        " * specified in the lit_unicode_non_letter_ident_part_interval_sps array.\n"
                        " *\n"
                        " * The characters are from the following Unicode categories:\n"
                        " * Nd, Mn, Mc, Pc\n"
                        " */"))

    c_source.add_table(separator_tables[0],
                       "unicode_separator_char_interval_sps",
                       "uint16_t",
                       ("/**\n"
                        " * Unicode separator character interval starting points from Unicode category: Zs\n"
                        " */"))

    c_source.add_table(separator_tables[1],
                       "unicode_separator_char_interval_lengths",
                       "uint8_t",
                       ("/**\n"
                        " * Unicode separator character interval lengths from Unicode category: Zs\n"
                        " */"))

    c_source.add_table(separator_tables[2],
                       "unicode_separator_chars",
                       "uint16_t",
                       ("/**\n"
                        " * Unicode separator characters that are not in the\n"
                        " * lit_unicode_separator_char_interval_sps array.\n"
                        " *\n"
                        " * Unicode category: Zs\n"
                        " */"))

    c_source.generate()


# functions for unicode conversions


def parse_unicode_sequence(raw_data):
    """
    Parse a unicode sequence from raw data.

    :param raw_data: Contains the unicode sequence which needs to be parsed.
    :return: The parsed unicode sequence as a string.
    """

    result = ''

    for unicode_char in raw_data.split(' '):
        if unicode_char == '':
            continue

        # Convert the code point (hex value without 0x prefix) to a character
        hex_val = int(unicode_char, 16)
        try:
            result += unichr(hex_val)  # Python 2
        except NameError:
            result += chr(hex_val)  # Python 3

    return result
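
# Illustrative examples of the hex-sequence format used by the Unicode data
# files: parse_unicode_sequence("0053 0053") is expected to return the
# two-character string "SS", and parse_unicode_sequence("00DF") the single
# character U+00DF.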


def read_case_mappings(unicode_data_file, special_casing_file):
    """
    Read the corresponding unicode values of lower and upper case letters and store these in tables.

    :param unicode_data_file: Contains the default case mappings (one-to-one mappings).
    :param special_casing_file: Contains additional informative case mappings that are either not one-to-one
                                or which are context-sensitive.
    :return: Lower and upper case mappings.
    """

    lower_case_mapping = {}
    upper_case_mapping = {}

    # Add one-to-one mappings
    with open(unicode_data_file) as unicode_data:
        unicode_data_reader = csv.reader(unicode_data, delimiter=';')

        for line in unicode_data_reader:
            letter_id = int(line[0], 16)

            # Skip supplementary planes and ASCII chars
            if letter_id >= 0x10000 or letter_id < 128:
                continue

            capital_letter = line[12]
            small_letter = line[13]

            if capital_letter:
                upper_case_mapping[letter_id] = parse_unicode_sequence(capital_letter)

            if small_letter:
                lower_case_mapping[letter_id] = parse_unicode_sequence(small_letter)

    # Update the conversion tables with the special cases
    with open(special_casing_file) as special_casing:
        special_casing_reader = csv.reader(special_casing, delimiter=';')

        for line in special_casing_reader:
            # Skip comment sections and empty lines
            if not line or line[0].startswith('#'):
                continue

            # Replace fields that contain a '#' character (trailing comments) with an empty string
            for idx, i in enumerate(line):
                if i.find('#') >= 0:
                    line[idx] = ''

            letter_id = int(line[0], 16)
            condition_list = line[4]

            # Skip supplementary planes, ASCII chars and conditional mappings
            if letter_id >= 0x10000 or letter_id < 128 or condition_list:
                continue

            small_letter = parse_unicode_sequence(line[1])
            capital_letter = parse_unicode_sequence(line[3])

            lower_case_mapping[letter_id] = small_letter
            upper_case_mapping[letter_id] = capital_letter

    return lower_case_mapping, upper_case_mapping
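
# For orientation, the input lines look roughly like the following (quoted here
# only as an illustration).
# UnicodeData.txt -- fields 12 and 13 hold the uppercase and lowercase mappings:
#   00E9;LATIN SMALL LETTER E WITH ACUTE;Ll;0;L;0065 0301;;;;N;;;00C9;;00C9
# SpecialCasing.txt -- field 1 is the lowercase mapping, field 3 the uppercase
# mapping, field 4 the (possibly empty) condition list:
#   00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S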


def extract_ranges(letter_case, reverse_letter_case=None):
    """
    Extract ranges from the case mapping tables
    (the second param is optional; if it is given, a range will contain bidirectional conversions only).

    :param letter_case: Case mapping dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the opposite direction of the conversion.
    :return: A table with the start points and their mapped values, and another table with the lengths of the ranges.
    """

    in_range = False
    range_position = -1
    ranges = []
    range_lengths = []

    for letter_id in sorted(letter_case.keys()):
        prev_letter_id = letter_id - 1

        # One-way conversions
        if reverse_letter_case is None:
            if len(letter_case[letter_id]) > 1:
                in_range = False
                continue

            if prev_letter_id not in letter_case or len(letter_case[prev_letter_id]) > 1:
                in_range = False
                continue

        # Two-way conversions
        else:
            if not is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
                in_range = False
                continue

            if not is_bidirectional_conversion(prev_letter_id, letter_case, reverse_letter_case):
                in_range = False
                continue

        conv_distance = calculate_conversion_distance(letter_case, letter_id)
        prev_conv_distance = calculate_conversion_distance(letter_case, prev_letter_id)

        if conv_distance != prev_conv_distance:
            in_range = False
            continue

        if in_range:
            range_lengths[range_position] += 1
        else:
            in_range = True
            range_position += 1

            # Add the start point of the range and its mapped value
            ranges.extend([prev_letter_id, ord(letter_case[prev_letter_id])])
            range_lengths.append(2)

    # Remove all ranges from the case mapping table.
    for idx in range(0, len(ranges), 2):
        range_length = range_lengths[idx // 2]

        for incr in range(range_length):
            del letter_case[ranges[idx] + incr]
            if reverse_letter_case is not None:
                del reverse_letter_case[ranges[idx + 1] + incr]

    return ranges, range_lengths


def extract_character_pair_ranges(letter_case, reverse_letter_case):
    """
    Extract ranges of two or more consecutive character pairs from the case mapping tables.

    :param letter_case: Case mapping dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the opposite direction of the conversion.
    :return: A table with the start points, and another table with the lengths of the ranges.
    """

    start_points = []
    lengths = []
    in_range = False
    element_counter = -1

    for letter_id in sorted(letter_case.keys()):
        # Only extract character pairs
        if not is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
            in_range = False
            continue

        if ord(letter_case[letter_id]) == letter_id + 1:
            prev_letter_id = letter_id - 2

            if not is_bidirectional_conversion(prev_letter_id, letter_case, reverse_letter_case):
                in_range = False

            if in_range:
                lengths[element_counter] += 2
            else:
                element_counter += 1
                start_points.append(letter_id)
                lengths.append(2)
                in_range = True
        else:
            in_range = False

    # Remove all found case mappings from the conversion tables after the scan
    for idx, letter_id in enumerate(start_points):
        conv_length = lengths[idx]

        for incr in range(0, conv_length, 2):
            del letter_case[letter_id + incr]
            del reverse_letter_case[letter_id + 1 + incr]

    return start_points, lengths
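
# Illustrative: the Latin Extended-A block mostly alternates upper and lower
# case code points (U+0100 <-> U+0101, U+0102 <-> U+0103, ...), which is exactly
# the pattern extract_character_pair_ranges() compresses into a single start
# point plus a length.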


def extract_character_pairs(letter_case, reverse_letter_case):
    """
    Extract character pairs. Check that two unicode values are also mapping values of each other.

    :param letter_case: Case mapping dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the opposite direction of the conversion.
    :return: A table with the character pairs.
    """

    character_pairs = []

    for letter_id in sorted(letter_case.keys()):
        if is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
            mapped_value = letter_case[letter_id]
            character_pairs.extend([letter_id, ord(mapped_value)])

            # Remove the character pair from the case mapping tables
            del letter_case[letter_id]
            del reverse_letter_case[ord(mapped_value)]

    return character_pairs


def extract_special_ranges(letter_case):
    """
    Extract special ranges. These contain start points of one-to-two letter case ranges
    where the second character is always the same.

    :param letter_case: Case mapping dictionary which contains the conversions.
    :return: A table with the start points and their mapped values, and a table with the lengths of the ranges.
    """

    special_ranges = []
    special_range_lengths = []

    in_range = False
    range_position = -1

    for letter_id in sorted(letter_case.keys()):
        mapped_value = letter_case[letter_id]

        if len(mapped_value) != 2:
            continue

        prev_letter_id = letter_id - 1

        if prev_letter_id not in letter_case:
            in_range = False
            continue

        prev_mapped_value = letter_case[prev_letter_id]

        if len(prev_mapped_value) != 2:
            continue

        if prev_mapped_value[1] != mapped_value[1]:
            continue

        if (ord(prev_mapped_value[0]) - prev_letter_id) != (ord(mapped_value[0]) - letter_id):
            in_range = False
            continue

        if in_range:
            special_range_lengths[range_position] += 1
        else:
            range_position += 1
            in_range = True

            special_ranges.extend([prev_letter_id, ord(prev_mapped_value[0]), ord(prev_mapped_value[1])])
            special_range_lengths.append(1)

    # Remove the special ranges from the conversion table
    for idx in range(0, len(special_ranges), 3):
        range_length = special_range_lengths[idx // 3]

        for incr in range(range_length):
            del letter_case[special_ranges[idx] + incr]

    return special_ranges, special_range_lengths
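
# Illustrative: according to SpecialCasing.txt the uppercase forms of
# U+1F80..U+1F87 are expected to be the two-character sequences U+1F08..U+1F0F
# followed by U+0399, i.e. the second character is constant and the first keeps
# a fixed distance -- the pattern extract_special_ranges() is designed to compress.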


def extract_conversions(letter_case):
    """
    Extract the remaining conversions. This provides the full (or remaining) case mappings from the table.
    The counter table contains how many one-to-one, one-to-two and one-to-three mappings
    exist, in that order, in the conversion table.

    :param letter_case: Case mapping dictionary which contains the conversions.
    :return: A table with the conversions, and a table with the counters.
    """

    unicodes = [[], [], []]

    # one-to-one mappings
    for letter_id in sorted(letter_case.keys()):
        mapped_value = letter_case[letter_id]

        if len(mapped_value) != 1:
            continue

        unicodes[0].extend([letter_id, ord(mapped_value)])
        del letter_case[letter_id]

    # one-to-two mappings
    for letter_id in sorted(letter_case.keys()):
        mapped_value = letter_case[letter_id]

        if len(mapped_value) != 2:
            continue

        unicodes[1].extend([letter_id, ord(mapped_value[0]), ord(mapped_value[1])])
        del letter_case[letter_id]

    # one-to-three mappings
    for letter_id in sorted(letter_case.keys()):
        mapped_value = letter_case[letter_id]

        if len(mapped_value) != 3:
            continue

        unicodes[2].extend([letter_id, ord(mapped_value[0]), ord(mapped_value[1]), ord(mapped_value[2])])
        del letter_case[letter_id]

    unicode_lengths = [len(unicodes[0]) // 2, len(unicodes[1]) // 3, len(unicodes[2]) // 4]

    return list(itertools.chain.from_iterable(unicodes)), unicode_lengths


def is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
    """
    Check that two unicode values are mapping values of each other.

    :param letter_id: An integer, representing the unicode code point of the character.
    :param letter_case: Case mapping dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which possibly contains
                                the opposite direction of the conversion.
    :return: True, if it is a reversible conversion, False otherwise.
    """

    if letter_id not in letter_case:
        return False

    # Check one-to-one mapping
    mapped_value = letter_case[letter_id]
    if len(mapped_value) > 1:
        return False

    # Check two-way conversion
    mapped_value_id = ord(mapped_value)

    if mapped_value_id not in reverse_letter_case or len(reverse_letter_case[mapped_value_id]) > 1:
        return False

    if ord(reverse_letter_case[mapped_value_id]) != letter_id:
        return False

    return True


def calculate_conversion_distance(letter_case, letter_id):
    """
    Calculate the distance between the unicode character and its mapped value
    (only needed for and works with one-to-one mappings).

    :param letter_case: Case mapping dictionary which contains the conversions.
    :param letter_id: An integer, representing the unicode code point of the character.
    :return: The conversion distance, or None if there is no one-to-one mapping.
    """

    if letter_id not in letter_case or len(letter_case[letter_id]) > 1:
        return None

    return ord(letter_case[letter_id]) - letter_id
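
# Illustrative: the Cyrillic letters U+0410..U+042F map to U+0430..U+044F, a
# constant conversion distance of 0x20 for the whole run, which is what
# calculate_conversion_distance() reports and what extract_ranges() relies on
# when merging consecutive letters into a single range.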


def generate_conversions(script_args):
    # Read the corresponding unicode values of lower and upper case letters and store these in tables
    case_mappings = read_case_mappings(script_args.unicode_data, script_args.special_casing)
    lower_case = case_mappings[0]
    upper_case = case_mappings[1]

    # Each extraction step below removes the entries it manages to encode from
    # the mapping tables, so the later, more generic tables only contain what
    # the earlier, more compact ones could not cover.
    character_case_ranges = extract_ranges(lower_case, upper_case)
    character_pair_ranges = extract_character_pair_ranges(lower_case, upper_case)
    character_pairs = extract_character_pairs(lower_case, upper_case)
    upper_case_special_ranges = extract_special_ranges(upper_case)
    lower_case_ranges = extract_ranges(lower_case)
    lower_case_conversions = extract_conversions(lower_case)
    upper_case_conversions = extract_conversions(upper_case)

    if lower_case:
        warnings.warn('Not all elements were extracted from the lowercase table!')
    if upper_case:
        warnings.warn('Not all elements were extracted from the uppercase table!')

    # Generate conversions output
    c_source = UniCodeSource(CONVERSIONS_C_SOURCE)

    unicode_file = os.path.basename(script_args.unicode_data)
    spec_casing_file = os.path.basename(script_args.special_casing)

    header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
                         " * from the %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
                         ""]

    c_source.complete_header("\n".join(header_completion))

    c_source.add_table(character_case_ranges[0],
                       "character_case_ranges",
                       "uint16_t",
                       ("/* Contains start points of character case ranges "
                        "(these are bidirectional conversions). */"))

    c_source.add_table(character_case_ranges[1],
                       "character_case_range_lengths",
                       "uint8_t",
                       "/* Interval lengths of start points in `character_case_ranges` table. */")

    c_source.add_table(character_pair_ranges[0],
                       "character_pair_ranges",
                       "uint16_t",
                       "/* Contains the start points of bidirectional conversion ranges. */")

    c_source.add_table(character_pair_ranges[1],
                       "character_pair_range_lengths",
                       "uint8_t",
                       "/* Interval lengths of start points in `character_pair_ranges` table. */")

    c_source.add_table(character_pairs,
                       "character_pairs",
                       "uint16_t",
                       "/* Contains lower/upper case bidirectional conversion pairs. */")

    c_source.add_table(upper_case_special_ranges[0],
                       "upper_case_special_ranges",
                       "uint16_t",
                       ("/* Contains start points of one-to-two uppercase ranges where the second character\n"
                        " * is always the same.\n"
                        " */"))

    c_source.add_table(upper_case_special_ranges[1],
                       "upper_case_special_range_lengths",
                       "uint8_t",
                       "/* Interval lengths for start points in `upper_case_special_ranges` table. */")

    c_source.add_table(lower_case_ranges[0],
                       "lower_case_ranges",
                       "uint16_t",
                       "/* Contains start points of lowercase ranges. */")

    c_source.add_table(lower_case_ranges[1],
                       "lower_case_range_lengths",
                       "uint8_t",
                       "/* Interval lengths for start points in `lower_case_ranges` table. */")

    c_source.add_table(lower_case_conversions[0],
                       "lower_case_conversions",
                       "uint16_t",
                       ("/* The remaining lowercase conversions. The lowercase variant can "
                        "be one to three characters long. */"))

    c_source.add_table(lower_case_conversions[1],
                       "lower_case_conversion_counters",
                       "uint8_t",
                       "/* Number of one-to-one, one-to-two and one-to-three lowercase conversions. */")

    c_source.add_table(upper_case_conversions[0],
                       "upper_case_conversions",
                       "uint16_t",
                       ("/* The remaining uppercase conversions. The uppercase variant can "
                        "be one to three characters long. */"))

    c_source.add_table(upper_case_conversions[1],
                       "upper_case_conversion_counters",
                       "uint8_t",
                       "/* Number of one-to-one, one-to-two and one-to-three uppercase conversions. */")

    c_source.generate()
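
# Layout of the conversion tables emitted above (derived from
# extract_conversions()): the `*_conversions` arrays store the records back to back,
#   one-to-one:   code point, mapped code point
#   one-to-two:   code point, mapped1, mapped2
#   one-to-three: code point, mapped1, mapped2, mapped3
# and the matching `*_conversion_counters` arrays store how many records of
# each kind there are, in that order.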


# entry point


def main():
    parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}.inc.h generator',
                                     epilog='''
                                        The input files (UnicodeData.txt, SpecialCasing.txt)
                                        must be retrieved from
                                        http://www.unicode.org/Public/<VERSION>/ucd/.
                                        The last known good version is 13.0.0.
                                        ''')

    parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True,
                        help='specify the unicode data file')
    parser.add_argument('--special-casing', metavar='FILE', action='store', required=True,
                        help='specify the special casing file')

    script_args = parser.parse_args()

    if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK):
        parser.error('The %s file is missing or not readable!' % script_args.unicode_data)

    if not os.path.isfile(script_args.special_casing) or not os.access(script_args.special_casing, os.R_OK):
        parser.error('The %s file is missing or not readable!' % script_args.special_casing)

    generate_ranges(script_args)
    generate_conversions(script_args)


if __name__ == "__main__":
    main()