• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2
3# Copyright JS Foundation and other contributors, http://js.foundation
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#     http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17from __future__ import print_function
18
19import argparse
20import bisect
21import csv
22import itertools
23import os
24import warnings
25
26from gen_c_source import LICENSE, format_code
27from settings import PROJECT_DIR
28
29
# File that receives the generated character range tables.
RANGES_C_SOURCE = os.path.join(
    PROJECT_DIR,
    'jerry-core/lit/lit-unicode-ranges.inc.h'
)
# File that receives the generated case conversion tables.
CONVERSIONS_C_SOURCE = os.path.join(
    PROJECT_DIR,
    'jerry-core/lit/lit-unicode-conversions.inc.h'
)
32
33
34# common code generation
35
36
class UniCodeSource(object):
    """
    Collect the parts of a generated C source file and write them out.

    The output is a header section (LICENSE plus any completions) followed
    by a data section holding one static array definition per added table.
    """

    def __init__(self, filepath):
        """
        :param filepath: Path of the file written by generate().
        """
        self.__filepath = filepath
        self.__header = [LICENSE, ""]
        self.__data = []

    def complete_header(self, completion):
        """
        Append a text block to the header section.

        :param completion: Text to place after the current header content.
        """
        # The trailing empty string becomes an extra empty line when joined.
        self.__header.extend([completion, ""])

    def add_table(self, table, table_name, table_type, table_descr):
        """
        Queue a `static const` array definition for the data section.

        :param table: Values of the array, rendered through format_code().
        :param table_name: Name of the array without the `lit_` prefix.
        :param table_type: C element type of the array.
        :param table_descr: Comment text emitted right before the definition.
        """
        declaration = "static const %s lit_%s[] JERRY_ATTR_CONST_DATA =" % (table_type, table_name)
        self.__data.extend([
            table_descr,
            declaration,
            "{",
            format_code(table, 1),
            "};",
            "",  # for an extra empty line
        ])

    def generate(self):
        """
        Write the header section followed by the data section to the target file.
        """
        with open(self.__filepath, 'w') as generated_source:
            for section in (self.__header, self.__data):
                generated_source.write("\n".join(section))
59
class UnicodeCategorizer(object):
    """
    Sort the code points listed in a UnicodeData.txt formatted file into
    letter, non-letter identifier part and separator lists.
    """

    def __init__(self):
        # unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
        #                          Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
        # letter:                  Lu Ll Lt Lm Lo Nl
        # non-letter-ident-part:
        #   digit:                 Nd
        #   combining mark:        Mn Mc
        #   connector punctuation: Pc
        # separators:              Zs
        self._unicode_categories = {
            'letters_category': ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
            'non_letters_category': ["Nd", "Mn", "Mc", "Pc"],
            'separators_category': ["Zs"]
        }

        # Collected code points, keyed by the target list name.
        self._categories = {
            'letters': [],
            'non_letters': [],
            'separators': []
        }

    def _store_by_category(self, unicode_id, category):
        """
        Store the given unicode_id in every list whose category set contains `category`.

        :param unicode_id: Code point to store.
        :param category: Two-letter general category of the code point.
        """
        for target_category in self._categories:
            if category in self._unicode_categories[target_category + '_category']:
                self._categories[target_category].append(unicode_id)

    def read_categories(self, unicode_data_file):
        """
        Read the corresponding unicode values and store them in category lists.

        Only code points in the 0x80..0xFFFF range are considered. Entries named
        `<..., First>` / `<..., Last>` describe a whole range; every code point of
        such a range is stored exactly once.

        :param unicode_data_file: Path of the semicolon separated unicode data file.
        :return: List of letters, non_letter and separators.
        """

        # First code point of a pending `First`/`Last` pair, or None when no range is open.
        range_start_id = None

        with open(unicode_data_file) as unicode_data:
            for line in csv.reader(unicode_data, delimiter=';'):
                unicode_id = int(line[0], 16)

                # Skip supplementary planes and ascii chars
                if unicode_id >= 0x10000 or unicode_id < 128:
                    continue

                name = line[1]
                category = line[2]

                if range_start_id is not None:
                    # This is the closing line of a range. The first code point was
                    # already stored when the range was opened, so continue after it.
                    for range_member_id in range(range_start_id + 1, unicode_id + 1):
                        self._store_by_category(range_member_id, category)
                    range_start_id = None
                    continue

                # Only `<..., First>` opens a range; other bracketed names
                # (e.g. `<control>`) describe a single code point.
                if name.startswith('<') and name.endswith(', First>'):
                    range_start_id = unicode_id

                self._store_by_category(unicode_id, category)

        # This separator char is handled separately
        separators = self._categories['separators']
        non_breaking_space = 0x00A0
        if non_breaking_space in separators:
            separators.remove(non_breaking_space)

        # These separator chars are not in the unicode data file or not in Zs category
        mongolian_vowel_separator = 0x180E
        medium_mathematical_space = 0x205F
        zero_width_space = 0x200B

        for extra_separator in (mongolian_vowel_separator, medium_mathematical_space, zero_width_space):
            if extra_separator not in separators:
                bisect.insort(separators, extra_separator)

        # https://www.ecma-international.org/ecma-262/5.1/#sec-7.1 format-control characters
        non_letters = self._categories['non_letters']
        zero_width_non_joiner = 0x200C
        zero_width_joiner = 0x200D

        for format_control in (zero_width_non_joiner, zero_width_joiner):
            if format_control not in non_letters:
                bisect.insort(non_letters, format_control)

        return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
149
150
def group_ranges(i):
    """
    Collapse an increasing sequence of integers into inclusive (first, last) ranges.

    A new range is started whenever a value is not exactly one greater than
    the value before it.

    :return: Generator of (first, last) tuples.
    """
    range_open = False
    first = last = None

    for value in i:
        if range_open and value == last + 1:
            last = value
            continue

        if range_open:
            yield first, last

        first = last = value
        range_open = True

    if range_open:
        yield first, last
160
161
def split_list(category_list):
    """
    Split list of ranges into intervals and single char lists.

    An interval is stored as a starting point and a length, where the length is
    the distance between the last and the first code point. Lengths are limited
    to 255, so longer ranges are cut into consecutive intervals. A range (or a
    remainder of a cut range) that covers a single code point is stored in the
    single char list instead.

    :param category_list: List of inclusive (first, last) code point ranges.
    :return: List of interval starting points, interval lengths and single chars
    """

    interval_sps = []
    interval_lengths = []
    chars = []

    for first, last in category_list:
        # Peel off full intervals (256 code points each) until the rest fits.
        while last - first > 255:
            interval_sps.append(first)
            interval_lengths.append(255)
            first += 256

        # `first <= last` always holds here, so the tail is never lost, not even
        # when the range length is an exact multiple of 256.
        if first == last:
            chars.append(first)
        else:
            interval_sps.append(first)
            interval_lengths.append(last - first)

    return interval_sps, interval_lengths, chars
187
188
def generate_ranges(script_args):
    """
    Generate the C source with the letter, non-letter identifier part and separator tables.

    :param script_args: Parsed command line arguments; the unicode_data attribute is used.
    """
    letters, non_letters, separators = UnicodeCategorizer().read_categories(script_args.unicode_data)

    letter_sps, letter_lengths, letter_chars = split_list(list(group_ranges(letters)))
    non_letter_sps, non_letter_lengths, non_letter_chars = split_list(list(group_ranges(non_letters)))
    separator_sps, separator_lengths, separator_chars = split_list(list(group_ranges(separators)))

    banner = "\n".join([
        "/* This file is automatically generated by the %s script" % os.path.basename(__file__),
        " * from %s. Do not edit! */" % os.path.basename(script_args.unicode_data),
        "",
    ])

    # (values, table name, element type, description) in output order
    table_specs = [
        (letter_sps,
         "unicode_letter_interval_sps",
         "uint16_t",
         ("/**\n"
          " * Character interval starting points for the unicode letters.\n"
          " *\n"
          " * The characters covered by these intervals are from\n"
          " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
          " */")),
        (letter_lengths,
         "unicode_letter_interval_lengths",
         "uint8_t",
         ("/**\n"
          " * Character lengths for the unicode letters.\n"
          " *\n"
          " * The characters covered by these intervals are from\n"
          " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
          " */")),
        (letter_chars,
         "unicode_letter_chars",
         "uint16_t",
         ("/**\n"
          " * Those unicode letter characters that are not inside any of\n"
          " * the intervals specified in lit_unicode_letter_interval_sps array.\n"
          " *\n"
          " * The characters are from the following Unicode categories:\n"
          " * Lu, Ll, Lt, Lm, Lo, Nl\n"
          " */")),
        (non_letter_sps,
         "unicode_non_letter_ident_part_interval_sps",
         "uint16_t",
         ("/**\n"
          " * Character interval starting points for non-letter character\n"
          " * that can be used as a non-first character of an identifier.\n"
          " *\n"
          " * The characters covered by these intervals are from\n"
          " * the following Unicode categories: Nd, Mn, Mc, Pc\n"
          " */")),
        (non_letter_lengths,
         "unicode_non_letter_ident_part_interval_lengths",
         "uint8_t",
         ("/**\n"
          " * Character interval lengths for non-letter character\n"
          " * that can be used as a non-first character of an identifier.\n"
          " *\n"
          " * The characters covered by these intervals are from\n"
          " * the following Unicode categories: Nd, Mn, Mc, Pc\n"
          " */")),
        (non_letter_chars,
         "unicode_non_letter_ident_part_chars",
         "uint16_t",
         ("/**\n"
          " * Those non-letter characters that can be used as a non-first\n"
          " * character of an identifier and not included in any of the intervals\n"
          " * specified in lit_unicode_non_letter_ident_part_interval_sps array.\n"
          " *\n"
          " * The characters are from the following Unicode categories:\n"
          " * Nd, Mn, Mc, Pc\n"
          " */")),
        (separator_sps,
         "unicode_separator_char_interval_sps",
         "uint16_t",
         ("/**\n"
          " * Unicode separator character interval starting points from Unicode category: Zs\n"
          " */")),
        (separator_lengths,
         "unicode_separator_char_interval_lengths",
         "uint8_t",
         ("/**\n"
          " * Unicode separator character interval lengths from Unicode category: Zs\n"
          " */")),
        (separator_chars,
         "unicode_separator_chars",
         "uint16_t",
         ("/**\n"
          " * Unicode separator characters that are not in the\n"
          " * lit_unicode_separator_char_intervals array.\n"
          " *\n"
          " * Unicode category: Zs\n"
          " */")),
    ]

    c_source = UniCodeSource(RANGES_C_SOURCE)
    c_source.complete_header(banner)

    for table, table_name, table_type, table_descr in table_specs:
        c_source.add_table(table, table_name, table_type, table_descr)

    c_source.generate()
295
296
297# functions for unicode conversions
298
299
def parse_unicode_sequence(raw_data):
    """
    Turn a space separated list of hexadecimal code points into a string.

    :param raw_data: Hexadecimal code points (without 0x prefix) separated by spaces;
                     empty items produced by repeated or surrounding spaces are ignored.
    :return: The string built from the listed code points.
    """

    # Use unichr where it exists and fall back to chr where it does not.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    return ''.join(to_char(int(hex_val, 16)) for hex_val in raw_data.split(' ') if hex_val != '')
322
323
def read_case_mappings(unicode_data_file, special_casing_file):
    """
    Build the lower and upper case mapping tables from the two input files.

    Only code points in the 0x80..0xFFFF range are stored. Entries of the special
    casing file overwrite the one-to-one mappings read from the unicode data file.

    :param unicode_data_file: Contains the default case mappings (one-to-one mappings).
    :param special_casing_file: Contains additional informative case mappings that are either not one-to-one
                                or which are context-sensitive.
    :return: Lower and upper case mappings (code point -> mapped string dictionaries).
    """

    lower_case_mapping = {}
    upper_case_mapping = {}

    # One-to-one mappings: field 12 holds the uppercase, field 13 the lowercase variant
    with open(unicode_data_file) as unicode_data:
        for fields in csv.reader(unicode_data, delimiter=';'):
            letter_id = int(fields[0], 16)

            # Skip supplementary planes and ascii chars
            if not 128 <= letter_id < 0x10000:
                continue

            capital_letter, small_letter = fields[12], fields[13]

            if capital_letter:
                upper_case_mapping[letter_id] = parse_unicode_sequence(capital_letter)

            if small_letter:
                lower_case_mapping[letter_id] = parse_unicode_sequence(small_letter)

    # Special cases: field 1 holds the lowercase, field 3 the uppercase variant
    with open(special_casing_file) as special_casing:
        for fields in csv.reader(special_casing, delimiter=';'):
            # Skip comment sections and empty lines
            if not fields or fields[0].startswith('#'):
                continue

            # Blank out every field that contains a '#' character
            fields = ['' if '#' in field else field for field in fields]

            letter_id = int(fields[0], 16)
            condition_list = fields[4]

            # Skip supplementary planes, ascii chars, and conditional mappings
            if letter_id >= 0x10000 or letter_id < 128 or condition_list:
                continue

            small_letter = parse_unicode_sequence(fields[1])
            capital_letter = parse_unicode_sequence(fields[3])

            lower_case_mapping[letter_id] = small_letter
            upper_case_mapping[letter_id] = capital_letter

    return lower_case_mapping, upper_case_mapping
385
386
def extract_ranges(letter_case, reverse_letter_case=None):
    """
    Extract runs of consecutive code points whose one-to-one mappings share the same distance.

    When reverse_letter_case is given, only bidirectional conversions may be part of a run.
    The extracted entries are deleted from letter_case (and from reverse_letter_case, if given).

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A table with the start points and their mapped value, and another table with the lengths of the ranges.
    """

    ranges = []
    range_lengths = []
    extending = False

    for letter_id in sorted(letter_case):
        prev_letter_id = letter_id - 1

        if reverse_letter_case is None:
            # One-way conversions: both neighbours need a mapping of at most one character
            pair_is_usable = (len(letter_case[letter_id]) <= 1
                              and prev_letter_id in letter_case
                              and len(letter_case[prev_letter_id]) <= 1)
        else:
            # Two way conversions: both neighbours must convert back to themselves
            pair_is_usable = (is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case)
                              and is_bidirectional_conversion(prev_letter_id, letter_case, reverse_letter_case))

        if pair_is_usable:
            conv_distance = calculate_conversion_distance(letter_case, letter_id)
            prev_conv_distance = calculate_conversion_distance(letter_case, prev_letter_id)
            pair_is_usable = conv_distance == prev_conv_distance

        if not pair_is_usable:
            extending = False
            continue

        if extending:
            range_lengths[-1] += 1
            continue

        # A new range starts at the previous code point and covers two code points so far
        extending = True
        ranges.extend([prev_letter_id, ord(letter_case[prev_letter_id])])
        range_lengths.append(2)

    # Remove all ranges from the case mapping table.
    for start_id, mapped_start_id, range_length in zip(ranges[0::2], ranges[1::2], range_lengths):
        for incr in range(range_length):
            del letter_case[start_id + incr]
            if reverse_letter_case is not None:
                del reverse_letter_case[mapped_start_id + incr]

    return ranges, range_lengths
453
454
def extract_character_pair_ranges(letter_case, reverse_letter_case):
    """
    Extract runs of character pairs, i.e. bidirectional conversions that map a code point
    to the code point right after it, where the pairs directly follow each other.

    The extracted entries are deleted from both mapping tables.

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A table with the start points, and another table with the lengths of the ranges.
    """

    start_points = []
    lengths = []
    extending = False

    for letter_id in sorted(letter_case):
        # Only bidirectional conversions to the next code point are character pairs
        is_pair = (is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case)
                   and ord(letter_case[letter_id]) == letter_id + 1)

        if not is_pair:
            extending = False
            continue

        # The current range continues only if the slot two code points earlier is still bidirectional
        if extending and is_bidirectional_conversion(letter_id - 2, letter_case, reverse_letter_case):
            lengths[-1] += 2
        else:
            start_points.append(letter_id)
            lengths.append(2)
            extending = True

    # Remove all found case mapping from the conversion tables after the scanning method
    for start_id, conv_length in zip(start_points, lengths):
        for incr in range(0, conv_length, 2):
            del letter_case[start_id + incr]
            del reverse_letter_case[start_id + 1 + incr]

    return start_points, lengths
501
502
def extract_character_pairs(letter_case, reverse_letter_case):
    """
    Extract the conversions where two code points are the mapped value of each other.

    The extracted entries are deleted from both mapping tables.

    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which contains the return direction of the conversion.
    :return: A flat table of (code point, mapped code point) pairs.
    """

    character_pairs = []

    # Iterate over a sorted snapshot of the keys because entries are removed on the way
    for letter_id in sorted(letter_case):
        if not is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
            continue

        # Remove the pair from both case mapping tables
        mapped_id = ord(letter_case.pop(letter_id))
        del reverse_letter_case[mapped_id]

        character_pairs += [letter_id, mapped_id]

    return character_pairs
524
525
def _continues_special_range(letter_case, prev_letter_id, letter_id):
    """
    Tell whether prev_letter_id and letter_id are neighbouring members of one special range:
    both have a two character mapping with the same second character and the same distance
    between the code point and the first mapped character.
    """
    mapped_value = letter_case[letter_id]

    if len(mapped_value) != 2 or prev_letter_id not in letter_case:
        return False

    prev_mapped_value = letter_case[prev_letter_id]

    if len(prev_mapped_value) != 2 or prev_mapped_value[1] != mapped_value[1]:
        return False

    return (ord(prev_mapped_value[0]) - prev_letter_id) == (ord(mapped_value[0]) - letter_id)


def extract_special_ranges(letter_case):
    """
    Extract special ranges. It contains start points of one-to-two letter case ranges
    where the second character is always the same.

    The extracted entries are deleted from letter_case.

    :param letter_case: case mappings dictionary which contains the conversions.

    :return: A table with the start points and their mapped values, and a table with the lengths of the ranges.
    """

    special_ranges = []
    special_range_lengths = []

    # Must start out as False: the first matching neighbours always open a new range.
    in_range = False

    for letter_id in sorted(letter_case):
        prev_letter_id = letter_id - 1

        # Any mismatch closes the current range, so a later match cannot extend
        # a range it is not adjacent to.
        if not _continues_special_range(letter_case, prev_letter_id, letter_id):
            in_range = False
            continue

        if in_range:
            special_range_lengths[-1] += 1
        else:
            in_range = True
            prev_mapped_value = letter_case[prev_letter_id]

            special_ranges.extend([prev_letter_id, ord(prev_mapped_value[0]), ord(prev_mapped_value[1])])
            # NOTE(review): a range is opened with length 1 although it already spans two
            # code points, so the removal below keeps the last member of every range in
            # letter_case. Kept as is - confirm against the consumer of these tables.
            special_range_lengths.append(1)

    # Remove special ranges from the conversion table
    for idx, range_length in enumerate(special_range_lengths):
        start_id = special_ranges[idx * 3]

        for incr in range(range_length):
            del letter_case[start_id + incr]

    return special_ranges, special_range_lengths
583
584
def extract_conversions(letter_case):
    """
    Extract the remaining one-to-one, one-to-two and one-to-three character mappings.

    The conversion table lists all one-to-one entries first, then the one-to-two and
    finally the one-to-three entries; each entry is the code point followed by the code
    points of its mapped characters. The extracted entries are deleted from letter_case.

    :param letter_case: case mappings dictionary which contains the conversions.
    :return: A table with conversions, and a table with the number of entries per mapping length.
    """

    conversions = []
    counters = []

    for mapped_length in (1, 2, 3):
        entry_count = 0

        # Iterate over a sorted snapshot of the keys because entries are removed on the way
        for letter_id in sorted(letter_case):
            mapped_value = letter_case[letter_id]

            if len(mapped_value) != mapped_length:
                continue

            conversions.append(letter_id)
            conversions.extend(ord(mapped_char) for mapped_char in mapped_value)
            entry_count += 1
            del letter_case[letter_id]

        counters.append(entry_count)

    return conversions, counters
630
631
def is_bidirectional_conversion(letter_id, letter_case, reverse_letter_case):
    """
    Check that a code point and its mapped value convert into each other.

    :param letter_id: An integer, representing the unicode code point of the character.
    :param letter_case: case mappings dictionary which contains the conversions.
    :param reverse_letter_case: Comparable case mapping table which possibly contains
                                the return direction of the conversion.
    :return: True, if it's a reversible conversion, false otherwise.
    """

    # The forward direction must exist and must not be longer than one character
    if letter_id in letter_case and len(letter_case[letter_id]) <= 1:
        mapped_value_id = ord(letter_case[letter_id])

        # The return direction must lead back to the very same code point
        if mapped_value_id in reverse_letter_case:
            round_trip = reverse_letter_case[mapped_value_id]
            return len(round_trip) <= 1 and ord(round_trip) == letter_id

    return False
660
661
def calculate_conversion_distance(letter_case, letter_id):
    """
    Calculate how far the mapped value of a code point is from the code point itself
    (defined for one-to-one mappings only).

    :param letter_case: case mappings dictionary which contains the conversions.
    :param letter_id: An integer, representing the unicode code point of the character.
    :return: The conversion distance, or None if the code point has no mapping
             or its mapping is longer than one character.
    """

    if letter_id in letter_case:
        mapped_value = letter_case[letter_id]

        if len(mapped_value) <= 1:
            return ord(mapped_value) - letter_id

    return None
676
677
def generate_conversions(script_args):
    """
    Generate the C source with the lower/upper case conversion tables.

    :param script_args: Parsed command line arguments; the unicode_data and
                        special_casing attributes are used.
    """
    # Read the corresponding unicode values of lower and upper case letters and store these in tables
    lower_case, upper_case = read_case_mappings(script_args.unicode_data, script_args.special_casing)

    # Every extraction step removes what it found from the mapping tables,
    # so the order of these calls matters.
    case_ranges, case_range_lengths = extract_ranges(lower_case, upper_case)
    pair_ranges, pair_range_lengths = extract_character_pair_ranges(lower_case, upper_case)
    character_pairs = extract_character_pairs(lower_case, upper_case)
    upper_special_ranges, upper_special_range_lengths = extract_special_ranges(upper_case)
    lower_ranges, lower_range_lengths = extract_ranges(lower_case)
    lower_conversions, lower_conversion_counters = extract_conversions(lower_case)
    upper_conversions, upper_conversion_counters = extract_conversions(upper_case)

    if lower_case:
        warnings.warn('Not all elements extracted from the lowercase table!')
    if upper_case:
        warnings.warn('Not all elements extracted from the uppercase table!')

    unicode_file = os.path.basename(script_args.unicode_data)
    spec_casing_file = os.path.basename(script_args.special_casing)

    banner = "\n".join([
        "/* This file is automatically generated by the %s script" % os.path.basename(__file__),
        " * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
        "",
    ])

    # (values, table name, element type, description) in output order
    table_specs = [
        (case_ranges,
         "character_case_ranges",
         "uint16_t",
         ("/* Contains start points of character case ranges "
          "(these are bidirectional conversions). */")),
        (case_range_lengths,
         "character_case_range_lengths",
         "uint8_t",
         "/* Interval lengths of start points in `character_case_ranges` table. */"),
        (pair_ranges,
         "character_pair_ranges",
         "uint16_t",
         "/* Contains the start points of bidirectional conversion ranges. */"),
        (pair_range_lengths,
         "character_pair_range_lengths",
         "uint8_t",
         "/* Interval lengths of start points in `character_pair_ranges` table. */"),
        (character_pairs,
         "character_pairs",
         "uint16_t",
         "/* Contains lower/upper case bidirectional conversion pairs. */"),
        (upper_special_ranges,
         "upper_case_special_ranges",
         "uint16_t",
         ("/* Contains start points of one-to-two uppercase ranges where the second character\n"
          " * is always the same.\n"
          " */")),
        (upper_special_range_lengths,
         "upper_case_special_range_lengths",
         "uint8_t",
         "/* Interval lengths for start points in `upper_case_special_ranges` table. */"),
        (lower_ranges,
         "lower_case_ranges",
         "uint16_t",
         "/* Contains start points of lowercase ranges. */"),
        (lower_range_lengths,
         "lower_case_range_lengths",
         "uint8_t",
         "/* Interval lengths for start points in `lower_case_ranges` table. */"),
        (lower_conversions,
         "lower_case_conversions",
         "uint16_t",
         ("/* The remaining lowercase conversions. The lowercase variant can "
          "be one-to-three character long. */")),
        (lower_conversion_counters,
         "lower_case_conversion_counters",
         "uint8_t",
         "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */"),
        (upper_conversions,
         "upper_case_conversions",
         "uint16_t",
         ("/* The remaining uppercase conversions. The uppercase variant can "
          "be one-to-three character long. */")),
        (upper_conversion_counters,
         "upper_case_conversion_counters",
         "uint8_t",
         "/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */"),
    ]

    # Generate conversions output
    c_source = UniCodeSource(CONVERSIONS_C_SOURCE)
    c_source.complete_header(banner)

    for table, table_name, table_type, table_descr in table_specs:
        c_source.add_table(table, table_name, table_type, table_descr)

    c_source.generate()
780
781
782# entry point
783
784
def main():
    """
    Parse the command line, check the input files and generate both C sources.
    """
    parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}.inc.h generator',
                                     epilog='''
                                        The input files (UnicodeData.txt, SpecialCasing.txt)
                                        must be retrieved from
                                        http://www.unicode.org/Public/<VERSION>/ucd/.
                                        The last known good version is 13.0.0.
                                        ''')

    parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True,
                        help='specify the unicode data file')
    parser.add_argument('--special-casing', metavar='FILE', action='store', required=True,
                        help='specify the special casing file')

    script_args = parser.parse_args()

    # Both inputs must be existing, readable files; parser.error() reports the first one that is not.
    for input_file in (script_args.unicode_data, script_args.special_casing):
        if not (os.path.isfile(input_file) and os.access(input_file, os.R_OK)):
            parser.error('The %s file is missing or not readable!' % input_file)

    generate_ranges(script_args)
    generate_conversions(script_args)


if __name__ == "__main__":
    main()
813