• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2#
3# Copyright 2016 The Android Open Source Project. All Rights Reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11#    Unless required by applicable law or agreed to in writing, software
12#    distributed under the License is distributed on an "AS IS" BASIS,
13#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14#    See the License for the specific language governing permissions and
15#    limitations under the License.
16#
17
18"""Generate a C++ data table containing locale data."""
19
20import collections
21import glob
22import os.path
23import sys
24
25import xml.etree.ElementTree as ElementTree
26
27
def get_locale_parts(locale):
    """Split an underscore-separated locale into (language, script, region).

    Components that are absent from the locale are returned as None. When
    only two components are present, a four-character second component is
    treated as a script and anything else as a region. More than three
    components trips an assertion.
    """
    pieces = locale.split('_')
    count = len(pieces)
    if count == 1:
        return (pieces[0], None, None)
    if count == 2:
        language, second = pieces
        # Four characters means the second subtag is a script code.
        if len(second) == 4:
            return (language, second, None)
        return (language, None, second)
    assert count == 3
    return tuple(pieces)
41
42
def read_likely_subtags(input_file_name):
    """Parse a likely-subtags XML file.

    Every child of the 'likelySubtags' element is read through its 'from'
    and 'to' attributes. Entries whose 'to' value is "FAIL" or whose 'from'
    language is 'und' are ignored.

    Returns a pair:
      * a dict mapping each script-less 'from' locale to the script of its
        'to' locale (pre-seeded with a few Android-specific entries), and
      * a frozenset of representative locales: the 'to' locales of entries
        whose 'from' locale has no region and whose 'to' region is neither
        '001' nor 'ZZ' (plus a few Android-specific entries).
    """
    script_by_locale = {
        # Android's additions for pseudo-locales. These internal codes make
        # sure that the pseudo-locales would not match other English or
        # Arabic locales. (We can't use private-use ISO 15924 codes, since
        # they may be used by apps for other purposes.)
        "en_XA": "~~~A",
        "ar_XB": "~~~B",
        # Removed data from later versions of ICU
        "ji": "Hebr",  # Old code for Yiddish, still used in Java and Android
    }
    representatives = {
        # Android's additions
        "en_Latn_GB",  # representative for en_Latn_001
        "es_Latn_MX",  # representative for es_Latn_419
        "es_Latn_US",  # representative for es_Latn_419 (not the best idea,
                       # but Android has been shipping with it for quite a
                       # while. Fortunately, MX < US, so if both exist, MX
                       # would be chosen.)
    }
    # Target regions that never make a locale "representative".
    excluded_regions = ('001', 'ZZ')

    document = ElementTree.parse(input_file_name)
    for entry in document.find('likelySubtags'):
        source = entry.get('from')
        target = entry.get('to')
        source_parts = get_locale_parts(source)
        target_parts = get_locale_parts(target)
        # "FAIL" targets and undetermined-language sources are not useful.
        if target == "FAIL" or source_parts[0] == 'und':
            continue
        source_script, source_region = source_parts[1], source_parts[2]
        target_script, target_region = target_parts[1], target_parts[2]
        if source_region is None and target_region not in excluded_regions:
            representatives.add(target)
        if source_script is None:
            script_by_locale[source] = target_script

    return script_by_locale, frozenset(representatives)
82
83
84# From packLanguageOrRegion() in ResourceTypes.cpp
def pack_language_or_region(inp, base):
    """Pack a language or region code into a two-byte tuple.

    Args:
        inp: None, a two-character code, or a three-character code.
        base: the character subtracted from each character of a
            three-character code before packing.

    Returns:
        (0, 0) for None; the two code points unchanged for a two-character
        code; otherwise two bytes with the high bit of the first byte set
        and the three base-relative values bit-packed behind it.

    Raises:
        AssertionError: if inp is neither None nor two or three characters
            long.
    """
    if inp is None:
        return (0, 0)
    elif len(inp) == 2:
        return ord(inp[0]), ord(inp[1])
    else:
        assert len(inp) == 3, (
            f'Expects a 3-character string, but "{inp}" was given.')
        base = ord(base)
        first = ord(inp[0]) - base
        second = ord(inp[1]) - base
        third = ord(inp[2]) - base

        # Byte 0: marker bit 0x80, then `third`, then the upper bits of
        # `second`. Byte 1: the lower bits of `second` followed by `first`,
        # truncated to eight bits.
        return (0x80 | (third << 2) | (second >> 3),
                ((second << 5) | first) & 0xFF)
100
101
102# From packLanguage() in ResourceTypes.cpp
def pack_language(language):
    """Return the two-byte packed form of a language code.

    Three-letter codes are packed relative to the letter 'a'.
    """
    packed = pack_language_or_region(language, base='a')
    return packed
106
107
108# From packRegion() in ResourceTypes.cpp
def pack_region(region):
    """Return the two-byte packed form of a region code.

    Three-character codes are packed relative to the digit '0'.
    """
    packed = pack_language_or_region(region, base='0')
    return packed
112
113
def pack_to_uint32(locale):
    """Combine a locale's packed language and region into one integer.

    The two language bytes occupy the upper 16 bits and the two region
    bytes the lower 16 bits; the script, if any, is ignored.
    """
    language, _, region = get_locale_parts(locale)
    language_bytes = pack_language(language)
    region_bytes = pack_region(region)
    packed = 0
    # Append the four bytes most-significant first.
    for byte in (language_bytes[0], language_bytes[1],
                 region_bytes[0], region_bytes[1]):
        packed = (packed << 8) | byte
    return packed
120
121
def dump_script_codes(all_scripts):
    """Print the SCRIPT_CODES C++ array to standard output.

    Each script contributes one row made of its first four characters,
    annotated with its position in all_scripts. A blank line follows the
    closing brace.
    """
    row_template = "    /* %-2d */ {'%c', '%c', '%c', '%c'},"
    print('const char SCRIPT_CODES[][4] = {')
    for position, code in enumerate(all_scripts):
        row = row_template % (position, code[0], code[1], code[2], code[3])
        print(row)
    print('};')
    print()
130
131
def dump_script_data(likely_script_dict, all_scripts):
    """Print the LIKELY_SCRIPTS C++ map to standard output.

    Locales are emitted in sorted order. Each row maps the packed 32-bit
    locale to the position of its likely script within all_scripts, with a
    trailing comment showing the locale (hyphenated) and the script.
    """
    row_template = '    {0x%08Xu, %2du}, // %s -> %s'
    print()
    print('const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({')
    for locale in sorted(likely_script_dict):
        script = likely_script_dict[locale]
        packed_locale = pack_to_uint32(locale)
        script_position = all_scripts.index(script)
        hyphenated = locale.replace('_', '-')
        print(row_template % (
            packed_locale, script_position, hyphenated, script))
    print('});')
144
145
def pack_to_uint64(locale):
    """Pack a locale, including its script, into a single integer.

    The upper 32 bits hold the packed language and region; the lower 32
    bits hold the code points of the first four script characters, first
    character most significant.
    """
    _, script, _ = get_locale_parts(locale)
    upper_half = pack_to_uint32(locale) << 32
    lower_half = 0
    for position in range(4):
        lower_half = (lower_half << 8) | ord(script[position])
    return upper_half | lower_half
154
155
def dump_representative_locales(representative_locales):
    """Print the REPRESENTATIVE_LOCALES C++ set to standard output.

    Locales are emitted in sorted order, one packed 64-bit value per row,
    each followed by a comment naming the locale.
    """
    row_template = '    0x%08XLLU, // %s'
    print()
    print('std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({')
    for locale in sorted(representative_locales):
        packed_locale = pack_to_uint64(locale)
        print(row_template % (packed_locale, locale))
    print('});')
165
166
def read_and_dump_likely_data(cldr_source_dir):
    """Read the likely-subtags data and print the tables derived from it.

    Loads common/supplemental/likelySubtags.xml under cldr_source_dir, then
    prints the script-code table, the likely-script map and the set of
    representative locales. Returns the likely-script dictionary.
    """
    likely_subtags_path = os.path.join(
        cldr_source_dir, 'common', 'supplemental', 'likelySubtags.xml')
    likely_script_dict, representative_locales = read_likely_subtags(
        likely_subtags_path)

    distinct_scripts = set(likely_script_dict.values())
    # Scripts are emitted as uint8_t indices, so at most 256 may exist.
    assert len(distinct_scripts) <= 256
    all_scripts = sorted(distinct_scripts)

    dump_script_codes(all_scripts)
    dump_script_data(likely_script_dict, all_scripts)
    dump_representative_locales(representative_locales)
    return likely_script_dict
182
def escape_script_variable_name(script):
    """Replace every '~' in script with '_' so it can be used in a C++ name."""
    return "_".join(script.split("~"))
186
def read_parent_data(icu_data_dir):
    """Read locale parent data from ICU data files.

    Scans every '*.txt' file exactly one directory level below
    icu_data_dir. The locale is the file's base name without extension.

    Args:
        icu_data_dir: directory whose immediate subdirectories hold the
            data files.

    Returns:
        A dict mapping a locale to its parent locale. The parent is the
        text between the first and last double quote on a line containing
        '%%Parent'. Locales starting with 'ar_' whose file has a line
        containing 'default{"latn"}' are mapped to 'ar_015'.

    Raises:
        AssertionError: if a '%%Parent' line disagrees with a parent
            already recorded for the same locale.
    """
    # glob returns matches in arbitrary order; sorting keeps runs
    # reproducible, including which of two conflicting entries is seen
    # first.
    all_icu_data_files = sorted(
        glob.glob(os.path.join(icu_data_dir, '*', '*.txt')))
    parent_dict = {}
    for data_file in all_icu_data_files:
        locale = os.path.splitext(os.path.basename(data_file))[0]
        # Decode explicitly instead of relying on the host's locale
        # encoding, which may reject non-ASCII bytes.
        # NOTE(review): assumes the data files are UTF-8 -- confirm.
        with open(data_file, encoding='utf-8') as input_file:
            for line in input_file:
                if '%%Parent' in line:
                    parent = line[line.index('"')+1:line.rindex('"')]
                    if locale in parent_dict:
                        # Different files shouldn't have different parent info
                        assert parent_dict[locale] == parent
                    else:
                        parent_dict[locale] = parent
                elif locale.startswith('ar_') and 'default{"latn"}' in line:
                    # Arabic parent overrides for ASCII digits. Since
                    # Unicode extensions are not supported in ResourceTypes,
                    # we will use ar-015 (Arabic, Northern Africa) instead
                    # of the more correct ar-u-nu-latn.
                    parent_dict[locale] = 'ar_015'
    return parent_dict
209
210
def get_likely_script(locale, likely_script_dict):
    """Return the script to use for locale.

    A locale with three underscore-separated components already carries
    its script, which is returned directly. Otherwise the whole locale is
    looked up in likely_script_dict, falling back to its language alone.
    A KeyError is raised when the language is missing from the dictionary.
    """
    subtags = locale.split('_')
    if len(subtags) == 3:
        # language_script_region: the script is spelled out.
        return subtags[1]
    if locale in likely_script_dict:
        return likely_script_dict[locale]
    return likely_script_dict[subtags[0]]
222
223
def dump_parent_data(script_organized_dict):
    """Print the per-script parent maps and the SCRIPT_PARENTS index.

    script_organized_dict maps a script to a dict of locale -> parent. For
    every script, in sorted order, a '<SCRIPT>_PARENTS' C++ map of packed
    locale to packed parent is printed; afterwards a SCRIPT_PARENTS array
    is printed that pairs each script's four characters with a pointer to
    its map.
    """
    sorted_scripts = sorted(script_organized_dict)
    entry_template = '    {0x%08Xu, 0x%08Xu}, // %s -> %s'

    print()
    for script in sorted_scripts:
        parents = script_organized_dict[script]
        map_name = escape_script_variable_name(script.upper())
        print('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
              % map_name)
        for locale in sorted(parents):
            parent = parents[locale]
            packed_locale = pack_to_uint32(locale)
            packed_parent = pack_to_uint32(parent)
            print(entry_template % (
                packed_locale,
                packed_parent,
                locale.replace('_', '-'),
                parent.replace('_', '-')))
        print('});')
        print()

    print('\n'.join([
        'const struct {',
        '    const char script[4];',
        '    const std::unordered_map<uint32_t, uint32_t>* map;',
        '} SCRIPT_PARENTS[] = {',
    ]))
    for script in sorted_scripts:
        map_name = escape_script_variable_name(script.upper())
        print("    {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
            script[0], script[1], script[2], script[3], map_name))
    print('};')
251
252
def dump_parent_tree_depth(parent_dict):
    """Print MAX_PARENT_DEPTH, the length of the longest parent chain.

    A chain's depth counts the starting locale plus every ancestor reached
    by following parent_dict until a locale with no recorded parent; an
    empty dictionary yields 1. The depth must stay below 5.
    """
    deepest = 1
    for start in parent_dict:
        current = start
        hops = 1
        while current in parent_dict:
            current = parent_dict[current]
            hops += 1
        if hops > deepest:
            deepest = hops
    assert deepest < 5  # Our algorithms assume small max_depth
    print()
    print('const size_t MAX_PARENT_DEPTH = %d;' % deepest)
265
266
def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
    """Read the locale parent data and print the tables derived from it.

    Locales whose parent is 'root' are left out of the per-script maps;
    the remaining ones are grouped by their likely script. The maximum
    parent depth is computed over the complete parent data.
    """
    parent_dict = read_parent_data(icu_data_dir)
    parents_by_script = collections.defaultdict(dict)
    for locale, parent in parent_dict.items():
        if parent != 'root':
            script = get_likely_script(locale, likely_script_dict)
            parents_by_script[script][locale] = parent
    dump_parent_data(parents_by_script)
    dump_parent_tree_depth(parent_dict)
279
280
def main():
    """Read the ICU and CLDR data files and print the generated C++ tables.

    The first command-line argument is the source root; the ICU data is
    read from external/icu/icu4c/source/data beneath it and the CLDR data
    from external/cldr. All output goes to standard output.

    Exits with a usage message on standard error when the source root
    argument is missing.
    """
    if len(sys.argv) < 2:
        program = sys.argv[0] if sys.argv else 'script'
        # Fail with a readable message rather than a bare IndexError.
        sys.exit('Usage: %s <source-root>' % program)

    source_root = sys.argv[1]
    icu_data_dir = os.path.join(
        source_root,
        'external', 'icu', 'icu4c', 'source', 'data')
    cldr_source_dir = os.path.join(source_root, 'external', 'cldr')

    print('// Auto-generated by %s' % sys.argv[0])
    print()
    likely_script_dict = read_and_dump_likely_data(cldr_source_dir)
    read_and_dump_parent_data(icu_data_dir, likely_script_dict)


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
297