#!/usr/bin/env python3
#
# Copyright 2016 The Android Open Source Project. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Generate a C++ data table containing locale data."""

import collections
import glob
import os.path
import sys

import xml.etree.ElementTree as ElementTree


def get_locale_parts(locale):
    """Split a locale into three parts, for language, script, and region.

    The locale is split on '_'. A missing script or region is returned as
    None. In a two-part locale, a four-character second part is treated as
    a script; anything else is treated as a region.

    Returns:
        A (language, script, region) tuple.
    """
    parts = locale.split('_')
    if len(parts) == 1:
        return (parts[0], None, None)
    elif len(parts) == 2:
        if len(parts[1]) == 4:  # parts[1] is a script
            return (parts[0], parts[1], None)
        else:
            return (parts[0], None, parts[1])
    else:
        assert len(parts) == 3
        return tuple(parts)


def read_likely_subtags(input_file_name):
    """Read and parse the likely-subtags XML file.

    Args:
        input_file_name: path of the XML file holding a 'likelySubtags'
            element whose children carry 'from' and 'to' attributes.

    Returns:
        A (likely_script_dict, representative_locales) tuple:
        likely_script_dict maps a script-less 'from' locale to the script
        of its 'to' locale; representative_locales is a frozenset of 'to'
        locales chosen as representatives.
    """
    likely_script_dict = {
        # Android's additions for pseudo-locales. These internal codes make
        # sure that the pseudo-locales would not match other English or
        # Arabic locales. (We can't use private-use ISO 15924 codes, since
        # they may be used by apps for other purposes.)
        "en_XA": "~~~A",
        "ar_XB": "~~~B",
        # Removed data from later versions of ICU
        "ji": "Hebr",  # Old code for Yiddish, still used in Java and Android
    }
    representative_locales = {
        # Android's additions
        "en_Latn_GB",  # representative for en_Latn_001
        "es_Latn_MX",  # representative for es_Latn_419
        "es_Latn_US",  # representative for es_Latn_419 (not the best idea,
                       # but Android has been shipping with it for quite a
                       # while. Fortunately, MX < US, so if both exist, MX
                       # would be chosen.)
    }
    xml_tree = ElementTree.parse(input_file_name)
    likely_subtags = xml_tree.find('likelySubtags')
    for child in likely_subtags:
        from_locale = child.get('from')
        to_locale = child.get('to')
        from_lang, from_scr, from_region = get_locale_parts(from_locale)
        _, to_scr, to_region = get_locale_parts(to_locale)
        if to_locale == "FAIL":
            continue  # "FAIL" cases are not useful here.
        if from_lang == 'und':
            continue  # not very useful for our purposes
        if from_region is None and to_region not in ['001', 'ZZ']:
            representative_locales.add(to_locale)
        if from_scr is None:
            likely_script_dict[from_locale] = to_scr

    return likely_script_dict, frozenset(representative_locales)


# From packLanguageOrRegion() in ResourceTypes.cpp
def pack_language_or_region(inp, base):
    """Pack language or region in a two-byte tuple.

    Args:
        inp: None, a two-character code, or a three-character code.
        base: the character subtracted from each character of a
            three-character code before packing.

    Returns:
        (0, 0) for None; the two character ordinals for a two-character
        code; otherwise two bytes packing the three base-relative values,
        with the high bit of the first byte set.
    """
    if inp is None:
        return (0, 0)
    elif len(inp) == 2:
        return ord(inp[0]), ord(inp[1])
    else:
        assert len(inp) == 3, (
            f'Expects a 3-character string, but "{inp}" was given')
        base = ord(base)
        first = ord(inp[0]) - base
        second = ord(inp[1]) - base
        third = ord(inp[2]) - base

        # The second value straddles both bytes: its high bits end the
        # first byte, its low bits start the second (masked to 8 bits).
        return (0x80 | (third << 2) | (second >> 3),
                ((second << 5) | first) & 0xFF)


# From packLanguage() in ResourceTypes.cpp
def pack_language(language):
    """Pack language in a two-byte tuple."""
    return pack_language_or_region(language, 'a')


# From packRegion() in ResourceTypes.cpp
def pack_region(region):
    """Pack region in a two-byte tuple."""
    return pack_language_or_region(region, '0')


def pack_to_uint32(locale):
    """Pack language+region of locale into a 32-bit unsigned integer.

    The two language bytes occupy the high 16 bits and the two region
    bytes the low 16 bits. The script, if any, is ignored.
    """
    lang, _, region = get_locale_parts(locale)
    plang = pack_language(lang)
    pregion = pack_region(region)
    return (plang[0] << 24) | (plang[1] << 16) | (pregion[0] << 8) | pregion[1]


def dump_script_codes(all_scripts):
    """Dump the SCRIPT_CODES table.

    Each entry of all_scripts is printed as four characters, annotated
    with its index in the sequence.
    """
    print('const char SCRIPT_CODES[][4] = {')
    for index, script in enumerate(all_scripts):
        print(" /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
            index, script[0], script[1], script[2], script[3]))
    print('};')
    print()


def dump_script_data(likely_script_dict, all_scripts):
    """Dump the script data.

    Prints the LIKELY_SCRIPTS map from the packed 32-bit locale to the
    index of its likely script in all_scripts, in sorted locale order.
    """
    print()
    print('const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({')
    for locale in sorted(likely_script_dict.keys()):
        script = likely_script_dict[locale]
        print(' {0x%08Xu, %2du}, // %s -> %s' % (
            pack_to_uint32(locale),
            all_scripts.index(script),
            locale.replace('_', '-'),
            script))
    print('});')


def pack_to_uint64(locale):
    """Pack a full locale into a 64-bit unsigned integer.

    The packed language+region fills the high 32 bits and the four script
    characters fill the low 32 bits, so the locale must include a script.
    """
    _, script, _ = get_locale_parts(locale)
    return ((pack_to_uint32(locale) << 32) |
            (ord(script[0]) << 24) |
            (ord(script[1]) << 16) |
            (ord(script[2]) << 8) |
            ord(script[3]))


def dump_representative_locales(representative_locales):
    """Dump the set of representative locales."""
    print()
    print('std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({')
    for locale in sorted(representative_locales):
        print(' 0x%08XLLU, // %s' % (
            pack_to_uint64(locale),
            locale))
    print('});')


def read_and_dump_likely_data(cldr_source_dir):
    """Read and dump the likely-script data.

    Reads common/supplemental/likelySubtags.xml under cldr_source_dir and
    prints the script-code table, the likely-script map and the
    representative-locale set.

    Returns:
        The likely-script dictionary, for use when dumping parent data.
    """
    likely_subtags_xml = os.path.join(cldr_source_dir,
                                      'common', 'supplemental',
                                      'likelySubtags.xml')
    likely_script_dict, representative_locales = read_likely_subtags(
        likely_subtags_xml)

    all_scripts = list(set(likely_script_dict.values()))
    # Script indices are emitted as uint8_t values.
    assert len(all_scripts) <= 256
    all_scripts.sort()

    dump_script_codes(all_scripts)
    dump_script_data(likely_script_dict, all_scripts)
    dump_representative_locales(representative_locales)
    return likely_script_dict


def escape_script_variable_name(script):
    """Escape characters, e.g. '~', in a C++ variable name."""
    return script.replace("~", "_")


def read_parent_data(icu_data_dir):
    """Read locale parent data from ICU data files.

    Scans every '*.txt' file one directory level below icu_data_dir. The
    locale is the file's base name without its extension.

    Returns:
        A dict mapping a locale to its parent locale.
    """
    all_icu_data_files = glob.glob(os.path.join(icu_data_dir, '*', '*.txt'))
    parent_dict = {}
    for data_file in all_icu_data_files:
        locale = os.path.splitext(os.path.basename(data_file))[0]
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(data_file, encoding='utf-8') as input_file:
            for line in input_file:
                if '%%Parent' in line:
                    # The parent is the text between the first and the
                    # last double quote on the line.
                    parent = line[line.index('"')+1:line.rindex('"')]
                    if locale in parent_dict:
                        # Different files shouldn't have different parent info
                        assert parent_dict[locale] == parent
                    else:
                        parent_dict[locale] = parent
                elif locale.startswith('ar_') and 'default{"latn"}' in line:
                    # Arabic parent overrides for ASCII digits. Since
                    # Unicode extensions are not supported in ResourceTypes,
                    # we will use ar-015 (Arabic, Northern Africa) instead
                    # of the more correct ar-u-nu-latn.
                    parent_dict[locale] = 'ar_015'
    return parent_dict


def get_likely_script(locale, likely_script_dict):
    """Find the likely script for a locale, given the likely-script dictionary.

    A three-part locale already names its script. Otherwise the full
    locale is looked up first, then its language alone; a KeyError is
    raised if neither is in the dictionary.
    """
    if locale.count('_') == 2:
        # it already has a script
        return locale.split('_')[1]
    elif locale in likely_script_dict:
        return likely_script_dict[locale]
    else:
        language = locale.split('_')[0]
        return likely_script_dict[language]


def dump_parent_data(script_organized_dict):
    """Dump information for parents of locales.

    Args:
        script_organized_dict: maps a script to a dict of locale -> parent.

    Prints one <SCRIPT>_PARENTS map per script, followed by the
    SCRIPT_PARENTS table that points at each of those maps.
    """
    sorted_scripts = sorted(script_organized_dict.keys())
    print()
    for script in sorted_scripts:
        parent_dict = script_organized_dict[script]
        print('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
              % escape_script_variable_name(script.upper()))
        for locale in sorted(parent_dict.keys()):
            parent = parent_dict[locale]
            print(' {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
                pack_to_uint32(locale),
                pack_to_uint32(parent),
                locale.replace('_', '-'),
                parent.replace('_', '-')))
        print('});')
        print()

    print('const struct {')
    print(' const char script[4];')
    print(' const std::unordered_map<uint32_t, uint32_t>* map;')
    print('} SCRIPT_PARENTS[] = {')
    for script in sorted_scripts:
        print(" {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
            script[0], script[1], script[2], script[3],
            escape_script_variable_name(script.upper())))
    print('};')


def dump_parent_tree_depth(parent_dict):
    """Find and dump the depth of the parent tree.

    The depth of a locale is one plus the number of parent links followed
    before reaching a locale that has no entry in parent_dict.
    """
    max_depth = 1
    for locale in parent_dict:
        depth = 1
        while locale in parent_dict:
            locale = parent_dict[locale]
            depth += 1
        max_depth = max(max_depth, depth)
    assert max_depth < 5  # Our algorithms assume small max_depth
    print()
    print('const size_t MAX_PARENT_DEPTH = %d;' % max_depth)


def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
    """Read parent data from ICU and dump it.

    Locales whose parent is 'root' are left out of the per-script maps,
    but the tree depth is computed over all parent data read.
    """
    parent_dict = read_parent_data(icu_data_dir)
    script_organized_dict = collections.defaultdict(dict)
    for locale in parent_dict:
        parent = parent_dict[locale]
        if parent == 'root':
            continue
        script = get_likely_script(locale, likely_script_dict)
        script_organized_dict[script][locale] = parent_dict[locale]
    dump_parent_data(script_organized_dict)
    dump_parent_tree_depth(parent_dict)


def main():
    """Read the data files from ICU and dump the output to a C++ file.

    The first command-line argument is the source root, under which
    external/icu/icu4c/source/data and external/cldr are read. The
    generated C++ is written to standard output.
    """
    if len(sys.argv) < 2:
        # Fail with a usage message rather than a bare IndexError.
        sys.exit('usage: %s <source-root>' % sys.argv[0])
    source_root = sys.argv[1]
    icu_data_dir = os.path.join(
        source_root,
        'external', 'icu', 'icu4c', 'source', 'data')
    cldr_source_dir = os.path.join(source_root, 'external', 'cldr')

    print('// Auto-generated by %s' % sys.argv[0])
    print()
    likely_script_dict = read_and_dump_likely_data(cldr_source_dir)
    read_and_dump_parent_data(icu_data_dir, likely_script_dict)


if __name__ == '__main__':
    main()