#!/usr/bin/python3 -B
# -*- coding: utf-8 -*-
#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2013-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# parsescriptmetadata.py
#
# 2013feb15 Markus W. Scherer
#
# ./parsescriptmetadata.py
#   ~/svn.icu/trunk/src/source/common/unicode/uscript.h
#   ~/svn.cldr/trunk/common/properties/scriptMetadata.txt

"""Parses ICU4C uscript.h & CLDR scriptMetadata.txt,
and writes ICU script data initializers."""

import re
import sys

# Matches the tail of lines like
#   USCRIPT_ARABIC = 2, /* Arab */
# Group 1 is the ICU numeric script code, group 2 the ISO script code.
_SCRIPT_NUM_RE = re.compile(r" *= *([0-9]+), */\* *([A-Z][a-z]{3}) *\*/")

# Script metadata lines need at least this many ";"-separated fields,
# because the "cased" flag is read from field index 10.
_MIN_METADATA_FIELDS = 11


def _parse_uscript(lines):
    """Extracts the ISO-to-ICU script code mapping from uscript.h lines.

    Args:
        lines: Iterable of text lines (for example an open text file).

    Returns:
        A dict mapping each ISO script code string (such as "Arab") to its
        ICU numeric script code as an int. Empty lines, lines starting
        with "#", and lines that do not match the enum-constant pattern
        contribute nothing.
    """
    iso_to_icu = {}
    for line in lines:
        line = line.strip()
        if not line or line.startswith("#"):
            # Blank line or whole-line "#" directive.
            continue
        match = _SCRIPT_NUM_RE.search(line)
        if match:
            iso_to_icu[match.group(2)] = int(match.group(1))
    return iso_to_icu


def _parse_script_metadata(lines, iso_to_icu):
    """Collects per-script data from scriptMetadata.txt lines.

    Parses lines like
        Arab; 8; 0628; SA; 1; RECOMMENDED; YES; NO; YES; NO; NO
    Everything from the first "#" on a line is treated as a comment.
    Lines with fewer than 11 fields are skipped.

    Args:
        lines: Iterable of text lines (for example an open text file).
        iso_to_icu: Dict from ISO script code to ICU numeric script code.

    Returns:
        A list indexed by ICU numeric script code, sized to the largest
        code in iso_to_icu plus one (at least one element). Each element
        is either None (no metadata seen for that code) or a tuple of
        strings (iso_code, sample, usage, rtl, lb, cased).

    Raises:
        KeyError: A metadata line names a script code that is not a key
            of iso_to_icu.
    """
    icu_data = [None] * (max(iso_to_icu.values(), default=0) + 1)
    for line in lines:
        # Drop the trailing comment, if any, then surrounding whitespace.
        line = line.partition("#")[0].strip()
        if not line:
            continue
        fields = [field.strip() for field in line.split(";")]
        if len(fields) < _MIN_METADATA_FIELDS:
            continue
        iso_code = fields[0]
        try:
            icu_num = iso_to_icu[iso_code]
        except KeyError:
            # Same exception type as a plain dict lookup, but say what
            # is actually wrong instead of only echoing the key.
            raise KeyError(
                "script code {!r} in the script metadata has no numeric "
                "script code in uscript.h".format(iso_code)) from None
        icu_data[icu_num] = (
            iso_code,
            fields[2],   # sample
            fields[5],   # usage
            fields[6],   # RTL
            fields[7],   # LB
            fields[10],  # cased
        )
    return icu_data


def _format_initializer(entry):
    """Returns one output line for one slot of the ICU data list.

    Args:
        entry: None, or an (iso_code, sample, usage, rtl, lb, cased)
            tuple of strings.

    Returns:
        " 0," for an empty slot; otherwise the sample code point in hex
        OR-ed with the usage and with a flag name for each of rtl/lb/cased
        that equals "YES", followed by the ISO code as a "//" comment.
    """
    # NOTE(review): the whitespace inside these literals is kept exactly
    # as found in the reviewed text; confirm it matches the indentation
    # wanted in the generated initializer list.
    if not entry:
        return " 0,"
    (iso_code, sample, usage, rtl, lb, cased) = entry
    parts = ["0x" + sample, usage]
    if rtl == "YES":
        parts.append("RTL")
    if lb == "YES":
        parts.append("LB_LETTERS")
    if cased == "YES":
        parts.append("CASED")
    return " " + " | ".join(parts) + ", // " + iso_code


def main():
    """Reads the two paths from sys.argv and prints the initializers.

    sys.argv[1] is the path to ICU4C uscript.h and sys.argv[2] the path
    to CLDR scriptMetadata.txt. With fewer than two path arguments, a
    usage message is printed and nothing else happens. Output goes to
    standard output, one line per ICU numeric script code starting at 0.

    Raises:
        KeyError: The metadata names a script code missing from uscript.h.
        OSError: An input file cannot be opened.
    """
    if len(sys.argv) < 3:
        print("Usage: {} path/to/ICU4C/uscript.h "
              "path/to/CLDR/scriptMetadata.txt".format(sys.argv[0]))
        return
    (uscript_path, smd_path) = sys.argv[1:3]

    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # locale's default encoding; the metadata file can carry non-ASCII
    # sample characters in its comments.
    with open(uscript_path, "r", encoding="utf-8") as uscript_file:
        iso_to_icu = _parse_uscript(uscript_file)
    with open(smd_path, "r", encoding="utf-8") as smd_file:
        icu_data = _parse_script_metadata(smd_file, iso_to_icu)

    # Print ICU array initializers with the relevant data.
    for entry in icu_data:
        print(_format_initializer(entry))


if __name__ == "__main__":
    main()