#!/usr/bin/python3 -B
# -*- coding: utf-8 -*-
#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2013-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# parsescriptmetadata.py
#
# 2013feb15 Markus W. Scherer
#
# ./parsescriptmetadata.py
#   ~/svn.icu/trunk/src/source/common/unicode/uscript.h
#   ~/svn.cldr/trunk/common/properties/scriptMetadata.txt

"""Parses ICU4C uscript.h & CLDR scriptMetadata.txt,
and writes ICU script data initializers."""
20
import re
import sys
23
def _parse_uscript(uscript_path):
  """Returns a dict mapping ISO script codes to ICU numeric script codes.

  Scans the file at uscript_path for lines like
    USCRIPT_ARABIC       =  2,  /* Arab */
  Blank lines, lines starting with "#", and lines that do not contain
  such a "= number, /* Code */" pattern are ignored.
  """
  script_num_re = re.compile(r" *= *([0-9]+), */\* *([A-Z][a-z]{3}) *\*/")
  iso_to_icu = {}
  # Explicit encoding so that parsing does not depend on the locale default.
  with open(uscript_path, "r", encoding="utf-8") as uscript_file:
    for line in uscript_file:
      line = line.strip()
      if not line or line.startswith("#"): continue
      match = script_num_re.search(line)
      if match:
        iso_to_icu[match.group(2)] = int(match.group(1))
  return iso_to_icu


def _parse_script_metadata(smd_path, iso_to_icu):
  """Returns a list, indexed by ICU script number, of per-script data tuples.

  Reads lines like
    Arab; 8; 0628; SA; 1; RECOMMENDED; YES; NO; YES; NO; NO
  from the file at smd_path. Text from "#" to the end of a line is dropped,
  and lines with fewer than 11 semicolon-separated fields are ignored.

  Each list item is either None (no data for that ICU script number) or a
  tuple of strings (iso_code, sample, usage, rtl, lb, cased), taken from
  fields 0, 2, 5, 6, 7 and 10 of the line.

  A script code that is not a key of iso_to_icu has no slot in the list;
  it is reported on stderr and skipped rather than raising KeyError.
  """
  # One slot per ICU script number, 0 through the largest one known.
  icu_data = [None] * (max(iso_to_icu.values(), default=0) + 1)
  with open(smd_path, "r", encoding="utf-8") as smd_file:
    for line_num, line in enumerate(smd_file, 1):
      line = line.partition("#")[0].strip()
      if not line: continue

      fields = [field.strip() for field in line.split(";")]
      if len(fields) < 11: continue
      iso_code = fields[0]
      icu_num = iso_to_icu.get(iso_code)
      if icu_num is None:
        print("warning: {}:{}: script {} has no ICU script code; "
              "skipping".format(smd_path, line_num, iso_code),
              file=sys.stderr)
        continue
      icu_data[icu_num] = (iso_code,
          # sample, usage
          fields[2], fields[5],
          # RTL, LB, cased
          fields[6], fields[7], fields[10])
  return icu_data


def _format_initializer(entry):
  """Returns one array initializer line for a data tuple, or for None."""
  if not entry: return "    0,"
  (iso_code, sample, usage, rtl, lb, cased) = entry
  parts = ["0x" + sample, usage]
  if rtl == "YES": parts.append("RTL")
  if lb == "YES": parts.append("LB_LETTERS")
  if cased == "YES": parts.append("CASED")
  return "    " + " | ".join(parts) + ",  // " + iso_code


def main():
  """Prints ICU script data array initializers to stdout.

  Takes two command-line arguments from sys.argv: the path to ICU4C
  uscript.h and the path to CLDR scriptMetadata.txt. Prints one initializer
  line per ICU script number, in ascending order; numbers without metadata
  get "0,".

  Returns:
    0 on success; 1 if fewer than two arguments were given, in which case
    only a usage message is printed. The usage message goes to stderr so
    that it cannot end up in a redirected output file.
  """
  if len(sys.argv) < 3:
    print("Usage: {}  path/to/ICU4C/uscript.h  "
          "path/to/CLDR/scriptMetadata.txt".format(sys.argv[0]),
          file=sys.stderr)
    return 1
  (uscript_path, smd_path) = sys.argv[1:3]

  iso_to_icu = _parse_uscript(uscript_path)
  for entry in _parse_script_metadata(smd_path, iso_to_icu):
    print(_format_initializer(entry))
  return 0
83
# Run only when executed as a script, and pass main()'s result on as the
# process exit status (sys.exit treats None as success).
if __name__ == "__main__":
  sys.exit(main())
86