1#!/usr/bin/env python 2# -*- coding: utf-8 3# 4# Copyright 2015 The Rust Project Developers. See the COPYRIGHT 5# file at the top-level directory of this distribution and at 6# http://rust-lang.org/COPYRIGHT. 7# 8# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 9# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 10# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 11# option. This file may not be copied, modified, or distributed 12# except according to those terms. 13 14# This script uses the following Unicode tables: 15# - auxiliary/GraphemeBreakTest.txt 16# - auxiliary/WordBreakTest.txt 17# 18# Since this should not require frequent updates, we just store this 19# out-of-line and check the unicode.rs file into git. 20from __future__ import print_function 21 22import unicode, re, os, fileinput 23 24def load_test_data(f, optsplit=[]): 25 testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$") 26 27 unicode.fetch(f) 28 data = [] 29 for line in fileinput.input(os.path.basename(f)): 30 # lines that include a test start with the ÷ character 31 if len(line) < 2 or not line.startswith('÷'): 32 continue 33 34 m = testRe1.match(line) 35 if not m: 36 print("error: no match on line where test was expected: %s" % line) 37 continue 38 39 # process the characters in this test case 40 chars = process_split_string(m.group(1)) 41 # skip test case if it contains invalid characters (viz., surrogates) 42 if not chars: 43 continue 44 45 # now process test cases 46 (chars, info) = process_split_info(m.group(2), chars, optsplit) 47 48 # make sure that we have break info for each break! 49 assert len(chars) - 1 == len(info) 50 51 data.append((chars, info)) 52 53 return data 54 55def process_split_info(s, c, o): 56 outcs = [] 57 outis = [] 58 workcs = c.pop(0) 59 60 # are we on a × or a ÷? 61 isX = False 62 if s.startswith('×'): 63 isX = True 64 65 # find each instance of '(÷|×) [x.y] ' 66 while s: 67 # find the currently considered rule number 68 sInd = s.index('[') + 1 69 eInd = s.index(']') 70 71 # if it's '× [a.b]' where 'a.b' is in o, then 72 # we consider it a split even though it's not 73 # marked as one 74 # if it's ÷ then it's always a split 75 if not isX or s[sInd:eInd] in o: 76 outis.append(s[sInd:eInd]) 77 outcs.append(workcs) 78 workcs = c.pop(0) 79 else: 80 workcs.extend(c.pop(0)) 81 82 idx = 1 83 while idx < len(s): 84 if s[idx:].startswith('×'): 85 isX = True 86 break 87 if s[idx:].startswith('÷'): 88 isX = False 89 break 90 idx += 1 91 s = s[idx:] 92 93 outcs.append(workcs) 94 return (outcs, outis) 95 96def process_split_string(s): 97 outls = [] 98 workls = [] 99 100 inls = s.split() 101 102 for i in inls: 103 if i == '÷' or i == '×': 104 outls.append(workls) 105 workls = [] 106 continue 107 108 ival = int(i,16) 109 110 if unicode.is_surrogate(ival): 111 return [] 112 113 workls.append(ival) 114 115 if workls: 116 outls.append(workls) 117 118 return outls 119 120def showfun(x): 121 outstr = '("' 122 for c in x[0]: 123 outstr += "\\u{%x}" % c 124 outstr += '",&[' 125 xfirst = True 126 for xx in x[1:]: 127 if not xfirst: 128 outstr += '],&[' 129 xfirst = False 130 sfirst = True 131 for sp in xx: 132 if not sfirst: 133 outstr += ',' 134 sfirst = False 135 outstr += '"' 136 for c in sp: 137 outstr += "\\u{%x}" % c 138 outstr += '"' 139 outstr += '])' 140 return outstr 141 142def create_grapheme_data(f): 143 # rules 9.1 and 9.2 are for extended graphemes only 144 optsplits = ['9.1','9.2'] 145 d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits) 146 147 test_same = [] 148 test_diff = [] 149 150 for (c, i) in d: 151 allchars = [cn for s in c for cn in s] 152 extgraphs = [] 153 extwork = [] 154 155 extwork.extend(c[0]) 156 for n in range(0,len(i)): 157 if i[n] in optsplits: 158 extwork.extend(c[n+1]) 159 else: 160 extgraphs.append(extwork) 161 extwork = [] 162 extwork.extend(c[n+1]) 163 164 # these are the extended grapheme clusters 165 extgraphs.append(extwork) 166 167 if extgraphs == c: 168 test_same.append((allchars, c)) 169 else: 170 test_diff.append((allchars, extgraphs, c)) 171 172 stype = "&'static [(&'static str, &'static [&'static str])]" 173 dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]" 174 f.write(" // official Unicode test data\n") 175 f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER) 176 unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True) 177 unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True) 178 179def create_words_data(f): 180 d = load_test_data("auxiliary/WordBreakTest.txt") 181 182 test = [] 183 184 for (c, i) in d: 185 allchars = [cn for s in c for cn in s] 186 test.append((allchars, c)) 187 188 wtype = "&'static [(&'static str, &'static [&'static str])]" 189 f.write(" // official Unicode test data\n") 190 f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER) 191 unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True) 192 193def create_sentence_data(f): 194 d = load_test_data("auxiliary/SentenceBreakTest.txt") 195 196 test = [] 197 198 for (c, i) in d: 199 allchars = [cn for s in c for cn in s] 200 test.append((allchars, c)) 201 202 wtype = "&'static [(&'static str, &'static [&'static str])]" 203 f.write(" // official Unicode test data\n") 204 f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER) 205 unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True) 206 207if __name__ == "__main__": 208 with open("testdata.rs", "w") as rf: 209 rf.write(unicode.preamble) 210 create_grapheme_data(rf) 211 create_words_data(rf) 212 create_sentence_data(rf) 213