#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just run this script
# out-of-line and check the generated tables.rs and normalization_tests.rs
# files into git.
import collections
import urllib.request
from itertools import batched  # requires Python 3.12+

UNICODE_VERSION = "16.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        self.cjk_compat_variants_fully_decomp = {}
        self._load_cjk_compat_ideograph_variants()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)
        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.name_to_char_int = {}
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []
        self.general_category_public_assigned = []

        assigned_start = 0
        prev_char_int = -1
        prev_name = ""

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            name = name.strip()
            self.name_to_char_int[name] = char_int

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

            assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
            if category not in ['Co', 'Cs']:
                if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
                    self.general_category_public_assigned.append((assigned_start, prev_char_int))
                    assigned_start = char_int
                prev_char_int = char_int
                prev_name = name

        self.general_category_public_assigned.append((assigned_start, prev_char_int))

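    # Illustrative note: UnicodeData.txt abbreviates large assigned blocks as a pair of
    # range markers, e.g. a "<CJK Ideograph, First>" line immediately followed by a
    # matching "<CJK Ideograph, Last>" line. is_first_and_last() (defined below) detects
    # such pairs so that the loop above treats the whole block as one contiguous assigned
    # range instead of recording a gap at the jump in code points.
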
    def _load_cjk_compat_ideograph_variants(self):
        for line in self._fetch("StandardizedVariants.txt").splitlines():
            strip_comments = line.split('#', 1)[0].strip()
            if not strip_comments:
                continue

            variation_sequence, description, differences = strip_comments.split(';')
            description = description.strip()

            # Don't use variations that only apply in particular shaping environments.
            if differences:
                continue

            # Look for entries where the description field is a codepoint name.
            if description not in self.name_to_char_int:
                continue

            # Only consider the CJK Compatibility Ideographs.
            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
                continue

            char_int = self.name_to_char_int[description]

            assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
            # If we ever need to handle Hangul here, we'll need to handle it separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
            for c in cjk_compat_variant_parts:
                assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

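    # Illustrative sketch (not used by generation): the table built above maps a
    # canonical decomposition pair back to its primary composite, so recomposition
    # during NFC is a single probe. The method name is hypothetical.
    def _compose_pair_example(self, first, second):
        # Returns the composed code point, or None when the pair does not compose
        # (for example when the would-be composite is in Full_Composition_Exclusion).
        return self.canon_comp.get((first, second))
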
    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size. Then, for these precomputed tables, we
        note that 1) canonical decomposition is a subset of compatibility
        decomposition and 2) they mostly agree on their intersection.
        Therefore, we don't store entries in the compatibility table for
        characters that decompose the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that decomposition code is very simple and easy to inline
        at a mild code size cost.
        """
        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

hexify = lambda c: '{:04X}'.format(c)

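# Illustrative sketch (not used by this script): Hangul syllables are skipped in the
# generated tables because their canonical decomposition is pure arithmetic per
# Unicode Section 3.12, so the runtime side is expected to do roughly the following.
# The function name and the jamo base constants 0x1100 / 0x1161 / 0x11A7 are written
# out here only for illustration.
def _hangul_decompose_example(s):
    # `s` is a precomposed syllable code point in [S_BASE, S_BASE + S_COUNT).
    s_index = s - S_BASE
    l = 0x1100 + s_index // (V_COUNT * T_COUNT)               # leading consonant (choseong)
    v = 0x1161 + (s_index % (V_COUNT * T_COUNT)) // T_COUNT   # vowel (jungseong)
    t = s_index % T_COUNT                                     # trailing consonant index, 0 = none
    return [l, v] if t == 0 else [l, v, 0x11A7 + t]
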
# Test whether `first` and `last` are corresponding "<..., First>" and
# "<..., Last>" markers.
def is_first_and_last(first, last):
    if not first.startswith('<') or not first.endswith(', First>'):
        return False
    if not last.startswith('<') or not last.endswith(', Last>'):
        return False
    return first[1:-8] == last[1:-7]

def gen_mph_data(name, d, kv_type, kv_callback, kv_row_width):
    # Note: writes to the module-level `out` file opened in __main__.
    (salt, keys) = minimal_perfect_hash(d)
    out.write(f"\npub(crate) const {name.upper()}_SALT: &[u16] = &[\n")
    for s_row in batched(salt, 13):
        out.write(" ")
        for s in s_row:
            out.write(f" 0x{s:03X},")
        out.write("\n")
    out.write("];\n")
    out.write(f"pub(crate) const {name.upper()}_KV: &[{kv_type}] = &[\n")
    for k_row in batched(keys, kv_row_width):
        out.write(" ")
        for k in k_row:
            out.write(f" {kv_callback(k)},")
        out.write("\n")
    out.write("];\n")

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
                 lambda k: f"0x{int(combining_classes[k]) | (k << 8):07X}", 8)

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
                 lambda k: f"(0x{k:08X}, '\\u{{{table[k]:06X}}}')", 1)

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 or c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
    for table, name in tables:
        offsets = {}
        offset = 0
        out.write("pub(crate) const %s_DECOMPOSED_CHARS: &[char] = &[\n" % name.upper())
        for k, v in table.items():
            offsets[k] = offset
            offset += len(v)
            for c in v:
                out.write("    '\\u{%s}',\n" % hexify(c))
        # The largest offset must fit in a u16.
        assert offset < 65536
        out.write("];\n")
        gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
                     lambda k: f"(0x{k:05X}, (0x{offsets[k]:03X}, 0x{len(table[k]):X}))", 1)

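# Illustrative sketch (not used by generation): each *_DECOMPOSED_CHARS array emitted
# above is one flat char buffer, and the matching MPH table maps a code point to an
# (offset, len) pair into it, so a consumer reconstructs a decomposition roughly like
# this. The function name is hypothetical.
def read_decomposition_example(decomposed_chars, offset, length):
    # `decomposed_chars` mirrors the generated flat array; offset/length come from the KV entry.
    return decomposed_chars[offset:offset + length]
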
def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'..='\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("\n#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
                 lambda k: '0x{:05X}'.format(k), 10)

def gen_public_assigned(general_category_public_assigned, out):
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write("    match c {\n")

    start = True
    for first, last in general_category_public_assigned:
        if start:
            out.write("        ")
            start = False
        else:
            out.write("\n        | ")
        if first == last:
            out.write("'\\u{%s}'" % hexify(first))
        else:
            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
    out.write(" => true,\n")

    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")

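# Illustrative sketch (not used by generation): the qc_* functions emitted by the
# gen_*_qc generators above encode the NFC_QC/NFKC_QC/NFD_QC/NFKD_QC properties from
# DerivedNormalizationProps.txt: a code point listed with value 'N' maps to No, 'M'
# to Maybe, and every unlisted code point is Yes. In Python terms, roughly:
def qc_value_example(char_int, prop_table):
    # `prop_table` is one of the lists built by _load_norm_props (hex-string bounds).
    for low, high, data in prop_table:
        lo, hi = int(low, 16), int(high or low, 16)
        if lo <= char_int <= hi:
            return "No" if data == 'N' else "Maybe"
    return "Yes"
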
def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
                 lambda k: f"0x{int(trailing[k]) | (k << 8):07X}", 8)

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# The result is guaranteed to be less than n.
def my_hash(x, salt, n):
    # This hash is based on the theory that multiplication is efficient.
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there are
                # a few things that could be done. First, the hash function could be
                # tweaked. Second, the bucket order could be scrambled (especially the
                # singletons). Right now, the buckets are sorted, which has the advantage
                # of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization could be
                # applied (give the direct address for singleton buckets, rather than
                # relying on a rehash). That is definitely the more standard approach in
                # the minimal perfect hashing literature, but in testing the branch was a
                # significant slowdown.
                exit(1)
    return (salts, keys)

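# Illustrative sketch (not used by generation): how the (salts, keys) pair produced
# above is meant to be probed. The tables emitted by gen_mph_data follow the same
# two-step scheme: hash with salt 0 to pick a bucket's salt, then rehash with that
# salt to find the single candidate slot. The function name is hypothetical.
def mph_lookup_example(key, salts, keys):
    n = len(salts)
    salt = salts[my_hash(key, 0, n)]
    candidate = keys[my_hash(key, salt, n)]
    # Membership test; the real KV tables also pack a value alongside each key.
    return candidate == key
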
if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n" % version)

        gen_combining_class(data.combining_classes, out)

        gen_composition_table(data.canon_comp, out)

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)

        gen_public_assigned(data.general_category_public_assigned, out)

        gen_nfc_qc(data.norm_props, out)

        gen_nfkc_qc(data.norm_props, out)

        gen_nfd_qc(data.norm_props, out)

        gen_nfkd_qc(data.norm_props, out)

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)