#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import urllib.request
from itertools import batched  # requires Python 3.12+

UNICODE_VERSION = "16.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT
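
# Illustrative sketch (not used by this script): the arithmetic Hangul
# decomposition these constants support. The generator deliberately skips
# Hangul syllables so the consumer can decompose them programmatically; the
# L/V/T base values below are the standard ones from the same chapter.
def _hangul_decomposition_example(s):
    L_BASE, V_BASE, T_BASE = 0x1100, 0x1161, 0x11A7
    assert S_BASE <= s < S_BASE + S_COUNT
    s_index = s - S_BASE
    l_part = L_BASE + s_index // (V_COUNT * T_COUNT)
    v_part = V_BASE + (s_index % (V_COUNT * T_COUNT)) // T_COUNT
    t_part = T_BASE + s_index % T_COUNT
    # A trailing-consonant index of 0 means the syllable is an LV pair only.
    return [l_part, v_part] if t_part == T_BASE else [l_part, v_part, t_part]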

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        self.cjk_compat_variants_fully_decomp = {}
        self._load_cjk_compat_ideograph_variants()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)
        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.name_to_char_int = {}
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []
        self.general_category_public_assigned = []

        assigned_start = 0
        prev_char_int = -1
        prev_name = ""

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            name = pieces[1].strip()
            self.name_to_char_int[name] = char_int

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

            assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
            if category not in ['Co', 'Cs']:
                if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
                    self.general_category_public_assigned.append((assigned_start, prev_char_int))
                    assigned_start = char_int
                prev_char_int = char_int
                prev_name = name

        self.general_category_public_assigned.append((assigned_start, prev_char_int))

    def _load_cjk_compat_ideograph_variants(self):
        for line in self._fetch("StandardizedVariants.txt").splitlines():
            strip_comments = line.split('#', 1)[0].strip()
            if not strip_comments:
                continue

            variation_sequence, description, differences = strip_comments.split(';')
            description = description.strip()

            # Don't use variations that only apply in particular shaping environments.
            if differences:
                continue

            # Look for entries where the description field is a codepoint name.
            if description not in self.name_to_char_int:
                continue

            # Only consider the CJK Compatibility Ideographs.
            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
                continue

            char_int = self.name_to_char_int[description]

            assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
            # If we ever need to handle Hangul here, we'll need to handle it separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
            for c in cjk_compat_variant_parts:
                assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size.  For these precomputed tables, we note that
        1) the set of characters with a canonical decomposition is a subset of
        those with a compatibility decomposition, and 2) the two decompositions
        mostly agree on that intersection.  Therefore, we don't store entries
        in the compatibility table for characters that decompose the same way
        under canonical decomposition.

            Decomposition table stats:
            Canonical decomp: 2060 chars => 3085 decomposed chars
            Compatible decomp: 3662 chars => 5440 decomposed chars
            Canonical fully decomp: 2060 chars => 3404 decomposed chars
            Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that the decomposition code is very simple and easy to
        inline, at a mild code-size cost.
        """
        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree.  When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp
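
    # Illustrative sketch (not used by the generator): how a consumer of the
    # two tables computed above would fully decompose a single non-Hangul code
    # point. NFD consults only the canonical table; NFKD checks the
    # compatibility table first and falls back to the canonical one, which is
    # why entries that agree are deleted from the compatibility table above.
    def _example_full_decomposition(self, char_int, compatible):
        if compatible:
            decomp = self.compat_fully_decomp.get(char_int) or self.canon_fully_decomp.get(char_int)
        else:
            decomp = self.canon_fully_decomp.get(char_int)
        return decomp if decomp is not None else [char_int]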

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

hexify = lambda c: '{:04X}'.format(c)

# Test whether `first` and `last` are corresponding "<..., First>" and
# "<..., Last>" markers.
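# For example, UnicodeData.txt encodes large ranges as pairs of entries such as
# "<CJK Ideograph, First>" / "<CJK Ideograph, Last>"; this helper accepts that
# pair and rejects markers whose range names differ.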
def is_first_and_last(first, last):
    if not first.startswith('<') or not first.endswith(', First>'):
        return False
    if not last.startswith('<') or not last.endswith(', Last>'):
        return False
    return first[1:-8] == last[1:-7]

# Emit the salt and key/value tables of a minimal perfect hash over `d`.
# Note: this writes to the module-level `out` handle opened in __main__ rather
# than taking it as a parameter like the other gen_* functions.
def gen_mph_data(name, d, kv_type, kv_callback, kv_row_width):
    (salt, keys) = minimal_perfect_hash(d)
    out.write(f"\npub(crate) const {name.upper()}_SALT: &[u16] = &[\n")
    for s_row in batched(salt, 13):
        out.write("   ")
        for s in s_row:
            out.write(f" 0x{s:03X},")
        out.write("\n")
    out.write("];\n")
    out.write(f"pub(crate) const {name.upper()}_KV: &[{kv_type}] = &[\n")
    for k_row in batched(keys, kv_row_width):
        out.write("   ")
        for k in k_row:
            out.write(f" {kv_callback(k)},")
        out.write("\n")
    out.write("];\n")

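# Each canonical_combining_class entry packs the code point into the upper bits
# of a u32 and its combining class into the low 8 bits. For example, U+0300
# COMBINING GRAVE ACCENT (class 230) is emitted as 0x00300E6.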
def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: f"0x{int(combining_classes[k]) | (k << 8):07X}", 8)

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: f"(0x{k:08X}, '\\u{{{table[k]:06X}}}')", 1)

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 or c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

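# For each decomposition table, the decomposed characters are flattened into a
# single *_DECOMPOSED_CHARS array, and each MPH key/value entry packs
# (code point, (offset into that array, length)). For example, a hypothetical
# entry (0x000C0, (0x000, 0x2)) would say that U+00C0 decomposes to the two
# chars stored at offset 0.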
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
    for table, name in tables:
        offsets = {}
        offset = 0
        out.write("pub(crate) const %s_DECOMPOSED_CHARS: &[char] = &[\n" % name.upper())
        for k, v in table.items():
            offsets[k] = offset
            offset += len(v)
            for c in v:
                out.write("    '\\u{%s}',\n" % hexify(c))
        # The largest offset must fit in a u16.
        assert offset < 65536
        out.write("];\n")
        gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
            lambda k: f"(0x{k:05X}, (0x{offsets[k]:03X}, 0x{len(table[k]):X}))", 1)

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'..='\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("\n#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:05X}'.format(k), 10)

def gen_public_assigned(general_category_public_assigned, out):
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write("    match c {\n")

    start = True
    for first, last in general_category_public_assigned:
        if start:
            out.write("        ")
            start = False
        else:
            out.write("\n        | ")
        if first == last:
            out.write("'\\u{%s}'" % hexify(first))
        else:
            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
    out.write(" => true,\n")

    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: f"0x{int(trailing[k]) | (k << 8):07X}", 8)

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Guaranteed to be less than n, since y < 2**32 and so (y * n) >> 32 < n.
def my_hash(x, salt, n):
    # This hash is based on the theory that multiplication is efficient.
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there are
                # a few things that could be done. First, the hash function could be
                # tweaked. Second, the bucket order could be scrambled (especially the
                # singletons). Right now, the buckets are sorted, which has the advantage
                # of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization could be
                # applied (give the direct address for singleton buckets, rather than
                # relying on a rehash). That is definitely the more standard approach in
                # the minimal perfect hashing literature, but in testing the branch was a
                # significant slowdown.
                exit(1)
    return (salts, keys)
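
# Illustrative sketch (not used by the generator): how the emitted SALT/KV pair
# is intended to be queried. By construction above, for every key in `d`:
# hash once with salt 0 to pick a bucket's salt, rehash with that salt to find
# the slot, then confirm the stored key matches (otherwise the key is absent).
def _example_mph_lookup(salts, keys, x):
    n = len(salts)
    salt = salts[my_hash(x, 0, n)]
    idx = my_hash(x, salt, n)
    return idx if keys[idx] == x else None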

if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n" % version)

        gen_combining_class(data.combining_classes, out)

        gen_composition_table(data.canon_comp, out)

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)

        gen_public_assigned(data.general_category_public_assigned, out)

        gen_nfc_qc(data.norm_props, out)

        gen_nfkc_qc(data.norm_props, out)

        gen_nfd_qc(data.norm_props, out)

        gen_nfkd_qc(data.norm_props, out)

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)