1#!/usr/bin/env python3
2#
3# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT
4# file at the top-level directory of this distribution and at
5# http://rust-lang.org/COPYRIGHT.
6#
7# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10# option. This file may not be copied, modified, or distributed
11# except according to those terms.
12
13# This script uses the following Unicode tables:
14#
15# - DerivedCoreProperties.txt
16# - EastAsianWidth.txt
17# - HangulSyllableType.txt
18# - NormalizationTest.txt (for tests only)
19# - PropList.txt
20# - ReadMe.txt
21# - UnicodeData.txt
22# - auxiliary/GraphemeBreakProperty.txt
23# - emoji/emoji-data.txt
24# - emoji/emoji-variation-sequences.txt
25# - extracted/DerivedGeneralCategory.txt
26#
27# Since this should not require frequent updates, we just store this
28# out-of-line and check the generated module into git.
29
30import enum
31import math
32import operator
33import os
34import re
35import sys
36import urllib.request
37from collections import defaultdict
38from itertools import batched
39from typing import Callable, Iterable
40
UNICODE_VERSION = "15.1.0"
"""The version of the Unicode data files to download."""

NUM_CODEPOINTS = 0x110000
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""

# Evaluates to 21: ceil(log2(0x10FFFF)).
MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
"""The maximum number of bits required to represent a Unicode codepoint."""
50
class OffsetType(enum.IntEnum):
    """Represents the data type of a lookup table's offsets. Each variant's value represents the
    number of bits required to represent that variant's type.

    NOTE(review): the values appear to double as pack factors (8 // value entries per byte)
    in the table-emission code — confirm against the `Table` implementation."""

    U2 = 2
    """Offsets are 2-bit unsigned integers, packed four-per-byte."""
    U4 = 4
    """Offsets are 4-bit unsigned integers, packed two-per-byte."""
    U8 = 8
    """Each offset is a single byte (u8)."""
62
MODULE_PATH = "../src/tables.rs"
"""The path of the emitted Rust module (relative to the working directory)"""

TABLE_SPLITS = [7, 13]
"""The splits between the bits of the codepoint used to index each subtable.
Adjust these values to change the sizes of the subtables"""

# Type aliases for readability; both are plain `int`s at runtime.
Codepoint = int
BitPos = int
73
def fetch_open(filename: str, local_prefix: str = "", emoji: bool = False):
    """Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
    fetches it from `https://www.unicode.org/Public/`. Exits with code 1 on failure.

    `filename` may contain a subdirectory (e.g. "auxiliary/GraphemeBreakProperty.txt");
    only its basename is used for the local cache file.
    If `emoji` is set, the file is fetched from the emoji data tree, which is
    versioned by major.minor only (hence stripping the trailing ".0")."""
    basename = os.path.basename(filename)
    localname = os.path.join(local_prefix, basename)
    if not os.path.exists(localname):
        if emoji:
            prefix = f"emoji/{UNICODE_VERSION[:-2]}"
        else:
            prefix = f"{UNICODE_VERSION}/ucd"
        # BUG FIX: the URL previously ended in a literal "(unknown)" placeholder
        # instead of the requested file's path, so no download could ever succeed.
        urllib.request.urlretrieve(
            f"https://www.unicode.org/Public/{prefix}/{filename}",
            localname,
        )
    try:
        return open(localname, encoding="utf-8")
    except OSError:
        sys.stderr.write(f"cannot load {localname}")
        sys.exit(1)
94
95
def load_unicode_version() -> tuple[int, int, int]:
    """Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
    with fetch_open("ReadMe.txt") as readme:
        text = readme.read()
    # The readme states e.g. "for Version 15.1.0 of the Unicode Standard".
    found = re.search(r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode", text)
    major, minor, micro = found.groups()  # type: ignore
    return (int(major), int(minor), int(micro))
101
102
def load_property(filename: str, pattern: str, action: Callable[[int], None]):
    """Parses the UCD file `filename`, invoking `action(cp)` for every codepoint
    whose property value matches the regex fragment `pattern`."""
    with fetch_open(filename) as properties:
        # A UCD data line assigns a property either to one codepoint
        # or to an inclusive `low..high` hex range.
        one = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
        span = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")

        for line in properties.readlines():
            if m := one.match(line):
                bounds = (m.group(1), m.group(1))
            elif m := span.match(line):
                bounds = (m.group(1), m.group(2))
            else:
                continue
            lo = int(bounds[0], 16)
            hi = int(bounds[1], 16)
            for cp in range(lo, hi + 1):
                action(cp)
120
121
def to_sorted_ranges(iter: Iterable[Codepoint]) -> list[tuple[Codepoint, Codepoint]]:
    """Creates a sorted list of inclusive `(low, high)` ranges from an iterable of codepoints.

    Consecutive codepoints are coalesced into a single range; duplicate codepoints
    produce separate degenerate ranges (matching the original behavior)."""
    ret: list[tuple[Codepoint, Codepoint]] = []
    # `sorted` replaces the manual copy-then-`list.sort` idiom.
    for cp in sorted(iter):
        if ret and ret[-1][1] == cp - 1:
            # Contiguous with the previous range: extend it.
            ret[-1] = (ret[-1][0], cp)
        else:
            ret.append((cp, cp))
    return ret
133
134
class EastAsianWidth(enum.IntEnum):
    """Represents the width of a Unicode character according to UAX #11 (East Asian Width).
    All East Asian Width classes resolve into either
    `EastAsianWidth.NARROW`, `EastAsianWidth.WIDE`, or `EastAsianWidth.AMBIGUOUS`.
    """

    NARROW = 1
    """ One column wide. """
    WIDE = 2
    """ Two columns wide. """
    AMBIGUOUS = 3
    """ Two columns wide in a CJK context. One column wide in all other contexts. """
147
148
class CharWidthInTable(enum.IntEnum):
    """Represents the width of a Unicode character
    as stored in the tables."""

    ZERO = 0  # zero columns wide
    ONE = 1  # one column wide
    TWO = 2  # two columns wide
    SPECIAL = 3  # width is context-dependent; resolved via `WidthState`
157
158
class WidthState(enum.IntEnum):
    """
    Width calculation proceeds according to a state machine.
    We iterate over the characters of the string from back to front;
    the next character encountered determines the transition to take.

    The integer values of these variants have special meaning:
    - Top bit: whether this is Vs16
    - 2nd from top: whether this is Vs15
    - 3rd bit from top: whether this is transparent to emoji/text presentation
      (if set, should also set 4th)
    - 4th bit: whether to set top bit on emoji presentation.
      If this is set but 3rd is not, the width mode is related to zwj sequences
    - 5th from top: whether this is unaffected by ligature-transparent
    - 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
      where no ZWJ has been encountered yet; encountering one flips this on"""

    # BASIC WIDTHS
    # (Values >= 0x1_0000 are never carried between characters; see `is_carried`.)

    ZERO = 0x1_0000
    "Zero columns wide."

    NARROW = 0x1_0001
    "One column wide."

    WIDE = 0x1_0002
    "Two columns wide."

    THREE = 0x1_0003
    "Three columns wide."

    # \r\n
    LINE_FEED = 0b0000_0000_0000_0001
    "\\n (CRLF has width 1)"

    # EMOJI

    # Emoji skintone modifiers
    EMOJI_MODIFIER = 0b0000_0000_0000_0010
    "`Emoji_Modifier`"

    # Emoji ZWJ sequences

    REGIONAL_INDICATOR = 0b0000_0000_0000_0011
    "`Regional_Indicator`"

    SEVERAL_REGIONAL_INDICATOR = 0b0000_0000_0000_0100
    "At least two `Regional_Indicator` in sequence"

    EMOJI_PRESENTATION = 0b0000_0000_0000_0101
    "`Emoji_Presentation`"

    ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0110
    "\\u200D `Emoji_Presentation`"

    VS16_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0110
    "\\uFE0F \\u200D `Emoji_Presentation`"

    KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0111
    "\\u20E3 \\u200D `Emoji_Presentation`"

    VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0111
    "\\uFE0F \\u20E3 \\u200D `Emoji_Presentation`"

    REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1001
    "`Regional_Indicator` \\u200D `Emoji_Presentation`"

    EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1010
    "(`Regional_Indicator` `Regional_Indicator`)+ \\u200D `Emoji_Presentation`"

    ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1011
    "(`Regional_Indicator` `Regional_Indicator`)+ `Regional_Indicator` \\u200D `Emoji_Presentation`"

    # Emoji tag sequences: count the tag characters seen so far
    # (digits D1-D3, letters A1-A6) before the terminating TAG END.

    TAG_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0000
    "\\uE007F \\u200D `Emoji_Presentation`"

    TAG_D1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0001
    "\\uE0030..=\\uE0039 \\uE007F \\u200D `Emoji_Presentation`"

    TAG_D2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0010
    "(\\uE0030..=\\uE0039){2} \\uE007F \\u200D `Emoji_Presentation`"

    TAG_D3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0011
    "(\\uE0030..=\\uE0039){3} \\uE007F \\u200D `Emoji_Presentation`"

    TAG_A1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1001
    "\\uE0061..=\\uE007A \\uE007F \\u200D `Emoji_Presentation`"

    TAG_A2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1010
    "(\\uE0061..=\\uE007A){2} \\uE007F \\u200D `Emoji_Presentation`"

    TAG_A3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1011
    "(\\uE0061..=\\uE007A){3} \\uE007F \\u200D `Emoji_Presentation`"

    TAG_A4_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1100
    "(\\uE0061..=\\uE007A){4} \\uE007F \\u200D `Emoji_Presentation`"

    TAG_A5_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1101
    "(\\uE0061..=\\uE007A){5} \\uE007F \\u200D `Emoji_Presentation`"

    TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
    "(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`"

    # VARIATION SELECTORS

    # Text presentation sequences (not CJK)
    VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
    "\\uFE0E (text presentation sequences)"

    # Emoji presentation sequences
    VARIATION_SELECTOR_16 = 0b1000_0000_0000_0000
    "\\uFE0F (emoji presentation sequences)"

    # ARABIC LAM ALEF

    JOINING_GROUP_ALEF = 0b0011_0000_1111_1111
    "Joining_Group=Alef (Arabic Lam-Alef ligature)"

    # COMBINING SOLIDUS (CJK only)

    COMBINING_LONG_SOLIDUS_OVERLAY = 0b0011_1100_1111_1111
    "\\u0338 (CJK only, makes <, =, > width 2)"

    # SOLIDUS + ALEF (solidus is Joining_Type=Transparent)
    SOLIDUS_OVERLAY_ALEF = 0b0011_1000_1111_1111
    "\\u0338 followed by Joining_Group=Alef"

    # SCRIPT ZWJ LIGATURES

    # Hebrew alef lamed

    HEBREW_LETTER_LAMED = 0b0011_1000_0000_0000
    "\\u05DC (Alef-ZWJ-Lamed ligature)"

    ZWJ_HEBREW_LETTER_LAMED = 0b0011_1100_0000_0000
    "\\u200D\\u05DC (Alef-ZWJ-Lamed ligature)"

    # Buginese <a -i> ya

    BUGINESE_LETTER_YA = 0b0011_1000_0000_0001
    "\\u1A10 (<a, -i> + ya ligature)"

    ZWJ_BUGINESE_LETTER_YA = 0b0011_1100_0000_0001
    "\\u200D\\u1A10 (<a, -i> + ya ligature)"

    BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA = 0b0011_1100_0000_0010
    "\\u1A17\\u200D\\u1A10 (<a, -i> + ya ligature)"

    # Tifinagh bi-consonants

    TIFINAGH_CONSONANT = 0b0011_1000_0000_0011
    "\\u2D31..=\\u2D65 or \\u2D6F (joined by ZWJ or \\u2D7F TIFINAGH CONSONANT JOINER)"

    ZWJ_TIFINAGH_CONSONANT = 0b0011_1100_0000_0011
    "ZWJ then \\u2D31..=\\u2D65 or \\u2D6F"

    TIFINAGH_JOINER_CONSONANT = 0b0011_1100_0000_0100
    "\\u2D7F then \\u2D31..=\\u2D65 or \\u2D6F"

    # Lisu tone letters
    LISU_TONE_LETTER_MYA_NA_JEU = 0b0011_1100_0000_0101
    "\\uA4FC or \\uA4FD (https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078)"

    # Old Turkic orkhon ec - orkhon i

    OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1000_0000_0110
    "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)"

    ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110
    "\\u200D\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)"

    # Khmer coeng signs

    KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111
    "\\u1780..=\\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF"

    def table_width(self) -> CharWidthInTable:
        "The width of a character as stored in the lookup tables."
        match self:
            case WidthState.ZERO:
                return CharWidthInTable.ZERO
            case WidthState.NARROW:
                return CharWidthInTable.ONE
            case WidthState.WIDE:
                return CharWidthInTable.TWO
            case _:
                # Everything else (including `THREE`) requires special handling.
                return CharWidthInTable.SPECIAL

    def is_carried(self) -> bool:
        "Whether this corresponds to a non-default `WidthInfo`."
        # Only the low 16 bits encode state carried between characters;
        # the basic widths live at 0x1_0000 and above.
        return int(self) <= 0xFFFF

    def width_alone(self) -> int:
        "The width of a character with this type when it appears alone."
        match self:
            case (
                WidthState.ZERO
                | WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
                | WidthState.VARIATION_SELECTOR_15
                | WidthState.VARIATION_SELECTOR_16
            ):
                return 0
            case (
                WidthState.WIDE
                | WidthState.EMOJI_MODIFIER
                | WidthState.EMOJI_PRESENTATION
            ):
                return 2
            case WidthState.THREE:
                return 3
            case _:
                return 1

    def is_cjk_only(self) -> bool:
        # States reachable only in East Asian (CJK) context (see `load_width_maps`:
        # U+0338 is only mapped to a special state in the `ea` table).
        return self in [
            WidthState.COMBINING_LONG_SOLIDUS_OVERLAY,
            WidthState.SOLIDUS_OVERLAY_ALEF,
        ]

    def is_non_cjk_only(self) -> bool:
        # VS15 (text presentation) is only mapped in the non-East-Asian table.
        return self == WidthState.VARIATION_SELECTOR_15
380
381
# Guard against two `WidthState` variants sharing a value. Iterating an `IntEnum`
# skips aliases, so comparing the value set against the *iterated* members (as the
# previous version did) is a tautology that can never fail; `__members__` includes
# aliases and therefore actually detects duplicates.
assert len({v.value for v in WidthState}) == len(WidthState.__members__)
383
384
def load_east_asian_widths() -> list[EastAsianWidth]:
    """Return a list of East Asian widths, indexed by codepoint.
    Widths are determined by fetching and parsing `EastAsianWidth.txt`.

    `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EastAsianWidth.NARROW`.

    `Wide` and `Fullwidth` characters are assigned `EastAsianWidth.WIDE`.

    `Ambiguous` characters are assigned `EastAsianWidth.AMBIGUOUS`."""

    with fetch_open("EastAsianWidth.txt") as eaw:
        # matches a width assignment for a single codepoint, i.e. "1F336;N  # ..."
        single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
        # matches a width assignment for a range of codepoints, i.e. "3001..3003;W  # ..."
        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
        # map between width category code and condensed width
        width_codes = {
            **{c: EastAsianWidth.NARROW for c in ["N", "Na", "H"]},
            **{c: EastAsianWidth.WIDE for c in ["W", "F"]},
            "A": EastAsianWidth.AMBIGUOUS,
        }

        width_map = []
        current = 0
        for line in eaw.readlines():
            raw_data = None  # (low, high, width)
            if match := single.match(line):
                raw_data = (match.group(1), match.group(1), match.group(2))
            elif match := multiple.match(line):
                raw_data = (match.group(1), match.group(2), match.group(3))
            else:
                continue
            low = int(raw_data[0], 16)
            high = int(raw_data[1], 16)
            width = width_codes[raw_data[2]]

            # The file lists ranges in ascending order, so we can fill gaps as we go.
            assert current <= high
            while current <= high:
                # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt.
                # All such codepoints are implicitly given Neutral width (resolves to narrow)
                width_map.append(EastAsianWidth.NARROW if current < low else width)
                current += 1

        while len(width_map) < NUM_CODEPOINTS:
            # Catch any leftover codepoints and assign them implicit Neutral/narrow width.
            width_map.append(EastAsianWidth.NARROW)

    # Ambiguous `Letter`s and `Modifier_Symbol`s are narrow
    # (NB: the "(:?" below is a harmless typo for "(?:"; it still matches the
    # intended general categories, at the cost of also accepting a stray ":".)
    load_property(
        "extracted/DerivedGeneralCategory.txt",
        r"(:?Lu|Ll|Lt|Lm|Lo|Sk)",
        lambda cp: (
            operator.setitem(width_map, cp, EastAsianWidth.NARROW)
            if width_map[cp] == EastAsianWidth.AMBIGUOUS
            else None
        ),
    )

    # GREEK ANO TELEIA: NFC decomposes to U+00B7 MIDDLE DOT
    width_map[0x0387] = EastAsianWidth.AMBIGUOUS

    # Canonical equivalence for symbols with stroke:
    # a precomposed character whose decomposition is `<base> U+0338` inherits
    # ambiguous width from its base, so equivalent sequences measure the same.
    with fetch_open("UnicodeData.txt") as udata:
        single = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-Z]+) 0338;")
        for line in udata.readlines():
            if match := single.match(line):
                composed = int(match.group(1), 16)
                decomposed = int(match.group(2), 16)
                if width_map[decomposed] == EastAsianWidth.AMBIGUOUS:
                    width_map[composed] = EastAsianWidth.AMBIGUOUS

    return width_map
457
458
def load_zero_widths() -> list[bool]:
    """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
    character. `c` is considered a zero-width character if

    - it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
    - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
    - or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
    - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
    """

    zw_map = [False] * NUM_CODEPOINTS

    # `Default_Ignorable_Code_Point`s also have 0 width:
    # https://www.unicode.org/faq/unsup_char.html#3
    # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
    #
    # `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
    # as well as a few `Mc` characters that need to be included so that
    # canonically equivalent sequences have the same width.
    load_property(
        "DerivedCoreProperties.txt",
        r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
        lambda cp: operator.setitem(zw_map, cp, True),
    )

    # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
    # as they canonically decompose to two characters with this property,
    # but they aren't.
    for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
        zw_map[c] = True

    # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
    # as zero-width. This matches the behavior of glibc `wcwidth`.
    #
    # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
    # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
    # into a single wide grapheme. So we treat vowel and trailing jamo as
    # 0-width, such that only the width of the leading jamo is counted
    # and the resulting grapheme has width 2.
    #
    # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
    load_property(
        "HangulSyllableType.txt",
        r"(?:V|T)",
        lambda cp: operator.setitem(zw_map, cp, True),
    )

    # Syriac abbreviation mark:
    # Zero-width `Prepended_Concatenation_Mark`
    zw_map[0x070F] = True

    # Some Arabic Prepended_Concatenation_Mark`s
    # https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G27820
    zw_map[0x0605] = True
    zw_map[0x0890] = True
    zw_map[0x0891] = True
    zw_map[0x08E2] = True

    # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]`
    gcb_prepend = set()
    load_property(
        "auxiliary/GraphemeBreakProperty.txt",
        "Prepend",
        lambda cp: gcb_prepend.add(cp),
    )
    # NOTE(review): `remove` raises `KeyError` if a `Prepended_Concatenation_Mark`
    # is ever not GCB=Prepend; this assumes the UCD guarantees the subset relation.
    load_property(
        "PropList.txt",
        "Prepended_Concatenation_Mark",
        lambda cp: gcb_prepend.remove(cp),
    )
    for cp in gcb_prepend:
        zw_map[cp] = True

    # HANGUL CHOSEONG FILLER
    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
    # (which are considered 0-width on their own) to form a composed Hangul syllable with
    # width 2. Therefore, we treat it as having width 2.
    zw_map[0x115F] = False

    # TIFINAGH CONSONANT JOINER
    # (invisible only when used to join two Tifinagh consonants)
    zw_map[0x2D7F] = False

    # DEVANAGARI CARET
    # https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
    zw_map[0xA8FA] = True

    return zw_map
548
549
def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
    """Load complete width table, including characters needing special handling.
    (Returns 2 tables, one for East Asian and one for not.)"""

    eaws = load_east_asian_widths()
    zws = load_zero_widths()

    not_ea = []
    ea = []

    # Zero width wins over East Asian width; otherwise resolve ambiguous
    # characters to narrow (non-EA table) or wide (EA table).
    for eaw, zw in zip(eaws, zws):
        if zw:
            not_ea.append(WidthState.ZERO)
            ea.append(WidthState.ZERO)
        else:
            if eaw == EastAsianWidth.WIDE:
                not_ea.append(WidthState.WIDE)
            else:
                not_ea.append(WidthState.NARROW)

            if eaw == EastAsianWidth.NARROW:
                ea.append(WidthState.NARROW)
            else:
                ea.append(WidthState.WIDE)

    # Joining_Group=Alef (Arabic Lam-Alef ligature)
    alef_joining = []
    load_property(
        "extracted/DerivedJoiningGroup.txt",
        "Alef",
        lambda cp: alef_joining.append(cp),
    )

    # Regional indicators
    regional_indicators = []
    load_property(
        "PropList.txt",
        "Regional_Indicator",
        lambda cp: regional_indicators.append(cp),
    )

    # Emoji modifiers
    emoji_modifiers = []
    load_property(
        "emoji/emoji-data.txt",
        "Emoji_Modifier",
        lambda cp: emoji_modifiers.append(cp),
    )

    # Default emoji presentation (for ZWJ sequences)
    emoji_presentation = []
    load_property(
        "emoji/emoji-data.txt",
        "Emoji_Presentation",
        lambda cp: emoji_presentation.append(cp),
    )

    # Overlay the special states onto both tables. Entries are applied in
    # order, so later rows override earlier ones. Note that `range` bounds
    # are half-open, so e.g. `range(0x1780, 0x1783)` covers U+1780..=U+1782;
    # codepoints in the gaps between sub-ranges keep their base width.
    for cps, width in [
        ([0x0A], WidthState.LINE_FEED),
        ([0x05DC], WidthState.HEBREW_LETTER_LAMED),
        (alef_joining, WidthState.JOINING_GROUP_ALEF),
        (range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
        (range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
        (range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
        (range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
        (range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
        (range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
        (
            [0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF],
            WidthState.KHMER_COENG_ELIGIBLE_LETTER,
        ),
        ([0x17A4], WidthState.WIDE),
        ([0x17D8], WidthState.THREE),
        ([0x1A10], WidthState.BUGINESE_LETTER_YA),
        (range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
        ([0x2D6F], WidthState.TIFINAGH_CONSONANT),
        ([0xA4FC], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
        ([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
        ([0xFE0F], WidthState.VARIATION_SELECTOR_16),
        ([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I),
        (emoji_presentation, WidthState.EMOJI_PRESENTATION),
        (emoji_modifiers, WidthState.EMOJI_MODIFIER),
        (regional_indicators, WidthState.REGIONAL_INDICATOR),
    ]:
        for cp in cps:
            not_ea[cp] = width
            ea[cp] = width

    # East-Asian only
    ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY

    # Not East Asian only
    not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15

    return (not_ea, ea)
645
646
def load_joining_group_lam() -> list[tuple[Codepoint, Codepoint]]:
    "Returns a list of character ranges with Joining_Group=Lam"
    lam_joining: list[Codepoint] = []
    load_property(
        "extracted/DerivedJoiningGroup.txt",
        "Lam",
        lam_joining.append,
    )
    return to_sorted_ranges(lam_joining)
657
658
def load_non_transparent_zero_widths(
    width_map: list[WidthState],
) -> list[tuple[Codepoint, Codepoint]]:
    "Returns a list of characters with zero width but not 'Joining_Type=Transparent'"

    # Collect every codepoint whose stand-alone width is zero...
    zero_widths = {
        cp for cp, width in enumerate(width_map) if width.width_alone() == 0
    }

    # ...then subtract the Joining_Type=Transparent ones.
    transparent: set[Codepoint] = set()
    load_property(
        "extracted/DerivedJoiningType.txt",
        "T",
        transparent.add,
    )

    return to_sorted_ranges(zero_widths - transparent)
676
677
def load_ligature_transparent() -> list[tuple[Codepoint, Codepoint]]:
    """Returns a list of character ranges corresponding to all combining marks that are also
    `Default_Ignorable_Code_Point`s, plus ZWJ. This is the set of characters that won't interrupt
    a ligature."""
    default_ignorables: set[Codepoint] = set()
    load_property(
        "DerivedCoreProperties.txt",
        "Default_Ignorable_Code_Point",
        default_ignorables.add,
    )

    combining_marks: set[Codepoint] = set()
    load_property(
        "extracted/DerivedGeneralCategory.txt",
        "(?:Mc|Mn|Me)",
        combining_marks.add,
    )

    # Default-ignorable combining marks, plus U+200D ZERO WIDTH JOINER.
    transparent = default_ignorables & combining_marks
    transparent.add(0x200D)

    return to_sorted_ranges(transparent)
700
701
def load_solidus_transparent(
    ligature_transparents: list[tuple[Codepoint, Codepoint]],
    cjk_width_map: list[WidthState],
) -> list[tuple[Codepoint, Codepoint]]:
    """Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s
    from above. Ranges matching ones in `ligature_transparent` exactly are excluded (for
    compression), so it needs to be checked also.
    """

    ccc_above_1 = set()
    load_property(
        "extracted/DerivedCombiningClass.txt",
        "(?:[2-9]|(?:[1-9][0-9]+))",
        lambda cp: ccc_above_1.add(cp),
    )

    for lo, hi in ligature_transparents:
        for cp in range(lo, hi + 1):
            ccc_above_1.add(cp)

    # Parse the decomposition mappings ONCE. (The previous version re-fetched and
    # re-parsed UnicodeData.txt on every fixed-point iteration below.)
    decompositions: list[tuple[Codepoint, list[Codepoint]]] = []
    with fetch_open("UnicodeData.txt") as udata:
        decomp_re = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-F ]+);")
        for line in udata.readlines():
            if match := decomp_re.match(line):
                composed = int(match.group(1), 16)
                decomposed = [int(c, 16) for c in match.group(2).split(" ")]
                decompositions.append((composed, decomposed))

    # Close the set over recursive decompositions: a composed character joins
    # the set once every character in its decomposition is in it. Iterate to
    # a fixed point since decompositions can chain.
    while True:
        num_chars = len(ccc_above_1)
        for composed, decomposed in decompositions:
            if all(c in ccc_above_1 for c in decomposed):
                ccc_above_1.add(composed)
        if len(ccc_above_1) == num_chars:
            break

    # Sanity check: apart from U+FE0F, nothing in this set may need special
    # handling in the CJK tables.
    for cp in ccc_above_1:
        if cp != 0xFE0F:
            assert (
                cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
            ), f"U+{cp:X}"

    # (Renamed from `sorted`/`range`, which shadowed the builtins.)
    ranges = to_sorted_ranges(ccc_above_1)
    return [r for r in ranges if r not in ligature_transparents]
746
747
def load_normalization_tests() -> list[tuple[str, str, str, str, str]]:
    """Parses `NormalizationTest.txt` into 5-tuples of strings, one per test line
    (each column is a space-separated list of hex scalar values)."""

    def decode(field: str) -> str:
        # Turn "0041 0301" into the corresponding string.
        return "".join(chr(int(cp, 16)) for cp in field.split(" "))

    line_re = re.compile(
        r"^([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);"
    )
    with fetch_open("NormalizationTest.txt") as normtests:
        tests = []
        for line in normtests.readlines():
            match = line_re.match(line)
            if match is None:
                continue
            tests.append(
                (
                    decode(match.group(1)),
                    decode(match.group(2)),
                    decode(match.group(3)),
                    decode(match.group(4)),
                    decode(match.group(5)),
                )
            )
        return tests
769
770
def make_special_ranges(
    width_map: list[WidthState],
) -> list[tuple[tuple[Codepoint, Codepoint], WidthState]]:
    "Assign ranges of characters to their special behavior (used in match)"
    ret = []
    can_merge_with_prev = False
    for cp, width in enumerate(width_map):
        if width == WidthState.EMOJI_PRESENTATION:
            # Emoji-presentation characters are excluded from the output and
            # break any run currently being accumulated.
            can_merge_with_prev = False
        elif width.table_width() == CharWidthInTable.SPECIAL:
            if can_merge_with_prev and ret[-1][1] == width:
                # Extend the previous range to cover this codepoint. Note the
                # flag is not reset by ordinary-width codepoints, so a range may
                # span intervening non-special characters — assumed harmless
                # because lookups consult the width table first; TODO confirm.
                ret[-1] = ((ret[-1][0][0], cp), width)
            else:
                ret.append(((cp, cp), width))
                can_merge_with_prev = True
    return ret
787
788
class Bucket:
    """A bucket groups codepoints together with the ordered list of their widths.
    Two buckets whose width lists agree on a common prefix can be merged via `try_extend`.
    """

    def __init__(self):
        """Creates an empty bucket."""
        self.entry_set = set()
        self.widths = []

    def append(self, codepoint: Codepoint, width: CharWidthInTable):
        """Adds a codepoint/width pair to the bucket, and appends `width` to the width list."""
        self.entry_set.add((codepoint, width))
        self.widths.append(width)

    def try_extend(self, attempt: "Bucket") -> bool:
        """If either `self` or `attempt`'s width list starts with the other bucket's width list,
        set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints
        into `self`, and return `True`. Otherwise, return `False`."""
        # Stable sort keeps (self, attempt) order when lengths tie, as before.
        shorter, longer = sorted((self.widths, attempt.widths), key=len)
        if longer[: len(shorter)] != shorter:
            return False
        self.entry_set.update(attempt.entry_set)
        self.widths = longer
        return True

    def entries(self) -> list[tuple[Codepoint, CharWidthInTable]]:
        """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
        return sorted(self.entry_set)

    def width(self) -> CharWidthInTable | None:
        """If all codepoints in this bucket have the same width, return that width; otherwise,
        return `None`."""
        if not self.widths:
            return None
        first = self.widths[0]
        return first if all(w == first for w in self.widths) else None
833
834
def make_buckets(
    entries: Iterable[tuple[int, CharWidthInTable]], low_bit: BitPos, cap_bit: BitPos
) -> list[Bucket]:
    """Partitions the `(Codepoint, CharWidthInTable)` tuples in `entries` into `Bucket`s. All
    codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
    same bucket. Returns a list of the buckets in increasing order of those bits."""
    num_bits = cap_bit - low_bit
    assert num_bits > 0
    mask = (1 << num_bits) - 1
    # One bucket per possible value of the selected bit field.
    buckets = [Bucket() for _ in range(1 << num_bits)]
    for codepoint, width in entries:
        index = (codepoint >> low_bit) & mask
        buckets[index].append(codepoint, width)
    return buckets
848
849
class Table:
    """Represents a lookup table. Each table contains a certain number of subtables; each
    subtable is indexed by a contiguous bit range of the codepoint and contains a list
    of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.)

    Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should
    be indexed by sub-table `i` in the next-level lookup table. The entries of this table are
    indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The
    key to compression is that two different buckets in two different sub-tables may have the
    same width list, which means that they can be merged into the same bucket.

    If no bucket contains two codepoints with different widths, calling `indices_to_widths` will
    discard the buckets and convert the entries into `EffectiveWidth` values."""

    def __init__(
        self,
        name: str,
        entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]],
        secondary_entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]],
        low_bit: BitPos,
        cap_bit: BitPos,
        offset_type: OffsetType,
        align: int,
        bytes_per_row: int | None = None,
        starting_indexed: list[Bucket] | None = None,
        cfged: bool = False,
    ):
        """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator
        in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`,
        and each table entry is represented in the format specified by `offset_type`. Asserts
        that this table is actually representable with `offset_type`.

        `starting_indexed` seeds the bucket list (so a secondary table can share buckets with a
        primary one); those seed buckets are stripped from `self.indexed` before returning.
        `secondary_entry_groups` are absorbed after the primary ones, with the split point
        recorded in `primary_len` / `primary_bucket_len`."""
        # `None` sentinel instead of a mutable `[]` default argument.
        if starting_indexed is None:
            starting_indexed = []
        starting_indexed_len = len(starting_indexed)
        self.name = name
        self.low_bit = low_bit
        self.cap_bit = cap_bit
        self.offset_type = offset_type
        self.entries: list[int] = []
        # Copy so that merging into `self.indexed` never mutates the caller's list.
        self.indexed: list[Bucket] = list(starting_indexed)
        self.align = align
        self.bytes_per_row = bytes_per_row
        self.cfged = cfged

        self._absorb(entry_groups)

        # Remember where the primary groups end so callers can split the table.
        self.primary_len = len(self.entries)
        self.primary_bucket_len = len(self.indexed)

        self._absorb(secondary_entry_groups)

        # Validate that every entry fits in `offset_type`.
        max_index = 1 << int(self.offset_type)
        for index in self.entries:
            assert (
                index < max_index
            ), f"entry {index} does not fit in a {int(self.offset_type)}-bit offset"

        # Drop the seed buckets: they belong to (and are emitted by) the other table.
        self.indexed = self.indexed[starting_indexed_len:]

    def _absorb(self, entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]]):
        """Bucket each group by `low_bit..cap_bit` and merge the resulting buckets into
        `self.indexed`, appending each bucket's index to `self.entries`. Buckets whose
        width lists are compatible (one a prefix of the other) are shared."""
        buckets: list[Bucket] = []
        for entries in entry_groups:
            buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

        for bucket in buckets:
            for i, existing in enumerate(self.indexed):
                if existing.try_extend(bucket):
                    self.entries.append(i)
                    break
            else:
                # No compatible bucket found; give it a fresh index.
                self.entries.append(len(self.indexed))
                self.indexed.append(bucket)

    def indices_to_widths(self):
        """Destructively converts the indices in this table to the `EffectiveWidth` values of
        their buckets. Assumes that no bucket contains codepoints with different widths.
        """
        self.entries = [int(self.indexed[i].width()) for i in self.entries]  # type: ignore
        del self.indexed

    def buckets(self):
        """Returns an iterator over this table's buckets."""
        return self.indexed

    def to_bytes(self) -> list[int]:
        """Returns this table's entries as a list of bytes. The bytes are formatted according to
        the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
        to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
        contain four packed 2-bit entries.

        Assumes `len(self.entries)` is a multiple of the number of entries per byte."""
        entries_per_byte = 8 // int(self.offset_type)
        byte_array = []
        for i in range(0, len(self.entries), entries_per_byte):
            byte = 0
            for j in range(0, entries_per_byte):
                # Entry 0 occupies the least-significant bits of the byte.
                byte |= self.entries[i + j] << (j * int(self.offset_type))
            byte_array.append(byte)
        return byte_array
952
953
def make_tables(
    width_map: list[WidthState],
    cjk_width_map: list[WidthState],
) -> list[Table]:
    """Build the multi-level width lookup tables from the per-codepoint width maps.

    Returns `[root, cjk_root, middle, leaves]`: the two root tables (non-CJK and
    CJK, the latter sharing buckets with the former), a shared middle table, and
    a shared leaf table holding the packed 2-bit widths."""

    default_entries = enumerate([w.table_width() for w in width_map])
    cjk_entries = enumerate([w.table_width() for w in cjk_width_map])

    # Top level: indexed by the codepoint bits above TABLE_SPLITS[1].
    root_table = Table(
        "WIDTH_ROOT",
        [default_entries],
        [],
        TABLE_SPLITS[1],
        MAX_CODEPOINT_BITS,
        OffsetType.U8,
        128,
    )

    # CJK variant of the root, seeded with the non-CJK buckets so the two
    # tables can share sub-tables in the next level.
    cjk_root_table = Table(
        "WIDTH_ROOT_CJK",
        [cjk_entries],
        [],
        TABLE_SPLITS[1],
        MAX_CODEPOINT_BITS,
        OffsetType.U8,
        128,
        starting_indexed=root_table.indexed,
        cfged=True,
    )

    # Middle level: indexed by bits TABLE_SPLITS[0]..TABLE_SPLITS[1].
    middle_table = Table(
        "WIDTH_MIDDLE",
        [bucket.entries() for bucket in root_table.buckets()],
        [bucket.entries() for bucket in cjk_root_table.buckets()],
        TABLE_SPLITS[0],
        TABLE_SPLITS[1],
        OffsetType.U8,
        2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]),
        bytes_per_row=2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]),
    )

    middle_buckets = middle_table.buckets()
    split = middle_table.primary_bucket_len

    # Leaf level: indexed by bits 0..TABLE_SPLITS[0], storing packed 2-bit widths.
    leaves_table = Table(
        "WIDTH_LEAVES",
        [bucket.entries() for bucket in middle_buckets[:split]],
        [bucket.entries() for bucket in middle_buckets[split:]],
        0,
        TABLE_SPLITS[0],
        OffsetType.U2,
        2 ** (TABLE_SPLITS[0] - 2),
        bytes_per_row=2 ** (TABLE_SPLITS[0] - 2),
    )

    return [root_table, cjk_root_table, middle_table, leaves_table]
1017
1018
def load_emoji_presentation_sequences() -> list[Codepoint]:
    """Returns the list of codepoints that can begin an emoji presentation
    sequence (i.e. may be followed by U+FE0F and rendered in emoji style)."""

    # Match all emoji presentation sequences
    # (one codepoint followed by U+FE0F, and labeled "emoji style")
    pattern = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style")
    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        return [
            int(match.group(1), 16)
            for line in sequences
            if (match := pattern.match(line))
        ]
1033
1034
def load_text_presentation_sequences() -> list[Codepoint]:
    """Returns a sorted list of codepoints whose width changes when part of a
    text presentation sequence (i.e. when followed by U+FE0E).

    A codepoint's width only changes if it defaults to emoji presentation, so
    the result is the intersection of the text-presentation-sequence starters
    with the `Emoji_Presentation` set, minus the "Enclosed Ideographic
    Supplement" block (which stays wide even in text presentation)."""

    text_presentation_seq_codepoints = set()
    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        # Match all text presentation sequences
        # (one codepoint followed by U+FE0E, and labeled "text style")
        sequence = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style")
        for line in sequences.readlines():
            if match := sequence.match(line):
                text_presentation_seq_codepoints.add(int(match.group(1), 16))

    default_emoji_codepoints = set()

    # Pass the set's bound `add` method directly instead of wrapping it in a lambda.
    load_property(
        "emoji/emoji-data.txt",
        "Emoji_Presentation",
        default_emoji_codepoints.add,
    )

    return sorted(
        cp
        for cp in text_presentation_seq_codepoints.intersection(
            default_emoji_codepoints
        )
        # "Enclosed Ideographic Supplement" block; wide even in text presentation
        if cp not in range(0x1F200, 0x1F300)
    )
1066
1067
def load_emoji_modifier_bases() -> list[Codepoint]:
    """Returns a sorted list of codepoints with the `Emoji_Modifier_Base`
    property (characters that an emoji skin-tone modifier can apply to).

    (The previous docstring was copy-pasted from
    `load_text_presentation_sequences` and described the wrong property.)"""

    bases: list[Codepoint] = []
    # Pass the list's bound `append` method directly instead of wrapping it in a lambda.
    load_property(
        "emoji/emoji-data.txt",
        "Emoji_Modifier_Base",
        bases.append,
    )
    return sorted(bases)
1080
1081
def make_presentation_sequence_table(
    seqs: list[Codepoint],
    lsb: int = 10,
) -> tuple[list[tuple[int, int]], list[list[int]]]:
    """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
    The first level is a match on all but the `lsb` LSB, the second level is a `2**lsb`-bit bitmap
    (stored as `2**(lsb - 3)` bytes) for those LSB.

    Returns `(indexes, leaves)`, where `indexes` maps each codepoint prefix to the position of
    its bitmap in `leaves`; identical bitmaps are shared (deduplicated)."""

    prefixes_dict = defaultdict(set)
    for cp in seqs:
        prefixes_dict[cp >> lsb].add(cp & (2**lsb - 1))

    # One bitmap leaf per distinct prefix, in prefix first-seen order.
    raw_leaves: list[list[int]] = []
    for cps in prefixes_dict.values():
        leaf = [0] * (2 ** (lsb - 3))
        for cp in cps:
            idx_in_leaf, bit_shift = divmod(cp, 8)
            leaf[idx_in_leaf] |= 1 << bit_shift
        raw_leaves.append(leaf)

    # Deduplicate leaves in a single pass (first occurrence wins), replacing the
    # previous quadratic `list.index`/`pop` cull. Output is identical: unique
    # leaves in first-occurrence order, indexes remapped to match.
    leaves: list[list[int]] = []
    first_index: dict[tuple[int, ...], int] = {}
    remapped: list[int] = []
    for leaf in raw_leaves:
        key = tuple(leaf)
        if key not in first_index:
            first_index[key] = len(leaves)
            leaves.append(leaf)
        remapped.append(first_index[key])

    indexes = [(msb, remapped[i]) for i, msb in enumerate(prefixes_dict.keys())]

    return (indexes, leaves)
1122
1123
def make_ranges_table(
    seqs: list[Codepoint],
) -> tuple[list[tuple[int, int]], list[list[tuple[int, int]]]]:
    """Generates 2-level lookup table for a binary property of a codepoint.
    First level is all but the last byte, second level is inclusive
    `(start, end)` ranges over the last byte.

    Returns `(indexes, leaves)`, where `indexes` maps each codepoint prefix to the
    position of its range list in `leaves`; identical range lists are shared.
    NOTE(review): range coalescing relies on `seqs` being sorted — all current
    callers sort first; confirm before adding new callers."""

    prefixes_dict = defaultdict(list)
    for cp in seqs:
        prefixes_dict[cp >> 8].append(cp & 0xFF)

    # One list of coalesced (start, end) ranges per distinct prefix,
    # in prefix first-seen order.
    raw_leaves: list[list[tuple[int, int]]] = []
    for cps in prefixes_dict.values():
        leaf: list[tuple[int, int]] = []
        for cp in cps:
            # Extend the previous range when this byte is contiguous with it.
            if len(leaf) > 0 and leaf[-1][1] == cp - 1:
                leaf[-1] = (leaf[-1][0], cp)
            else:
                leaf.append((cp, cp))
        raw_leaves.append(leaf)

    # Deduplicate leaves in a single pass (first occurrence wins), replacing the
    # previous quadratic `list.index`/`pop` cull. Output is identical: unique
    # leaves in first-occurrence order, indexes remapped to match.
    leaves: list[list[tuple[int, int]]] = []
    first_index: dict[tuple[tuple[int, int], ...], int] = {}
    remapped: list[int] = []
    for leaf in raw_leaves:
        key = tuple(leaf)
        if key not in first_index:
            first_index[key] = len(leaves)
            leaves.append(leaf)
        remapped.append(first_index[key])

    indexes = [(msb, remapped[i]) for i, msb in enumerate(prefixes_dict.keys())]

    return (indexes, leaves)
1165
1166
def lookup_fns(
    is_cjk: bool,
    special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]],
    joining_group_lam: list[tuple[Codepoint, Codepoint]],
) -> str:
    """Generate the Rust source for the hardcoded lookup/width functions
    (`lookup_width`, `single_char_width`, `width_in_str`, `str_width`), either
    in the default flavor or the CJK flavor (`is_cjk=True`, where ambiguous-width
    characters are wide and the functions get a `_cjk` suffix and a
    `#[cfg(feature = "cjk")]` attribute).

    `special_ranges` lists the codepoint ranges that need a non-table width or a
    carried `WidthInfo`; `joining_group_lam` lists the Lam codepoint ranges used
    for the Arabic Lam-Alef ligature match arm.

    Returns the generated Rust code as a string. The embedded Rust templates must
    stay in sync with the table structure produced by `make_tables`."""
    # Flavor-dependent substitutions spliced into the Rust templates below.
    if is_cjk:
        cfg = '#[cfg(feature = "cjk")]\n'
        cjk_lo = "_cjk"
        cjk_cap = "_CJK"
        ambig = "wide"
    else:
        cfg = ""
        cjk_lo = ""
        cjk_cap = ""
        ambig = "narrow"
    # `lookup_width`: three-level table walk; widths 0-2 are returned directly,
    # width 3 means "special" and falls through to the match generated below.
    s = f"""
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
/// consulting a multi-level lookup table.
///
/// # Maintenance
/// The tables themselves are autogenerated but this function is hardcoded. You should have
/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes.
{cfg}#[inline]
fn lookup_width{cjk_lo}(c: char) -> (u8, WidthInfo) {{
    let cp = c as usize;

    let t1_offset = WIDTH_ROOT{cjk_cap}.0[cp >> {TABLE_SPLITS[1]}];

    // Each sub-table in WIDTH_MIDDLE is 7 bits, and each stored entry is a byte,
    // so each sub-table is 128 bytes in size.
    // (Sub-tables are selected using the computed offset from the previous table.)
    let t2_offset = WIDTH_MIDDLE.0[usize::from(t1_offset)][cp >> {TABLE_SPLITS[0]} & 0x{(2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]) - 1):X}];

    // Each sub-table in WIDTH_LEAVES is 6 bits, but each stored entry is 2 bits.
    // This is accomplished by packing four stored entries into one byte.
    // So each sub-table is 2**(7-2) == 32 bytes in size.
    // Since this is the last table, each entry represents an encoded width.
    let packed_widths = WIDTH_LEAVES.0[usize::from(t2_offset)][cp >> 2 & 0x{(2 ** (TABLE_SPLITS[0] - 2) - 1):X}];

    // Extract the packed width
    let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;

    if width < 3 {{
        (width, WidthInfo::DEFAULT)
    }} else {{
        match c {{
"""

    # Emit one match arm per special range, carrying the `WidthInfo` only when
    # the state must propagate to the preceding character.
    for (lo, hi), width in special_ranges:
        s += f"            '\\u{{{lo:X}}}'"
        if hi != lo:
            s += f"..='\\u{{{hi:X}}}'"
        if width.is_carried():
            width_info = width.name
        else:
            width_info = "DEFAULT"
        s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"

    # Close `lookup_width`, then emit `single_char_width` (control-character aware).
    s += f"""            _ => (2, WidthInfo::EMOJI_PRESENTATION),
        }}
    }}
}}

/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
/// `None` if `c` is a control character.
/// Ambiguous width characters are treated as {ambig}.
{cfg}#[inline]
pub fn single_char_width{cjk_lo}(c: char) -> Option<usize> {{
    if c < '\\u{{7F}}' {{
        if c >= '\\u{{20}}' {{
            // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
            Some(1)
        }} else {{
            // U+0000 to U+0020 (exclusive) are control codes
            None
        }}
    }} else if c >= '\\u{{A0}}' {{
        // No characters >= U+00A0 are control codes, so we can consult the lookup tables
        Some(lookup_width{cjk_lo}(c).0.into())
    }} else {{
        // U+007F to U+00A0 (exclusive) are control codes
        None
    }}
}}

/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
/// Ambiguous width characters are treated as {ambig}.
{cfg}#[inline]
fn width_in_str{cjk_lo}(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {{
    if next_info.is_emoji_presentation() {{
        if starts_emoji_presentation_seq(c) {{
            let width = if next_info.is_zwj_emoji_presentation() {{
                0
            }} else {{
                2
            }};
            return (width, WidthInfo::EMOJI_PRESENTATION);
        }} else {{
            next_info = next_info.unset_emoji_presentation();
        }}
    }}"""

    # CJK-only: '<', '=', '>' combined with U+0338 (long solidus overlay) are wide.
    if is_cjk:
        s += """
    if (matches!(
        next_info,
        WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY | WidthInfo::SOLIDUS_OVERLAY_ALEF
    ) && matches!(c, '<' | '=' | '>'))
    {
        return (2, WidthInfo::DEFAULT);
    }"""

    # Shared ASCII/control handling and the start of the carried-state fast path.
    s += """
    if c <= '\\u{A0}' {
        match c {
            // According to the spec, LF should be width 1, which is how it is often rendered when it is forced to have a single-line rendering
            // However, this makes it harder to use this crate to calculate line breaks, and breaks assumptions of downstream crates.
            // https://github.com/unicode-rs/unicode-width/issues/60
            '\\n' => (0, WidthInfo::LINE_FEED),
            '\\r' if next_info == WidthInfo::LINE_FEED => (0, WidthInfo::DEFAULT),
            _ => (1, WidthInfo::DEFAULT),
        }
    } else {
        // Fast path
        if next_info != WidthInfo::DEFAULT {
            if c == '\\u{FE0F}' {
                return (0, next_info.set_emoji_presentation());
            }"""

    # Non-CJK only: text presentation (U+FE0E) handling.
    if not is_cjk:
        s += """
            if c == '\\u{FE0E}' {
                return (0, next_info.set_text_presentation());
            }
            if next_info.is_text_presentation() {
                if starts_non_ideographic_text_presentation_seq(c) {
                    return (1, WidthInfo::DEFAULT);
                } else {
                    next_info = next_info.unset_text_presentation();
                }
            }"""

    s += """
            if next_info.is_ligature_transparent() {
                if c == '\\u{200D}' {
                    return (0, next_info.set_zwj_bit());
                } else if is_ligature_transparent(c) {
                    return (0, next_info);
                }
            }

            match (next_info, c) {"""
    # The Lam-Alef arm: CJK additionally carries the solidus-overlay state.
    if is_cjk:
        s += """
                (WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY, _) if is_solidus_transparent(c) => {
                    return (
                        lookup_width_cjk(c).0 as i8,
                        WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY,
                    );
                }
                (WidthInfo::JOINING_GROUP_ALEF, '\\u{0338}') => {
                    return (0, WidthInfo::SOLIDUS_OVERLAY_ALEF);
                }
                // Arabic Lam-Alef ligature
                (
                    WidthInfo::JOINING_GROUP_ALEF | WidthInfo::SOLIDUS_OVERLAY_ALEF,
                    """
    else:
        s += """
                // Arabic Lam-Alef ligature
                (
                    WidthInfo::JOINING_GROUP_ALEF,
                    """

    # Splice in the Lam codepoint ranges as a `|`-joined char pattern.
    tail = False
    for lo, hi in joining_group_lam:
        if tail:
            s += " | "
        tail = True
        s += f"'\\u{{{lo:X}}}'"
        if hi != lo:
            s += f"..='\\u{{{hi:X}}}'"
    s += """,
                ) => return (0, WidthInfo::DEFAULT),
                (WidthInfo::JOINING_GROUP_ALEF, _) if is_transparent_zero_width(c) => {
                    return (0, WidthInfo::JOINING_GROUP_ALEF);
                }

                // Hebrew Alef-ZWJ-Lamed ligature
                (WidthInfo::ZWJ_HEBREW_LETTER_LAMED, '\\u{05D0}') => {
                    return (0, WidthInfo::DEFAULT);
                }

                // Khmer coeng signs
                (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => {
                    return (-1, WidthInfo::DEFAULT);
                }

                // Buginese <a, -i> ZWJ ya ligature
                (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => {
                    return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
                }
                (WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA, '\\u{1A15}') => {
                    return (0, WidthInfo::DEFAULT)
                }

                // Tifinagh bi-consonants
                (WidthInfo::TIFINAGH_CONSONANT | WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D7F}') => {
                    return (1, WidthInfo::TIFINAGH_JOINER_CONSONANT);
                }
                (WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => {
                    return (0, WidthInfo::DEFAULT);
                }
                (WidthInfo::TIFINAGH_JOINER_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => {
                    return (-1, WidthInfo::DEFAULT);
                }

                // Lisu tone letter combinations
                (WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU, '\\u{A4F8}'..='\\u{A4FB}') => {
                    return (0, WidthInfo::DEFAULT);
                }

                // Old Turkic ligature
                (WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, '\\u{10C32}') => {
                    return (0, WidthInfo::DEFAULT);
                }"""

    # Emoji state machine (modifiers, regional indicators, ZWJ sequences, tags),
    # then the fallback table lookup and the `str_width` fold over the string
    # in reverse (widths depend on the *following* character).
    s += f"""
                // Emoji modifier
                (WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => {{
                    return (0, WidthInfo::EMOJI_PRESENTATION);
                }}

                // Regional indicator
                (
                    WidthInfo::REGIONAL_INDICATOR | WidthInfo::SEVERAL_REGIONAL_INDICATOR,
                    '\\u{{1F1E6}}'..='\\u{{1F1FF}}',
                ) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR),

                // ZWJ emoji
                (
                    WidthInfo::EMOJI_PRESENTATION
                    | WidthInfo::SEVERAL_REGIONAL_INDICATOR
                    | WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION
                    | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION
                    | WidthInfo::EMOJI_MODIFIER,
                    '\\u{{200D}}',
                ) => return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION),
                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{20E3}}') => {{
                    return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION);
                }}
                (WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => {{
                    return (0, WidthInfo::EMOJI_PRESENTATION)
                }}
                (WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => {{
                    return (0, WidthInfo::EMOJI_PRESENTATION)
                }}
                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F1E6}}'..='\\u{{1F1FF}}') => {{
                    return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION);
                }}
                (
                    WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION
                    | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION,
                    '\\u{{1F1E6}}'..='\\u{{1F1FF}}',
                ) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION),
                (
                    WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION,
                    '\\u{{1F1E6}}'..='\\u{{1F1FF}}',
                ) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION),
                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F3FB}}'..='\\u{{1F3FF}}') => {{
                    return (0, WidthInfo::EMOJI_MODIFIER);
                }}
                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{E007F}}') => {{
                    return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION);
                }}
                (WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
                    return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION);
                }}
                (WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
                    return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION)
                }}
                (WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
                    return (0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION)
                }}
                (WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
                    return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION)
                }}
                (WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
                    return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION)
                }}
                (WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
                    return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION)
                }}
                (
                    WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION,
                    '\\u{{E0030}}'..='\\u{{E0039}}',
                ) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION),
                (WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{
                    return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION);
                }}
                (WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{
                    return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION);
                }}
                (
                    WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION
                    | WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION,
                    '\\u{{1F3F4}}',
                ) => return (0, WidthInfo::EMOJI_PRESENTATION),
                (WidthInfo::ZWJ_EMOJI_PRESENTATION, _)
                    if lookup_width{cjk_lo}(c).1 == WidthInfo::EMOJI_PRESENTATION =>
                {{
                    return (0, WidthInfo::EMOJI_PRESENTATION)
                }}

                // Fallback
                _ => {{}}
            }}
        }}

        let ret = lookup_width{cjk_lo}(c);
        (ret.0 as i8, ret.1)
    }}
}}

{cfg}#[inline]
pub fn str_width{cjk_lo}(s: &str) -> usize {{
    s.chars()
        .rfold(
            (0, WidthInfo::DEFAULT),
            |(sum, next_info), c| -> (usize, WidthInfo) {{
                let (add, info) = width_in_str{cjk_lo}(c, next_info);
                (sum.wrapping_add_signed(isize::from(add)), info)
            }},
        )
        .0
}}
"""

    return s
1515
1516
1517def emit_module(
1518    out_name: str,
1519    unicode_version: tuple[int, int, int],
1520    tables: list[Table],
1521    special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]],
1522    special_ranges_cjk: list[tuple[tuple[Codepoint, Codepoint], WidthState]],
1523    emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
1524    text_presentation_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]],
1525    emoji_modifier_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]],
1526    joining_group_lam: list[tuple[Codepoint, Codepoint]],
1527    non_transparent_zero_widths: list[tuple[Codepoint, Codepoint]],
1528    ligature_transparent: list[tuple[Codepoint, Codepoint]],
1529    solidus_transparent: list[tuple[Codepoint, Codepoint]],
1530    normalization_tests: list[tuple[str, str, str, str, str]],
1531):
1532    """Outputs a Rust module to `out_name` using table data from `tables`.
1533    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
1534    """
1535    if os.path.exists(out_name):
1536        os.remove(out_name)
1537    with open(out_name, "w", newline="\n", encoding="utf-8") as module:
1538        module.write(
1539            """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT
1540// file at the top-level directory of this distribution and at
1541// http://rust-lang.org/COPYRIGHT.
1542//
1543// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
1544// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
1545// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
1546// option. This file may not be copied, modified, or distributed
1547// except according to those terms.
1548
1549// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
1550
1551use core::cmp::Ordering;
1552
1553#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1554struct WidthInfo(u16);
1555
1556impl WidthInfo {
1557    /// No special handling necessary
1558    const DEFAULT: Self = Self(0);
1559"""
1560        )
1561
1562        for variant in WidthState:
1563            if variant.is_carried():
1564                if variant.is_cjk_only():
1565                    module.write('    #[cfg(feature = "cjk")]\n')
1566                module.write(
1567                    f"    const {variant.name}: Self = Self(0b{variant.value:016b});\n"
1568                )
1569
1570        module.write(
1571            f"""
1572    /// Whether this width mode is ligature_transparent
1573    /// (has 5th MSB set.)
1574    fn is_ligature_transparent(self) -> bool {{
1575        (self.0 & 0b0000_1000_0000_0000) == 0b0000_1000_0000_0000
1576    }}
1577
1578    /// Sets 6th MSB.
1579    fn set_zwj_bit(self) -> Self {{
1580        Self(self.0 | 0b0000_0100_0000_0000)
1581    }}
1582
1583    /// Has top bit set
1584    fn is_emoji_presentation(self) -> bool {{
1585        (self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000
1586    }}
1587
1588    /// Has top bit set
1589    fn is_zwj_emoji_presentation(self) -> bool {{
1590        (self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
1591    }}
1592
1593    /// Set top bit
1594    fn set_emoji_presentation(self) -> Self {{
1595        if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000
1596            || (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
1597        {{
1598            Self(self.0 | 0b1000_0000_0000_0000)
1599        }} else {{
1600            Self::VARIATION_SELECTOR_16
1601        }}
1602    }}
1603
1604    /// Clear top bit
1605    fn unset_emoji_presentation(self) -> Self {{
1606        if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1607            Self(self.0 & 0b0111_1111_1111_1111)
1608        }} else {{
1609            Self::DEFAULT
1610        }}
1611    }}
1612
1613    /// Has 2nd bit set
1614    fn is_text_presentation(self) -> bool {{
1615        (self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000
1616    }}
1617
1618    /// Set 2nd bit
1619    fn set_text_presentation(self) -> Self {{
1620        if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1621            Self(self.0 | 0b0100_0000_0000_0000)
1622        }} else {{
1623            Self(0b0100_0000_0000_0000)
1624        }}
1625    }}
1626
1627    /// Clear 2nd bit
1628    fn unset_text_presentation(self) -> Self {{
1629        Self(self.0 & 0b1011_1111_1111_1111)
1630    }}
1631}}
1632
1633/// The version of [Unicode](http://www.unicode.org/)
1634/// that this version of unicode-width is based on.
1635pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version};
1636"""
1637        )
1638
1639        module.write(lookup_fns(False, special_ranges, joining_group_lam))
1640        module.write(lookup_fns(True, special_ranges_cjk, joining_group_lam))
1641
1642        emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table
1643        text_presentation_idx, text_presentation_leaves = text_presentation_table
1644        emoji_modifier_idx, emoji_modifier_leaves = emoji_modifier_table
1645
1646        module.write(
1647            """
1648/// Whether this character is a zero-width character with
1649/// `Joining_Type=Transparent`. Used by the Alef-Lamed ligatures.
1650/// See also [`is_ligature_transparent`], a near-subset of this (only ZWJ is excepted)
1651/// which is transparent for non-Arabic ligatures.
1652fn is_transparent_zero_width(c: char) -> bool {
1653    if lookup_width(c).0 != 0 {
1654        // Not zero-width
1655        false
1656    } else {
1657        let cp: u32 = c.into();
1658        NON_TRANSPARENT_ZERO_WIDTHS
1659            .binary_search_by(|&(lo, hi)| {
1660                let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]);
1661                let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]);
1662                if cp < lo {
1663                    Ordering::Greater
1664                } else if cp > hi {
1665                    Ordering::Less
1666                } else {
1667                    Ordering::Equal
1668                }
1669            })
1670            .is_err()
1671    }
1672}
1673
1674/// Whether this character is a default-ignorable combining mark
1675/// or ZWJ. These characters won't interrupt non-Arabic ligatures.
1676fn is_ligature_transparent(c: char) -> bool {
1677    matches!(c, """
1678        )
1679
1680        tail = False
1681        for lo, hi in ligature_transparent:
1682            if tail:
1683                module.write(" | ")
1684            tail = True
1685            module.write(f"'\\u{{{lo:X}}}'")
1686            if hi != lo:
1687                module.write(f"..='\\u{{{hi:X}}}'")
1688
1689        module.write(
1690            """)
1691}
1692
1693/// Whether this character is transparent wrt the effect of
1694/// U+0338 COMBINING LONG SOLIDUS OVERLAY
1695/// on its base character.
1696#[cfg(feature = "cjk")]
1697fn is_solidus_transparent(c: char) -> bool {
1698    let cp: u32 = c.into();
1699    is_ligature_transparent(c)
1700        || SOLIDUS_TRANSPARENT
1701            .binary_search_by(|&(lo, hi)| {
1702                let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]);
1703                let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]);
1704                if cp < lo {
1705                    Ordering::Greater
1706                } else if cp > hi {
1707                    Ordering::Less
1708                } else {
1709                    Ordering::Equal
1710                }
1711            })
1712            .is_ok()
1713}
1714
1715/// Whether this character forms an [emoji presentation sequence]
1716/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// when followed by `'\\u{FE0F}'`.
1718/// Emoji presentation sequences are considered to have width 2.
1719#[inline]
1720pub fn starts_emoji_presentation_seq(c: char) -> bool {
1721    let cp: u32 = c.into();
1722    // First level of lookup uses all but 10 LSB
1723    let top_bits = cp >> 10;
1724    let idx_of_leaf: usize = match top_bits {
1725"""
1726        )
1727
1728        for msbs, i in emoji_presentation_idx:
1729            module.write(f"        0x{msbs:X} => {i},\n")
1730
1731        module.write(
1732            """        _ => return false,
1733    };
1734    // Extract the 3-9th (0-indexed) least significant bits of `cp`,
1735    // and use them to index into `leaf_row`.
1736    let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
1737    let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
1738    // Use the 3 LSB of `cp` to index into `leaf_byte`.
1739    ((leaf_byte >> (cp & 7)) & 1) == 1
1740}
1741
1742/// Returns `true` if `c` has default emoji presentation, but forms a [text presentation sequence]
1743/// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence)
/// when followed by `'\\u{FE0E}'`, and is not ideographic.
1745/// Such sequences are considered to have width 1.
1746#[inline]
1747pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool {
1748    let cp: u32 = c.into();
1749    // First level of lookup uses all but 8 LSB
1750    let top_bits = cp >> 8;
1751    let leaf: &[(u8, u8)] = match top_bits {
1752"""
1753        )
1754
1755        for msbs, i in text_presentation_idx:
1756            module.write(f"        0x{msbs:X} => &TEXT_PRESENTATION_LEAF_{i},\n")
1757
1758        module.write(
1759            """        _ => return false,
1760    };
1761
1762    let bottom_bits = (cp & 0xFF) as u8;
1763    leaf.binary_search_by(|&(lo, hi)| {
1764        if bottom_bits < lo {
1765            Ordering::Greater
1766        } else if bottom_bits > hi {
1767            Ordering::Less
1768        } else {
1769            Ordering::Equal
1770        }
1771    })
1772    .is_ok()
1773}
1774
1775/// Returns `true` if `c` is an `Emoji_Modifier_Base`.
1776#[inline]
1777pub fn is_emoji_modifier_base(c: char) -> bool {
1778    let cp: u32 = c.into();
1779    // First level of lookup uses all but 8 LSB
1780    let top_bits = cp >> 8;
1781    let leaf: &[(u8, u8)] = match top_bits {
1782"""
1783        )
1784
1785        for msbs, i in emoji_modifier_idx:
1786            module.write(f"        0x{msbs:X} => &EMOJI_MODIFIER_LEAF_{i},\n")
1787
1788        module.write(
1789            """        _ => return false,
1790    };
1791
1792    let bottom_bits = (cp & 0xFF) as u8;
1793    leaf.binary_search_by(|&(lo, hi)| {
1794        if bottom_bits < lo {
1795            Ordering::Greater
1796        } else if bottom_bits > hi {
1797            Ordering::Less
1798        } else {
1799            Ordering::Equal
1800        }
1801    })
1802    .is_ok()
1803}
1804
1805#[repr(align(32))]
1806struct Align32<T>(T);
1807
1808#[repr(align(64))]
1809struct Align64<T>(T);
1810
1811#[repr(align(128))]
1812struct Align128<T>(T);
1813"""
1814        )
1815
1816        subtable_count = 1
1817        for i, table in enumerate(tables):
1818            new_subtable_count = len(table.buckets())
1819            if i == len(tables) - 1:
1820                table.indices_to_widths()  # for the last table, indices == widths
1821            byte_array = table.to_bytes()
1822
1823            if table.bytes_per_row is None:
1824                module.write(
1825                    f"/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.)\n"
1826                )
1827                if table.cfged:
1828                    module.write('#[cfg(feature = "cjk")]\n')
1829                module.write(
1830                    f"static {table.name}: Align{table.align}<[u8; {len(byte_array)}]> = Align{table.align}(["
1831                )
1832                for j, byte in enumerate(byte_array):
1833                    # Add line breaks for every 15th entry (chosen to match what rustfmt does)
1834                    if j % 16 == 0:
1835                        module.write("\n   ")
1836                    module.write(f" 0x{byte:02X},")
1837                module.write("\n")
1838            else:
1839                num_rows = len(byte_array) // table.bytes_per_row
1840                num_primary_rows = (
1841                    table.primary_len
1842                    // (8 // int(table.offset_type))
1843                    // table.bytes_per_row
1844                )
1845                module.write(
1846                    f"""
1847#[cfg(feature = "cjk")]
1848const {table.name}_LEN: usize = {num_rows};
1849#[cfg(not(feature = "cjk"))]
1850const {table.name}_LEN: usize = {num_primary_rows};
1851/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
1852static {table.name}: Align{table.align}<[[u8; {table.bytes_per_row}]; {table.name}_LEN]> = Align{table.align}([\n"""
1853                )
1854                for row_num in range(0, num_rows):
1855                    if row_num >= num_primary_rows:
1856                        module.write('    #[cfg(feature = "cjk")]\n')
1857                    module.write("    [\n")
1858                    row = byte_array[
1859                        row_num
1860                        * table.bytes_per_row : (row_num + 1)
1861                        * table.bytes_per_row
1862                    ]
1863                    for subrow in batched(row, 15):
1864                        module.write("       ")
1865                        for entry in subrow:
1866                            module.write(f" 0x{entry:02X},")
1867                        module.write("\n")
1868                    module.write("    ],\n")
1869            module.write("]);\n")
1870            subtable_count = new_subtable_count
1871
1872        # non transparent zero width table
1873
1874        module.write(
1875            f"""
1876/// Sorted list of codepoint ranges (inclusive)
1877/// that are zero-width but not `Joining_Type=Transparent`
1878/// FIXME: can we get better compression?
1879static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); {len(non_transparent_zero_widths)}] = [
1880"""
1881        )
1882
1883        for lo, hi in non_transparent_zero_widths:
1884            module.write(
1885                f"    ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n"
1886            )
1887
1888        # solidus transparent table
1889
1890        module.write(
1891            f"""];
1892
1893/// Sorted list of codepoint ranges (inclusive)
1894/// that don't affect how the combining solidus applies
1895/// (mostly ccc > 1).
1896/// FIXME: can we get better compression?
1897#[cfg(feature = "cjk")]
1898static SOLIDUS_TRANSPARENT: [([u8; 3], [u8; 3]); {len(solidus_transparent)}] = [
1899"""
1900        )
1901
1902        for lo, hi in solidus_transparent:
1903            module.write(
1904                f"    ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n"
1905            )
1906
1907        # emoji table
1908
1909        module.write(
1910            f"""];
1911
1912/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
1913/// to get whether it can start an emoji presentation sequence.
1914static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([
1915"""
1916        )
1917        for leaf in emoji_presentation_leaves:
1918            module.write("    [\n")
1919            for row in batched(leaf, 15):
1920                module.write("       ")
1921                for entry in row:
1922                    module.write(f" 0x{entry:02X},")
1923                module.write("\n")
1924            module.write("    ],\n")
1925
1926        module.write("]);\n")
1927
1928        # text table
1929
1930        for leaf_idx, leaf in enumerate(text_presentation_leaves):
1931            module.write(
1932                f"""
1933#[rustfmt::skip]
1934static TEXT_PRESENTATION_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [
1935"""
1936            )
1937            for lo, hi in leaf:
1938                module.write(f"    (0x{lo:02X}, 0x{hi:02X}),\n")
1939            module.write(f"];\n")
1940
1941        # emoji modifier table
1942
1943        for leaf_idx, leaf in enumerate(emoji_modifier_leaves):
1944            module.write(
1945                f"""
1946#[rustfmt::skip]
1947static EMOJI_MODIFIER_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [
1948"""
1949            )
1950            for lo, hi in leaf:
1951                module.write(f"    (0x{lo:02X}, 0x{hi:02X}),\n")
1952            module.write(f"];\n")
1953
1954        test_width_variants = []
1955        test_width_variants_cjk = []
1956        for variant in WidthState:
1957            if variant.is_carried():
1958                if not variant.is_cjk_only():
1959                    test_width_variants.append(variant)
1960                if not variant.is_non_cjk_only():
1961                    test_width_variants_cjk.append(variant)
1962
1963        module.write(
1964            f"""
1965#[cfg(test)]
1966mod tests {{
1967    use super::*;
1968
1969    fn str_width_test(s: &str, init: WidthInfo) -> isize {{
1970        s.chars()
1971            .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{
1972                let (add, info) = width_in_str(c, next_info);
1973                (sum.checked_add(isize::from(add)).unwrap(), info)
1974            }})
1975            .0
1976    }}
1977
1978    #[cfg(feature = "cjk")]
1979    fn str_width_test_cjk(s: &str, init: WidthInfo) -> isize {{
1980        s.chars()
1981            .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{
1982                let (add, info) = width_in_str_cjk(c, next_info);
1983                (sum.checked_add(isize::from(add)).unwrap(), info)
1984            }})
1985            .0
1986    }}
1987
1988    #[test]
1989    fn test_normalization() {{
1990        for &(orig, nfc, nfd, nfkc, nfkd) in &NORMALIZATION_TEST {{
1991            for init in NORMALIZATION_TEST_WIDTHS {{
1992                assert_eq!(
1993                    str_width_test(orig, init),
1994                    str_width_test(nfc, init),
1995                    "width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}",
1996                );
1997                assert_eq!(
1998                    str_width_test(orig, init),
1999                    str_width_test(nfd, init),
2000                    "width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}",
2001                );
2002                assert_eq!(
2003                    str_width_test(nfkc, init),
2004                    str_width_test(nfkd, init),
2005                    "width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:X?}}",
2006                );
2007            }}
2008
2009            #[cfg(feature = "cjk")]
2010            for init in NORMALIZATION_TEST_WIDTHS_CJK {{
2011                assert_eq!(
2012                    str_width_test_cjk(orig, init),
2013                    str_width_test_cjk(nfc, init),
2014                    "CJK width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}",
2015                );
2016                assert_eq!(
2017                    str_width_test_cjk(orig, init),
2018                    str_width_test_cjk(nfd, init),
2019                    "CJK width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}",
2020                );
2021                assert_eq!(
2022                    str_width_test_cjk(nfkc, init),
2023                    str_width_test_cjk(nfkd, init),
2024                    "CJK width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:?}}",
2025                );
2026            }}
2027        }}
2028    }}
2029
2030    static NORMALIZATION_TEST_WIDTHS: [WidthInfo; {len(test_width_variants) + 1}] = [
2031        WidthInfo::DEFAULT,\n"""
2032        )
2033
2034        for variant in WidthState:
2035            if variant.is_carried() and not variant.is_cjk_only():
2036                module.write(f"        WidthInfo::{variant.name},\n")
2037
2038        module.write(
2039            f"""    ];
2040
2041    #[cfg(feature = "cjk")]
2042    static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; {len(test_width_variants_cjk) + 1}] = [
2043        WidthInfo::DEFAULT,\n"""
2044        )
2045
2046        for variant in WidthState:
2047            if variant.is_carried() and not variant.is_non_cjk_only():
2048                module.write(f"        WidthInfo::{variant.name},\n")
2049
2050        module.write(
2051            f"""    ];
2052
2053    #[rustfmt::skip]
2054    static NORMALIZATION_TEST: [(&str, &str, &str, &str, &str); {len(normalization_tests)}] = [\n"""
2055        )
2056        for orig, nfc, nfd, nfkc, nfkd in normalization_tests:
2057            module.write(
2058                f'        (r#"{orig}"#, r#"{nfc}"#, r#"{nfd}"#, r#"{nfkc}"#, r#"{nfkd}"#),\n'
2059            )
2060
2061        module.write("    ];\n}\n")
2062
2063
def main(module_path: str):
    """Obtain character data from the latest version of Unicode, transform it into a
    multi-level lookup table for character width, and write a Rust module utilizing
    that table to `module_path`.

    See `lib.rs` for documentation of the exact width rules.
    """
    version = load_unicode_version()
    print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

    # Per-codepoint width maps (non-CJK and CJK variants) drive the main tables.
    (width_map, cjk_width_map) = load_width_maps()

    tables = make_tables(width_map, cjk_width_map)

    special_ranges = make_special_ranges(width_map)
    cjk_special_ranges = make_special_ranges(cjk_width_map)

    emoji_presentations = load_emoji_presentation_sequences()
    emoji_presentation_table = make_presentation_sequence_table(emoji_presentations)

    text_presentations = load_text_presentation_sequences()
    text_presentation_table = make_ranges_table(text_presentations)

    emoji_modifier_bases = load_emoji_modifier_bases()
    emoji_modifier_table = make_ranges_table(emoji_modifier_bases)

    joining_group_lam = load_joining_group_lam()
    non_transparent_zero_widths = load_non_transparent_zero_widths(width_map)
    ligature_transparent = load_ligature_transparent()
    solidus_transparent = load_solidus_transparent(ligature_transparent, cjk_width_map)

    normalization_tests = load_normalization_tests()

    # Download emoji-test.txt for the test suite; called for its side effect only.
    fetch_open("emoji-test.txt", "../tests", emoji=True)

    # --- Size accounting below is purely informational console output ---
    print("------------------------")
    total_size = 0
    for i, table in enumerate(tables):
        size_bytes = len(table.to_bytes())
        print(f"Table {i} size: {size_bytes} bytes")
        total_size += size_bytes

    # Bitmap-style table: index maps codepoint prefixes to fixed-size bitmap leaves.
    # Index entry cost: enough bytes to hold the largest prefix, plus a 1-byte (8-bit)
    # leaf index.
    for s, (index, leaves) in [
        ("Emoji presentation", emoji_presentation_table),
    ]:
        index_size = len(index) * (math.ceil(math.log(index[-1][0], 256)) + 8)
        print(f"{s} index size: {index_size} bytes")
        total_size += index_size
        leaves_size = len(leaves) * len(leaves[0])
        print(f"{s} leaves size: {leaves_size} bytes")
        total_size += leaves_size

    # Range-style tables: leaves are lists of (lo, hi) byte pairs (2 bytes per range),
    # and index entries carry a 16-bit leaf reference.
    for s, (index, leaves) in [
        ("Text presentation", text_presentation_table),
        ("Emoji modifier", emoji_modifier_table),
    ]:
        index_size = len(index) * (math.ceil(math.log(index[-1][0], 256)) + 16)
        print(f"{s} index size: {index_size} bytes")
        total_size += index_size
        leaves_size = 2 * sum(map(len, leaves))
        print(f"{s} leaves size: {leaves_size} bytes")
        total_size += leaves_size

    # Each entry is a codepoint range stored as two 3-byte values (6 bytes total).
    for s, table in [
        ("Non transparent zero width", non_transparent_zero_widths),
        ("Solidus transparent", solidus_transparent),
    ]:
        table_size = 6 * len(table)
        print(f"{s} table size: {table_size} bytes")
        total_size += table_size
    print("------------------------")
    print(f"  Total size: {total_size} bytes")

    emit_module(
        out_name=module_path,
        unicode_version=version,
        tables=tables,
        special_ranges=special_ranges,
        special_ranges_cjk=cjk_special_ranges,
        emoji_presentation_table=emoji_presentation_table,
        text_presentation_table=text_presentation_table,
        emoji_modifier_table=emoji_modifier_table,
        joining_group_lam=joining_group_lam,
        non_transparent_zero_widths=non_transparent_zero_widths,
        ligature_transparent=ligature_transparent,
        solidus_transparent=solidus_transparent,
        normalization_tests=normalization_tests,
    )
    print(f'Wrote to "{module_path}"')
2153
2154
if __name__ == "__main__":
    # Script entry point: regenerate the Rust width-lookup module at the
    # default output location (`MODULE_PATH`, defined earlier in this file).
    main(MODULE_PATH)
2157