1#!/usr/bin/env python3 2# 3# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT 4# file at the top-level directory of this distribution and at 5# http://rust-lang.org/COPYRIGHT. 6# 7# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 8# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 9# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 10# option. This file may not be copied, modified, or distributed 11# except according to those terms. 12 13# This script uses the following Unicode tables: 14# 15# - DerivedCoreProperties.txt 16# - EastAsianWidth.txt 17# - HangulSyllableType.txt 18# - NormalizationTest.txt (for tests only) 19# - PropList.txt 20# - ReadMe.txt 21# - UnicodeData.txt 22# - auxiliary/GraphemeBreakProperty.txt 23# - emoji/emoji-data.txt 24# - emoji/emoji-variation-sequences.txt 25# - extracted/DerivedGeneralCategory.txt 26# 27# Since this should not require frequent updates, we just store this 28# out-of-line and check the generated module into git. 29 30import enum 31import math 32import operator 33import os 34import re 35import sys 36import urllib.request 37from collections import defaultdict 38from itertools import batched 39from typing import Callable, Iterable 40 41UNICODE_VERSION = "15.1.0" 42"""The version of the Unicode data files to download.""" 43 44NUM_CODEPOINTS = 0x110000 45"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" 46 47MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1)) 48"""The maximum number of bits required to represent a Unicode codepoint.""" 49 50 51class OffsetType(enum.IntEnum): 52 """Represents the data type of a lookup table's offsets. Each variant's value represents the 53 number of bits required to represent that variant's type.""" 54 55 U2 = 2 56 """Offsets are 2-bit unsigned integers, packed four-per-byte.""" 57 U4 = 4 58 """Offsets are 4-bit unsigned integers, packed two-per-byte.""" 59 U8 = 8 60 """Each offset is a single byte (u8).""" 61 62 63MODULE_PATH = "../src/tables.rs" 64"""The path of the emitted Rust module (relative to the working directory)""" 65 66TABLE_SPLITS = [7, 13] 67"""The splits between the bits of the codepoint used to index each subtable. 68Adjust these values to change the sizes of the subtables""" 69 70Codepoint = int 71BitPos = int 72 73 74def fetch_open(filename: str, local_prefix: str = "", emoji: bool = False): 75 """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, 76 fetches it from `https://www.unicode.org/Public/`. Exits with code 1 on failure. 77 """ 78 basename = os.path.basename(filename) 79 localname = os.path.join(local_prefix, basename) 80 if not os.path.exists(localname): 81 if emoji: 82 prefix = f"emoji/{UNICODE_VERSION[:-2]}" 83 else: 84 prefix = f"{UNICODE_VERSION}/ucd" 85 urllib.request.urlretrieve( 86 f"https://www.unicode.org/Public/{prefix}/{filename}", 87 localname, 88 ) 89 try: 90 return open(localname, encoding="utf-8") 91 except OSError: 92 sys.stderr.write(f"cannot load {localname}") 93 sys.exit(1) 94 95 96def load_unicode_version() -> tuple[int, int, int]: 97 """Returns the current Unicode version by fetching and processing `ReadMe.txt`.""" 98 with fetch_open("ReadMe.txt") as readme: 99 pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" 100 return tuple(map(int, re.search(pattern, readme.read()).groups())) # type: ignore 101 102 103def load_property(filename: str, pattern: str, action: Callable[[int], None]): 104 with fetch_open(filename) as properties: 105 single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+") 106 multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+") 107 108 for line in properties.readlines(): 109 raw_data = None # (low, high) 110 if match := single.match(line): 111 raw_data = (match.group(1), match.group(1)) 112 elif match := multiple.match(line): 113 raw_data = (match.group(1), match.group(2)) 114 else: 115 continue 116 low = int(raw_data[0], 16) 117 high = int(raw_data[1], 16) 118 for cp in range(low, high + 1): 119 action(cp) 120 121 122def to_sorted_ranges(iter: Iterable[Codepoint]) -> list[tuple[Codepoint, Codepoint]]: 123 "Creates a sorted list of ranges from an iterable of codepoints" 124 lst = [c for c in iter] 125 lst.sort() 126 ret = [] 127 for cp in lst: 128 if len(ret) > 0 and ret[-1][1] == cp - 1: 129 ret[-1] = (ret[-1][0], cp) 130 else: 131 ret.append((cp, cp)) 132 return ret 133 134 135class EastAsianWidth(enum.IntEnum): 136 """Represents the width of a Unicode character according to UAX 16. 137 All East Asian Width classes resolve into either 138 `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`. 139 """ 140 141 NARROW = 1 142 """ One column wide. """ 143 WIDE = 2 144 """ Two columns wide. """ 145 AMBIGUOUS = 3 146 """ Two columns wide in a CJK context. One column wide in all other contexts. """ 147 148 149class CharWidthInTable(enum.IntEnum): 150 """Represents the width of a Unicode character 151 as stored in the tables.""" 152 153 ZERO = 0 154 ONE = 1 155 TWO = 2 156 SPECIAL = 3 157 158 159class WidthState(enum.IntEnum): 160 """ 161 Width calculation proceeds according to a state machine. 162 We iterate over the characters of the string from back to front; 163 the next character encountered determines the transition to take. 164 165 The integer values of these variants have special meaning: 166 - Top bit: whether this is Vs16 167 - 2nd from top: whether this is Vs15 168 - 3rd bit from top: whether this is transparent to emoji/text presentation 169 (if set, should also set 4th) 170 - 4th bit: whether to set top bit on emoji presentation. 171 If this is set but 3rd is not, the width mode is related to zwj sequences 172 - 5th from top: whether this is unaffected by ligature-transparent 173 - 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state 174 where no ZWJ has been encountered yet; encountering one flips this on""" 175 176 # BASIC WIDTHS 177 178 ZERO = 0x1_0000 179 "Zero columns wide." 180 181 NARROW = 0x1_0001 182 "One column wide." 183 184 WIDE = 0x1_0002 185 "Two columns wide." 186 187 THREE = 0x1_0003 188 "Three columns wide." 189 190 # \r\n 191 LINE_FEED = 0b0000_0000_0000_0001 192 "\\n (CRLF has width 1)" 193 194 # EMOJI 195 196 # Emoji skintone modifiers 197 EMOJI_MODIFIER = 0b0000_0000_0000_0010 198 "`Emoji_Modifier`" 199 200 # Emoji ZWJ sequences 201 202 REGIONAL_INDICATOR = 0b0000_0000_0000_0011 203 "`Regional_Indicator`" 204 205 SEVERAL_REGIONAL_INDICATOR = 0b0000_0000_0000_0100 206 "At least two `Regional_Indicator`in sequence" 207 208 EMOJI_PRESENTATION = 0b0000_0000_0000_0101 209 "`Emoji_Presentation`" 210 211 ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0110 212 "\\u200D `Emoji_Presentation`" 213 214 VS16_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0110 215 "\\uFE0F \\u200D `Emoji_Presentation`" 216 217 KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0111 218 "\\u20E3 \\u200D `Emoji_Presentation`" 219 220 VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0111 221 "\\uFE0F \\u20E3 \\u200D `Emoji_Presentation`" 222 223 REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1001 224 "`Regional_Indicator` \\u200D `Emoji_Presentation`" 225 226 EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1010 227 "(`Regional_Indicator` `Regional_Indicator`)+ \\u200D `Emoji_Presentation`" 228 229 ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1011 230 "(`Regional_Indicator` `Regional_Indicator`)+ `Regional_Indicator` \\u200D `Emoji_Presentation`" 231 232 TAG_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0000 233 "\\uE007F \\u200D `Emoji_Presentation`" 234 235 TAG_D1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0001 236 "\\uE0030..=\\uE0039 \\uE007F \\u200D `Emoji_Presentation`" 237 238 TAG_D2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0010 239 "(\\uE0030..=\\uE0039){2} \\uE007F \\u200D `Emoji_Presentation`" 240 241 TAG_D3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0011 242 "(\\uE0030..=\\uE0039){3} \\uE007F \\u200D `Emoji_Presentation`" 243 244 TAG_A1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1001 245 "\\uE0061..=\\uE007A \\uE007F \\u200D `Emoji_Presentation`" 246 247 TAG_A2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1010 248 "(\\uE0061..=\\uE007A){2} \\uE007F \\u200D `Emoji_Presentation`" 249 250 TAG_A3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1011 251 "(\\uE0061..=\\uE007A){3} \\uE007F \\u200D `Emoji_Presentation`" 252 253 TAG_A4_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1100 254 "(\\uE0061..=\\uE007A){4} \\uE007F \\u200D `Emoji_Presentation`" 255 256 TAG_A5_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1101 257 "(\\uE0061..=\\uE007A){35} \\uE007F \\u200D `Emoji_Presentation`" 258 259 TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110 260 "(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`" 261 262 # VARIATION SELECTORS 263 264 # Text presentation sequences (not CJK) 265 VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000 266 "\\uFE0E (text presentation sequences)" 267 268 # Emoji presentation sequences 269 VARIATION_SELECTOR_16 = 0b1000_0000_0000_0000 270 "\\uFE0F (emoji presentation sequences)" 271 272 # ARABIC LAM ALEF 273 274 JOINING_GROUP_ALEF = 0b0011_0000_1111_1111 275 "Joining_Group=Alef (Arabic Lam-Alef ligature)" 276 277 # COMBINING SOLIDUS (CJK only) 278 279 COMBINING_LONG_SOLIDUS_OVERLAY = 0b0011_1100_1111_1111 280 "\\u0338 (CJK only, makes <, =, > width 2)" 281 282 # SOLIDUS + ALEF (solidus is Joining_Type=Transparent) 283 SOLIDUS_OVERLAY_ALEF = 0b0011_1000_1111_1111 284 "\\u0338 followed by Joining_Group=Alef" 285 286 # SCRIPT ZWJ LIGATURES 287 288 # Hebrew alef lamed 289 290 HEBREW_LETTER_LAMED = 0b0011_1000_0000_0000 291 "\\u05DC (Alef-ZWJ-Lamed ligature)" 292 293 ZWJ_HEBREW_LETTER_LAMED = 0b0011_1100_0000_0000 294 "\\u200D\\u05DC (Alef-ZWJ-Lamed ligature)" 295 296 # Buginese <a -i> ya 297 298 BUGINESE_LETTER_YA = 0b0011_1000_0000_0001 299 "\\u1A10 (<a, -i> + ya ligature)" 300 301 ZWJ_BUGINESE_LETTER_YA = 0b0011_1100_0000_0001 302 "\\u200D\\u1A10 (<a, -i> + ya ligature)" 303 304 BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA = 0b0011_1100_0000_0010 305 "\\u1A17\\u200D\\u1A10 (<a, -i> + ya ligature)" 306 307 # Tifinagh bi-consonants 308 309 TIFINAGH_CONSONANT = 0b0011_1000_0000_0011 310 "\\u2D31..=\\u2D65 or \\u2D6F (joined by ZWJ or \\u2D7F TIFINAGH CONSONANT JOINER)" 311 312 ZWJ_TIFINAGH_CONSONANT = 0b0011_1100_0000_0011 313 "ZWJ then \\u2D31..=\\u2D65 or \\u2D6F" 314 315 TIFINAGH_JOINER_CONSONANT = 0b0011_1100_0000_0100 316 "\\u2D7F then \\u2D31..=\\u2D65 or \\u2D6F" 317 318 # Lisu tone letters 319 LISU_TONE_LETTER_MYA_NA_JEU = 0b0011_1100_0000_0101 320 "\\uA4FC or \\uA4FD (https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078)" 321 322 # Old Turkic orkhon ec - orkhon i 323 324 OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1000_0000_0110 325 "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" 326 327 ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110 328 "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" 329 330 # Khmer coeng signs 331 332 KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111 333 "\\u1780..=\\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF" 334 335 def table_width(self) -> CharWidthInTable: 336 "The width of a character as stored in the lookup tables." 337 match self: 338 case WidthState.ZERO: 339 return CharWidthInTable.ZERO 340 case WidthState.NARROW: 341 return CharWidthInTable.ONE 342 case WidthState.WIDE: 343 return CharWidthInTable.TWO 344 case _: 345 return CharWidthInTable.SPECIAL 346 347 def is_carried(self) -> bool: 348 "Whether this corresponds to a non-default `WidthInfo`." 349 return int(self) <= 0xFFFF 350 351 def width_alone(self) -> int: 352 "The width of a character with this type when it appears alone." 353 match self: 354 case ( 355 WidthState.ZERO 356 | WidthState.COMBINING_LONG_SOLIDUS_OVERLAY 357 | WidthState.VARIATION_SELECTOR_15 358 | WidthState.VARIATION_SELECTOR_16 359 ): 360 return 0 361 case ( 362 WidthState.WIDE 363 | WidthState.EMOJI_MODIFIER 364 | WidthState.EMOJI_PRESENTATION 365 ): 366 return 2 367 case WidthState.THREE: 368 return 3 369 case _: 370 return 1 371 372 def is_cjk_only(self) -> bool: 373 return self in [ 374 WidthState.COMBINING_LONG_SOLIDUS_OVERLAY, 375 WidthState.SOLIDUS_OVERLAY_ALEF, 376 ] 377 378 def is_non_cjk_only(self) -> bool: 379 return self == WidthState.VARIATION_SELECTOR_15 380 381 382assert len(set([v.value for v in WidthState])) == len([v.value for v in WidthState]) 383 384 385def load_east_asian_widths() -> list[EastAsianWidth]: 386 """Return a list of effective widths, indexed by codepoint. 387 Widths are determined by fetching and parsing `EastAsianWidth.txt`. 388 389 `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. 390 391 `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. 392 393 `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`.""" 394 395 with fetch_open("EastAsianWidth.txt") as eaw: 396 # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." 397 single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)") 398 # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." 399 multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)") 400 # map between width category code and condensed width 401 width_codes = { 402 **{c: EastAsianWidth.NARROW for c in ["N", "Na", "H"]}, 403 **{c: EastAsianWidth.WIDE for c in ["W", "F"]}, 404 "A": EastAsianWidth.AMBIGUOUS, 405 } 406 407 width_map = [] 408 current = 0 409 for line in eaw.readlines(): 410 raw_data = None # (low, high, width) 411 if match := single.match(line): 412 raw_data = (match.group(1), match.group(1), match.group(2)) 413 elif match := multiple.match(line): 414 raw_data = (match.group(1), match.group(2), match.group(3)) 415 else: 416 continue 417 low = int(raw_data[0], 16) 418 high = int(raw_data[1], 16) 419 width = width_codes[raw_data[2]] 420 421 assert current <= high 422 while current <= high: 423 # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt. 424 # All such codepoints are implicitly given Neural width (resolves to narrow) 425 width_map.append(EastAsianWidth.NARROW if current < low else width) 426 current += 1 427 428 while len(width_map) < NUM_CODEPOINTS: 429 # Catch any leftover codepoints and assign them implicit Neutral/narrow width. 430 width_map.append(EastAsianWidth.NARROW) 431 432 # Ambiguous `Letter`s and `Modifier_Symbol`s are narrow 433 load_property( 434 "extracted/DerivedGeneralCategory.txt", 435 r"(:?Lu|Ll|Lt|Lm|Lo|Sk)", 436 lambda cp: ( 437 operator.setitem(width_map, cp, EastAsianWidth.NARROW) 438 if width_map[cp] == EastAsianWidth.AMBIGUOUS 439 else None 440 ), 441 ) 442 443 # GREEK ANO TELEIA: NFC decomposes to U+00B7 MIDDLE DOT 444 width_map[0x0387] = EastAsianWidth.AMBIGUOUS 445 446 # Canonical equivalence for symbols with stroke 447 with fetch_open("UnicodeData.txt") as udata: 448 single = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-Z]+) 0338;") 449 for line in udata.readlines(): 450 if match := single.match(line): 451 composed = int(match.group(1), 16) 452 decomposed = int(match.group(2), 16) 453 if width_map[decomposed] == EastAsianWidth.AMBIGUOUS: 454 width_map[composed] = EastAsianWidth.AMBIGUOUS 455 456 return width_map 457 458 459def load_zero_widths() -> list[bool]: 460 """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width 461 character. `c` is considered a zero-width character if 462 463 - it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`), 464 - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`), 465 - or if it one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug, 466 - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`). 467 """ 468 469 zw_map = [False] * NUM_CODEPOINTS 470 471 # `Default_Ignorable_Code_Point`s also have 0 width: 472 # https://www.unicode.org/faq/unsup_char.html#3 473 # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095 474 # 475 # `Grapheme_Extend` includes characters with general category `Mn` or `Me`, 476 # as well as a few `Mc` characters that need to be included so that 477 # canonically equivalent sequences have the same width. 478 load_property( 479 "DerivedCoreProperties.txt", 480 r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)", 481 lambda cp: operator.setitem(zw_map, cp, True), 482 ) 483 484 # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`, 485 # as they canonically decompose to two characters with this property, 486 # but they aren't. 487 for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]: 488 zw_map[c] = True 489 490 # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo` 491 # as zero-width. This matches the behavior of glibc `wcwidth`. 492 # 493 # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`, 494 # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine 495 # into a single wide grapheme. So we treat vowel and trailing jamo as 496 # 0-width, such that only the width of the leading jamo is counted 497 # and the resulting grapheme has width 2. 498 # 499 # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul) 500 load_property( 501 "HangulSyllableType.txt", 502 r"(?:V|T)", 503 lambda cp: operator.setitem(zw_map, cp, True), 504 ) 505 506 # Syriac abbreviation mark: 507 # Zero-width `Prepended_Concatenation_Mark` 508 zw_map[0x070F] = True 509 510 # Some Arabic Prepended_Concatenation_Mark`s 511 # https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G27820 512 zw_map[0x0605] = True 513 zw_map[0x0890] = True 514 zw_map[0x0891] = True 515 zw_map[0x08E2] = True 516 517 # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]` 518 gcb_prepend = set() 519 load_property( 520 "auxiliary/GraphemeBreakProperty.txt", 521 "Prepend", 522 lambda cp: gcb_prepend.add(cp), 523 ) 524 load_property( 525 "PropList.txt", 526 "Prepended_Concatenation_Mark", 527 lambda cp: gcb_prepend.remove(cp), 528 ) 529 for cp in gcb_prepend: 530 zw_map[cp] = True 531 532 # HANGUL CHOSEONG FILLER 533 # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have 534 # zero width. However, the expected usage is to combine it with vowel or trailing jamo 535 # (which are considered 0-width on their own) to form a composed Hangul syllable with 536 # width 2. Therefore, we treat it as having width 2. 537 zw_map[0x115F] = False 538 539 # TIFINAGH CONSONANT JOINER 540 # (invisible only when used to join two Tifinagh consonants 541 zw_map[0x2D7F] = False 542 543 # DEVANAGARI CARET 544 # https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447 545 zw_map[0xA8FA] = True 546 547 return zw_map 548 549 550def load_width_maps() -> tuple[list[WidthState], list[WidthState]]: 551 """Load complete width table, including characters needing special handling. 552 (Returns 2 tables, one for East Asian and one for not.)""" 553 554 eaws = load_east_asian_widths() 555 zws = load_zero_widths() 556 557 not_ea = [] 558 ea = [] 559 560 for eaw, zw in zip(eaws, zws): 561 if zw: 562 not_ea.append(WidthState.ZERO) 563 ea.append(WidthState.ZERO) 564 else: 565 if eaw == EastAsianWidth.WIDE: 566 not_ea.append(WidthState.WIDE) 567 else: 568 not_ea.append(WidthState.NARROW) 569 570 if eaw == EastAsianWidth.NARROW: 571 ea.append(WidthState.NARROW) 572 else: 573 ea.append(WidthState.WIDE) 574 575 # Joining_Group=Alef (Arabic Lam-Alef ligature) 576 alef_joining = [] 577 load_property( 578 "extracted/DerivedJoiningGroup.txt", 579 "Alef", 580 lambda cp: alef_joining.append(cp), 581 ) 582 583 # Regional indicators 584 regional_indicators = [] 585 load_property( 586 "PropList.txt", 587 "Regional_Indicator", 588 lambda cp: regional_indicators.append(cp), 589 ) 590 591 # Emoji modifiers 592 emoji_modifiers = [] 593 load_property( 594 "emoji/emoji-data.txt", 595 "Emoji_Modifier", 596 lambda cp: emoji_modifiers.append(cp), 597 ) 598 599 # Default emoji presentation (for ZWJ sequences) 600 emoji_presentation = [] 601 load_property( 602 "emoji/emoji-data.txt", 603 "Emoji_Presentation", 604 lambda cp: emoji_presentation.append(cp), 605 ) 606 607 for cps, width in [ 608 ([0x0A], WidthState.LINE_FEED), 609 ([0x05DC], WidthState.HEBREW_LETTER_LAMED), 610 (alef_joining, WidthState.JOINING_GROUP_ALEF), 611 (range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER), 612 (range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER), 613 (range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER), 614 (range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER), 615 (range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER), 616 (range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER), 617 ( 618 [0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF], 619 WidthState.KHMER_COENG_ELIGIBLE_LETTER, 620 ), 621 ([0x17A4], WidthState.WIDE), 622 ([0x17D8], WidthState.THREE), 623 ([0x1A10], WidthState.BUGINESE_LETTER_YA), 624 (range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT), 625 ([0x2D6F], WidthState.TIFINAGH_CONSONANT), 626 ([0xA4FC], WidthState.LISU_TONE_LETTER_MYA_NA_JEU), 627 ([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU), 628 ([0xFE0F], WidthState.VARIATION_SELECTOR_16), 629 ([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I), 630 (emoji_presentation, WidthState.EMOJI_PRESENTATION), 631 (emoji_modifiers, WidthState.EMOJI_MODIFIER), 632 (regional_indicators, WidthState.REGIONAL_INDICATOR), 633 ]: 634 for cp in cps: 635 not_ea[cp] = width 636 ea[cp] = width 637 638 # East-Asian only 639 ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY 640 641 # Not East Asian only 642 not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15 643 644 return (not_ea, ea) 645 646 647def load_joining_group_lam() -> list[tuple[Codepoint, Codepoint]]: 648 "Returns a list of character ranges with Joining_Group=Lam" 649 lam_joining = [] 650 load_property( 651 "extracted/DerivedJoiningGroup.txt", 652 "Lam", 653 lambda cp: lam_joining.append(cp), 654 ) 655 656 return to_sorted_ranges(lam_joining) 657 658 659def load_non_transparent_zero_widths( 660 width_map: list[WidthState], 661) -> list[tuple[Codepoint, Codepoint]]: 662 "Returns a list of characters with zero width but not 'Joining_Type=Transparent'" 663 664 zero_widths = set() 665 for cp, width in enumerate(width_map): 666 if width.width_alone() == 0: 667 zero_widths.add(cp) 668 transparent = set() 669 load_property( 670 "extracted/DerivedJoiningType.txt", 671 "T", 672 lambda cp: transparent.add(cp), 673 ) 674 675 return to_sorted_ranges(zero_widths - transparent) 676 677 678def load_ligature_transparent() -> list[tuple[Codepoint, Codepoint]]: 679 """Returns a list of character ranges corresponding to all combining marks that are also 680 `Default_Ignorable_Code_Point`s, plus ZWJ. This is the set of characters that won't interrupt 681 a ligature.""" 682 default_ignorables = set() 683 load_property( 684 "DerivedCoreProperties.txt", 685 "Default_Ignorable_Code_Point", 686 lambda cp: default_ignorables.add(cp), 687 ) 688 689 combining_marks = set() 690 load_property( 691 "extracted/DerivedGeneralCategory.txt", 692 "(?:Mc|Mn|Me)", 693 lambda cp: combining_marks.add(cp), 694 ) 695 696 default_ignorable_combinings = default_ignorables.intersection(combining_marks) 697 default_ignorable_combinings.add(0x200D) # ZWJ 698 699 return to_sorted_ranges(default_ignorable_combinings) 700 701 702def load_solidus_transparent( 703 ligature_transparents: list[tuple[Codepoint, Codepoint]], 704 cjk_width_map: list[WidthState], 705) -> list[tuple[Codepoint, Codepoint]]: 706 """Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above. 707 Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also. 708 """ 709 710 ccc_above_1 = set() 711 load_property( 712 "extracted/DerivedCombiningClass.txt", 713 "(?:[2-9]|(?:[1-9][0-9]+))", 714 lambda cp: ccc_above_1.add(cp), 715 ) 716 717 for lo, hi in ligature_transparents: 718 for cp in range(lo, hi + 1): 719 ccc_above_1.add(cp) 720 721 num_chars = len(ccc_above_1) 722 723 # Recursive decompositions 724 while True: 725 with fetch_open("UnicodeData.txt") as udata: 726 single = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-F ]+);") 727 for line in udata.readlines(): 728 if match := single.match(line): 729 composed = int(match.group(1), 16) 730 decomposed = [int(c, 16) for c in match.group(2).split(" ")] 731 if all([c in ccc_above_1 for c in decomposed]): 732 ccc_above_1.add(composed) 733 if len(ccc_above_1) == num_chars: 734 break 735 else: 736 num_chars = len(ccc_above_1) 737 738 for cp in ccc_above_1: 739 if cp != 0xFE0F: 740 assert ( 741 cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL 742 ), f"U+{cp:X}" 743 744 sorted = to_sorted_ranges(ccc_above_1) 745 return list(filter(lambda range: range not in ligature_transparents, sorted)) 746 747 748def load_normalization_tests() -> list[tuple[str, str, str, str, str]]: 749 def parse_codepoints(cps: str) -> str: 750 return "".join(map(lambda cp: chr(int(cp, 16)), cps.split(" "))) 751 752 with fetch_open("NormalizationTest.txt") as normtests: 753 ret = [] 754 single = re.compile( 755 r"^([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);" 756 ) 757 for line in normtests.readlines(): 758 if match := single.match(line): 759 ret.append( 760 ( 761 parse_codepoints(match.group(1)), 762 parse_codepoints(match.group(2)), 763 parse_codepoints(match.group(3)), 764 parse_codepoints(match.group(4)), 765 parse_codepoints(match.group(5)), 766 ) 767 ) 768 return ret 769 770 771def make_special_ranges( 772 width_map: list[WidthState], 773) -> list[tuple[tuple[Codepoint, Codepoint], WidthState]]: 774 "Assign ranges of characters to their special behavior (used in match)" 775 ret = [] 776 can_merge_with_prev = False 777 for cp, width in enumerate(width_map): 778 if width == WidthState.EMOJI_PRESENTATION: 779 can_merge_with_prev = False 780 elif width.table_width() == CharWidthInTable.SPECIAL: 781 if can_merge_with_prev and ret[-1][1] == width: 782 ret[-1] = ((ret[-1][0][0], cp), width) 783 else: 784 ret.append(((cp, cp), width)) 785 can_merge_with_prev = True 786 return ret 787 788 789class Bucket: 790 """A bucket contains a group of codepoints and an ordered width list. If one bucket's width 791 list overlaps with another's width list, those buckets can be merged via `try_extend`. 792 """ 793 794 def __init__(self): 795 """Creates an empty bucket.""" 796 self.entry_set = set() 797 self.widths = [] 798 799 def append(self, codepoint: Codepoint, width: CharWidthInTable): 800 """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" 801 self.entry_set.add((codepoint, width)) 802 self.widths.append(width) 803 804 def try_extend(self, attempt: "Bucket") -> bool: 805 """If either `self` or `attempt`'s width list starts with the other bucket's width list, 806 set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints 807 into `self`, and return `True`. Otherwise, return `False`.""" 808 (less, more) = (self.widths, attempt.widths) 809 if len(self.widths) > len(attempt.widths): 810 (less, more) = (attempt.widths, self.widths) 811 if less != more[: len(less)]: 812 return False 813 self.entry_set |= attempt.entry_set 814 self.widths = more 815 return True 816 817 def entries(self) -> list[tuple[Codepoint, CharWidthInTable]]: 818 """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" 819 result = list(self.entry_set) 820 result.sort() 821 return result 822 823 def width(self) -> CharWidthInTable | None: 824 """If all codepoints in this bucket have the same width, return that width; otherwise, 825 return `None`.""" 826 if len(self.widths) == 0: 827 return None 828 potential_width = self.widths[0] 829 for width in self.widths[1:]: 830 if potential_width != width: 831 return None 832 return potential_width 833 834 835def make_buckets( 836 entries: Iterable[tuple[int, CharWidthInTable]], low_bit: BitPos, cap_bit: BitPos 837) -> list[Bucket]: 838 """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All 839 codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the 840 same bucket. Returns a list of the buckets in increasing order of those bits.""" 841 num_bits = cap_bit - low_bit 842 assert num_bits > 0 843 buckets = [Bucket() for _ in range(0, 2**num_bits)] 844 mask = (1 << num_bits) - 1 845 for codepoint, width in entries: 846 buckets[(codepoint >> low_bit) & mask].append(codepoint, width) 847 return buckets 848 849 850class Table: 851 """Represents a lookup table. Each table contains a certain number of subtables; each 852 subtable is indexed by a contiguous bit range of the codepoint and contains a list 853 of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) 854 855 Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should 856 be indexed by sub-table `i` in the next-level lookup table. The entries of this table are 857 indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The 858 key to compression is that two different buckets in two different sub-tables may have the 859 same width list, which means that they can be merged into the same bucket. 860 861 If no bucket contains two codepoints with different widths, calling `indices_to_widths` will 862 discard the buckets and convert the entries into `EffectiveWidth` values.""" 863 864 def __init__( 865 self, 866 name: str, 867 entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]], 868 secondary_entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]], 869 low_bit: BitPos, 870 cap_bit: BitPos, 871 offset_type: OffsetType, 872 align: int, 873 bytes_per_row: int | None = None, 874 starting_indexed: list[Bucket] = [], 875 cfged: bool = False, 876 ): 877 """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator 878 in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, 879 and each table entry is represented in the format specified by `offset_type`. Asserts 880 that this table is actually representable with `offset_type`.""" 881 starting_indexed_len = len(starting_indexed) 882 self.name = name 883 self.low_bit = low_bit 884 self.cap_bit = cap_bit 885 self.offset_type = offset_type 886 self.entries: list[int] = [] 887 self.indexed: list[Bucket] = list(starting_indexed) 888 self.align = align 889 self.bytes_per_row = bytes_per_row 890 self.cfged = cfged 891 892 buckets: list[Bucket] = [] 893 for entries in entry_groups: 894 buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) 895 896 for bucket in buckets: 897 for i, existing in enumerate(self.indexed): 898 if existing.try_extend(bucket): 899 self.entries.append(i) 900 break 901 else: 902 self.entries.append(len(self.indexed)) 903 self.indexed.append(bucket) 904 905 self.primary_len = len(self.entries) 906 self.primary_bucket_len = len(self.indexed) 907 908 buckets = [] 909 for entries in secondary_entry_groups: 910 buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) 911 912 for bucket in buckets: 913 for i, existing in enumerate(self.indexed): 914 if existing.try_extend(bucket): 915 self.entries.append(i) 916 break 917 else: 918 self.entries.append(len(self.indexed)) 919 self.indexed.append(bucket) 920 921 # Validate offset type 922 max_index = 1 << int(self.offset_type) 923 for index in self.entries: 924 assert index < max_index, f"{index} <= {max_index}" 925 926 self.indexed = self.indexed[starting_indexed_len:] 927 928 def indices_to_widths(self): 929 """Destructively converts the indices in this table to the `EffectiveWidth` values of 930 their buckets. Assumes that no bucket contains codepoints with different widths. 931 """ 932 self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries)) # type: ignore 933 del self.indexed 934 935 def buckets(self): 936 """Returns an iterator over this table's buckets.""" 937 return self.indexed 938 939 def to_bytes(self) -> list[int]: 940 """Returns this table's entries as a list of bytes. The bytes are formatted according to 941 the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries 942 to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will 943 contain four packed 2-bit entries.""" 944 entries_per_byte = 8 // int(self.offset_type) 945 byte_array = [] 946 for i in range(0, len(self.entries), entries_per_byte): 947 byte = 0 948 for j in range(0, entries_per_byte): 949 byte |= self.entries[i + j] << (j * int(self.offset_type)) 950 byte_array.append(byte) 951 return byte_array 952 953 954def make_tables( 955 width_map: list[WidthState], 956 cjk_width_map: list[WidthState], 957) -> list[Table]: 958 """Creates a table for each configuration in `table_cfgs`, with the first config corresponding 959 to the top-level lookup table, the second config corresponding to the second-level lookup 960 table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs 961 to include in the top-level table.""" 962 963 entries = enumerate([w.table_width() for w in width_map]) 964 cjk_entries = enumerate([w.table_width() for w in cjk_width_map]) 965 966 root_table = Table( 967 "WIDTH_ROOT", 968 [entries], 969 [], 970 TABLE_SPLITS[1], 971 MAX_CODEPOINT_BITS, 972 OffsetType.U8, 973 128, 974 ) 975 976 cjk_root_table = Table( 977 "WIDTH_ROOT_CJK", 978 [cjk_entries], 979 [], 980 TABLE_SPLITS[1], 981 MAX_CODEPOINT_BITS, 982 OffsetType.U8, 983 128, 984 starting_indexed=root_table.indexed, 985 cfged=True, 986 ) 987 988 middle_table = Table( 989 "WIDTH_MIDDLE", 990 map(lambda bucket: bucket.entries(), root_table.buckets()), 991 map(lambda bucket: bucket.entries(), cjk_root_table.buckets()), 992 TABLE_SPLITS[0], 993 TABLE_SPLITS[1], 994 OffsetType.U8, 995 2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]), 996 bytes_per_row=2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]), 997 ) 998 999 leaves_table = Table( 1000 "WIDTH_LEAVES", 1001 map( 1002 lambda bucket: bucket.entries(), 1003 middle_table.buckets()[: middle_table.primary_bucket_len], 1004 ), 1005 map( 1006 lambda bucket: bucket.entries(), 1007 middle_table.buckets()[middle_table.primary_bucket_len :], 1008 ), 1009 0, 1010 TABLE_SPLITS[0], 1011 OffsetType.U2, 1012 2 ** (TABLE_SPLITS[0] - 2), 1013 bytes_per_row=2 ** (TABLE_SPLITS[0] - 2), 1014 ) 1015 1016 return [root_table, cjk_root_table, middle_table, leaves_table] 1017 1018 1019def load_emoji_presentation_sequences() -> list[Codepoint]: 1020 """Outputs a list of cpodepoints, corresponding to all the valid characters for starting 1021 an emoji presentation sequence.""" 1022 1023 with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: 1024 # Match all emoji presentation sequences 1025 # (one codepoint followed by U+FE0F, and labeled "emoji style") 1026 sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style") 1027 codepoints = [] 1028 for line in sequences.readlines(): 1029 if match := sequence.match(line): 1030 cp = int(match.group(1), 16) 1031 codepoints.append(cp) 1032 return codepoints 1033 1034 1035def load_text_presentation_sequences() -> list[Codepoint]: 1036 """Outputs a list of codepoints, corresponding to all the valid characters 1037 whose widths change with a text presentation sequence.""" 1038 1039 text_presentation_seq_codepoints = set() 1040 with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: 1041 # Match all text presentation sequences 1042 # (one codepoint followed by U+FE0E, and labeled "text style") 1043 sequence = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style") 1044 for line in sequences.readlines(): 1045 if match := sequence.match(line): 1046 cp = int(match.group(1), 16) 1047 text_presentation_seq_codepoints.add(cp) 1048 1049 default_emoji_codepoints = set() 1050 1051 load_property( 1052 "emoji/emoji-data.txt", 1053 "Emoji_Presentation", 1054 lambda cp: default_emoji_codepoints.add(cp), 1055 ) 1056 1057 codepoints = [] 1058 for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints): 1059 # "Enclosed Ideographic Supplement" block; 1060 # wide even in text presentation 1061 if not cp in range(0x1F200, 0x1F300): 1062 codepoints.append(cp) 1063 1064 codepoints.sort() 1065 return codepoints 1066 1067 1068def load_emoji_modifier_bases() -> list[Codepoint]: 1069 """Outputs a list of codepoints, corresponding to all the valid characters 1070 whose widths change with a text presentation sequence.""" 1071 1072 ret = [] 1073 load_property( 1074 "emoji/emoji-data.txt", 1075 "Emoji_Modifier_Base", 1076 lambda cp: ret.append(cp), 1077 ) 1078 ret.sort() 1079 return ret 1080 1081 1082def make_presentation_sequence_table( 1083 seqs: list[Codepoint], 1084 lsb: int = 10, 1085) -> tuple[list[tuple[int, int]], list[list[int]]]: 1086 """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence. 1087 The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB. 1088 """ 1089 1090 prefixes_dict = defaultdict(set) 1091 for cp in seqs: 1092 prefixes_dict[cp >> lsb].add(cp & (2**lsb - 1)) 1093 1094 msbs: list[int] = list(prefixes_dict.keys()) 1095 1096 leaves: list[list[int]] = [] 1097 for cps in prefixes_dict.values(): 1098 leaf = [0] * (2 ** (lsb - 3)) 1099 for cp in cps: 1100 idx_in_leaf, bit_shift = divmod(cp, 8) 1101 leaf[idx_in_leaf] |= 1 << bit_shift 1102 leaves.append(leaf) 1103 1104 indexes = [(msb, index) for (index, msb) in enumerate(msbs)] 1105 1106 # Cull duplicate leaves 1107 i = 0 1108 while i < len(leaves): 1109 first_idx = leaves.index(leaves[i]) 1110 if first_idx == i: 1111 i += 1 1112 else: 1113 for j in range(0, len(indexes)): 1114 if indexes[j][1] == i: 1115 indexes[j] = (indexes[j][0], first_idx) 1116 elif indexes[j][1] > i: 1117 indexes[j] = (indexes[j][0], indexes[j][1] - 1) 1118 1119 leaves.pop(i) 1120 1121 return (indexes, leaves) 1122 1123 1124def make_ranges_table( 1125 seqs: list[Codepoint], 1126) -> tuple[list[tuple[int, int]], list[list[tuple[int, int]]]]: 1127 """Generates 2-level lookup table for a binary property of a codepoint. 1128 First level is all but the last byte, second level is ranges for last byte 1129 """ 1130 1131 prefixes_dict = defaultdict(list) 1132 for cp in seqs: 1133 prefixes_dict[cp >> 8].append(cp & 0xFF) 1134 1135 msbs: list[int] = list(prefixes_dict.keys()) 1136 1137 leaves: list[list[tuple[int, int]]] = [] 1138 for cps in prefixes_dict.values(): 1139 leaf = [] 1140 for cp in cps: 1141 if len(leaf) > 0 and leaf[-1][1] == cp - 1: 1142 leaf[-1] = (leaf[-1][0], cp) 1143 else: 1144 leaf.append((cp, cp)) 1145 leaves.append(leaf) 1146 1147 indexes = [(msb, index) for (index, msb) in enumerate(msbs)] 1148 1149 # Cull duplicate leaves 1150 i = 0 1151 while i < len(leaves): 1152 first_idx = leaves.index(leaves[i]) 1153 if first_idx == i: 1154 i += 1 1155 else: 1156 for j in range(0, len(indexes)): 1157 if indexes[j][1] == i: 1158 indexes[j] = (indexes[j][0], first_idx) 1159 elif indexes[j][1] > i: 1160 indexes[j] = (indexes[j][0], indexes[j][1] - 1) 1161 1162 leaves.pop(i) 1163 1164 return (indexes, leaves) 1165 1166 1167def lookup_fns( 1168 is_cjk: bool, 1169 special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]], 1170 joining_group_lam: list[tuple[Codepoint, Codepoint]], 1171) -> str: 1172 if is_cjk: 1173 cfg = '#[cfg(feature = "cjk")]\n' 1174 cjk_lo = "_cjk" 1175 cjk_cap = "_CJK" 1176 ambig = "wide" 1177 else: 1178 cfg = "" 1179 cjk_lo = "" 1180 cjk_cap = "" 1181 ambig = "narrow" 1182 s = f""" 1183/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by 1184/// consulting a multi-level lookup table. 1185/// 1186/// # Maintenance 1187/// The tables themselves are autogenerated but this function is hardcoded. You should have 1188/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) 1189/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the 1190/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes. 1191{cfg}#[inline] 1192fn lookup_width{cjk_lo}(c: char) -> (u8, WidthInfo) {{ 1193 let cp = c as usize; 1194 1195 let t1_offset = WIDTH_ROOT{cjk_cap}.0[cp >> {TABLE_SPLITS[1]}]; 1196 1197 // Each sub-table in WIDTH_MIDDLE is 7 bits, and each stored entry is a byte, 1198 // so each sub-table is 128 bytes in size. 1199 // (Sub-tables are selected using the computed offset from the previous table.) 1200 let t2_offset = WIDTH_MIDDLE.0[usize::from(t1_offset)][cp >> {TABLE_SPLITS[0]} & 0x{(2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]) - 1):X}]; 1201 1202 // Each sub-table in WIDTH_LEAVES is 6 bits, but each stored entry is 2 bits. 1203 // This is accomplished by packing four stored entries into one byte. 1204 // So each sub-table is 2**(7-2) == 32 bytes in size. 1205 // Since this is the last table, each entry represents an encoded width. 1206 let packed_widths = WIDTH_LEAVES.0[usize::from(t2_offset)][cp >> 2 & 0x{(2 ** (TABLE_SPLITS[0] - 2) - 1):X}]; 1207 1208 // Extract the packed width 1209 let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; 1210 1211 if width < 3 {{ 1212 (width, WidthInfo::DEFAULT) 1213 }} else {{ 1214 match c {{ 1215""" 1216 1217 for (lo, hi), width in special_ranges: 1218 s += f" '\\u{{{lo:X}}}'" 1219 if hi != lo: 1220 s += f"..='\\u{{{hi:X}}}'" 1221 if width.is_carried(): 1222 width_info = width.name 1223 else: 1224 width_info = "DEFAULT" 1225 s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n" 1226 1227 s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION), 1228 }} 1229 }} 1230}} 1231 1232/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or 1233/// `None` if `c` is a control character. 1234/// Ambiguous width characters are treated as {ambig}. 1235{cfg}#[inline] 1236pub fn single_char_width{cjk_lo}(c: char) -> Option<usize> {{ 1237 if c < '\\u{{7F}}' {{ 1238 if c >= '\\u{{20}}' {{ 1239 // U+0020 to U+007F (exclusive) are single-width ASCII codepoints 1240 Some(1) 1241 }} else {{ 1242 // U+0000 to U+0020 (exclusive) are control codes 1243 None 1244 }} 1245 }} else if c >= '\\u{{A0}}' {{ 1246 // No characters >= U+00A0 are control codes, so we can consult the lookup tables 1247 Some(lookup_width{cjk_lo}(c).0.into()) 1248 }} else {{ 1249 // U+007F to U+00A0 (exclusive) are control codes 1250 None 1251 }} 1252}} 1253 1254/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`. 1255/// Ambiguous width characters are treated as {ambig}. 1256{cfg}#[inline] 1257fn width_in_str{cjk_lo}(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {{ 1258 if next_info.is_emoji_presentation() {{ 1259 if starts_emoji_presentation_seq(c) {{ 1260 let width = if next_info.is_zwj_emoji_presentation() {{ 1261 0 1262 }} else {{ 1263 2 1264 }}; 1265 return (width, WidthInfo::EMOJI_PRESENTATION); 1266 }} else {{ 1267 next_info = next_info.unset_emoji_presentation(); 1268 }} 1269 }}""" 1270 1271 if is_cjk: 1272 s += """ 1273 if (matches!( 1274 next_info, 1275 WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY | WidthInfo::SOLIDUS_OVERLAY_ALEF 1276 ) && matches!(c, '<' | '=' | '>')) 1277 { 1278 return (2, WidthInfo::DEFAULT); 1279 }""" 1280 1281 s += """ 1282 if c <= '\\u{A0}' { 1283 match c { 1284 // According to the spec, LF should be width 1, which is how it is often rendered when it is forced to have a single-line rendering 1285 // However, this makes it harder to use this crate to calculate line breaks, and breaks assumptions of downstream crates. 1286 // https://github.com/unicode-rs/unicode-width/issues/60 1287 '\\n' => (0, WidthInfo::LINE_FEED), 1288 '\\r' if next_info == WidthInfo::LINE_FEED => (0, WidthInfo::DEFAULT), 1289 _ => (1, WidthInfo::DEFAULT), 1290 } 1291 } else { 1292 // Fast path 1293 if next_info != WidthInfo::DEFAULT { 1294 if c == '\\u{FE0F}' { 1295 return (0, next_info.set_emoji_presentation()); 1296 }""" 1297 1298 if not is_cjk: 1299 s += """ 1300 if c == '\\u{FE0E}' { 1301 return (0, next_info.set_text_presentation()); 1302 } 1303 if next_info.is_text_presentation() { 1304 if starts_non_ideographic_text_presentation_seq(c) { 1305 return (1, WidthInfo::DEFAULT); 1306 } else { 1307 next_info = next_info.unset_text_presentation(); 1308 } 1309 }""" 1310 1311 s += """ 1312 if next_info.is_ligature_transparent() { 1313 if c == '\\u{200D}' { 1314 return (0, next_info.set_zwj_bit()); 1315 } else if is_ligature_transparent(c) { 1316 return (0, next_info); 1317 } 1318 } 1319 1320 match (next_info, c) {""" 1321 if is_cjk: 1322 s += """ 1323 (WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY, _) if is_solidus_transparent(c) => { 1324 return ( 1325 lookup_width_cjk(c).0 as i8, 1326 WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY, 1327 ); 1328 } 1329 (WidthInfo::JOINING_GROUP_ALEF, '\\u{0338}') => { 1330 return (0, WidthInfo::SOLIDUS_OVERLAY_ALEF); 1331 } 1332 // Arabic Lam-Alef ligature 1333 ( 1334 WidthInfo::JOINING_GROUP_ALEF | WidthInfo::SOLIDUS_OVERLAY_ALEF, 1335 """ 1336 else: 1337 s += """ 1338 // Arabic Lam-Alef ligature 1339 ( 1340 WidthInfo::JOINING_GROUP_ALEF, 1341 """ 1342 1343 tail = False 1344 for lo, hi in joining_group_lam: 1345 if tail: 1346 s += " | " 1347 tail = True 1348 s += f"'\\u{{{lo:X}}}'" 1349 if hi != lo: 1350 s += f"..='\\u{{{hi:X}}}'" 1351 s += """, 1352 ) => return (0, WidthInfo::DEFAULT), 1353 (WidthInfo::JOINING_GROUP_ALEF, _) if is_transparent_zero_width(c) => { 1354 return (0, WidthInfo::JOINING_GROUP_ALEF); 1355 } 1356 1357 // Hebrew Alef-ZWJ-Lamed ligature 1358 (WidthInfo::ZWJ_HEBREW_LETTER_LAMED, '\\u{05D0}') => { 1359 return (0, WidthInfo::DEFAULT); 1360 } 1361 1362 // Khmer coeng signs 1363 (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => { 1364 return (-1, WidthInfo::DEFAULT); 1365 } 1366 1367 // Buginese <a, -i> ZWJ ya ligature 1368 (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => { 1369 return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA) 1370 } 1371 (WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA, '\\u{1A15}') => { 1372 return (0, WidthInfo::DEFAULT) 1373 } 1374 1375 // Tifinagh bi-consonants 1376 (WidthInfo::TIFINAGH_CONSONANT | WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D7F}') => { 1377 return (1, WidthInfo::TIFINAGH_JOINER_CONSONANT); 1378 } 1379 (WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => { 1380 return (0, WidthInfo::DEFAULT); 1381 } 1382 (WidthInfo::TIFINAGH_JOINER_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => { 1383 return (-1, WidthInfo::DEFAULT); 1384 } 1385 1386 // Lisu tone letter combinations 1387 (WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU, '\\u{A4F8}'..='\\u{A4FB}') => { 1388 return (0, WidthInfo::DEFAULT); 1389 } 1390 1391 // Old Turkic ligature 1392 (WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, '\\u{10C32}') => { 1393 return (0, WidthInfo::DEFAULT); 1394 }""" 1395 1396 s += f""" 1397 // Emoji modifier 1398 (WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => {{ 1399 return (0, WidthInfo::EMOJI_PRESENTATION); 1400 }} 1401 1402 // Regional indicator 1403 ( 1404 WidthInfo::REGIONAL_INDICATOR | WidthInfo::SEVERAL_REGIONAL_INDICATOR, 1405 '\\u{{1F1E6}}'..='\\u{{1F1FF}}', 1406 ) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR), 1407 1408 // ZWJ emoji 1409 ( 1410 WidthInfo::EMOJI_PRESENTATION 1411 | WidthInfo::SEVERAL_REGIONAL_INDICATOR 1412 | WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION 1413 | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION 1414 | WidthInfo::EMOJI_MODIFIER, 1415 '\\u{{200D}}', 1416 ) => return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION), 1417 (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{20E3}}') => {{ 1418 return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION); 1419 }} 1420 (WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => {{ 1421 return (0, WidthInfo::EMOJI_PRESENTATION) 1422 }} 1423 (WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => {{ 1424 return (0, WidthInfo::EMOJI_PRESENTATION) 1425 }} 1426 (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F1E6}}'..='\\u{{1F1FF}}') => {{ 1427 return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION); 1428 }} 1429 ( 1430 WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION 1431 | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION, 1432 '\\u{{1F1E6}}'..='\\u{{1F1FF}}', 1433 ) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION), 1434 ( 1435 WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION, 1436 '\\u{{1F1E6}}'..='\\u{{1F1FF}}', 1437 ) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION), 1438 (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F3FB}}'..='\\u{{1F3FF}}') => {{ 1439 return (0, WidthInfo::EMOJI_MODIFIER); 1440 }} 1441 (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{E007F}}') => {{ 1442 return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION); 1443 }} 1444 (WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ 1445 return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION); 1446 }} 1447 (WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ 1448 return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION) 1449 }} 1450 (WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ 1451 return (0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION) 1452 }} 1453 (WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ 1454 return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION) 1455 }} 1456 (WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ 1457 return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION) 1458 }} 1459 (WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ 1460 return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION) 1461 }} 1462 ( 1463 WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION 1464 | WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION 1465 | WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION 1466 | WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION 1467 | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, 1468 '\\u{{E0030}}'..='\\u{{E0039}}', 1469 ) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION), 1470 (WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{ 1471 return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION); 1472 }} 1473 (WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{ 1474 return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION); 1475 }} 1476 ( 1477 WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION 1478 | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION 1479 | WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION 1480 | WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION 1481 | WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION, 1482 '\\u{{1F3F4}}', 1483 ) => return (0, WidthInfo::EMOJI_PRESENTATION), 1484 (WidthInfo::ZWJ_EMOJI_PRESENTATION, _) 1485 if lookup_width{cjk_lo}(c).1 == WidthInfo::EMOJI_PRESENTATION => 1486 {{ 1487 return (0, WidthInfo::EMOJI_PRESENTATION) 1488 }} 1489 1490 // Fallback 1491 _ => {{}} 1492 }} 1493 }} 1494 1495 let ret = lookup_width{cjk_lo}(c); 1496 (ret.0 as i8, ret.1) 1497 }} 1498}} 1499 1500{cfg}#[inline] 1501pub fn str_width{cjk_lo}(s: &str) -> usize {{ 1502 s.chars() 1503 .rfold( 1504 (0, WidthInfo::DEFAULT), 1505 |(sum, next_info), c| -> (usize, WidthInfo) {{ 1506 let (add, info) = width_in_str{cjk_lo}(c, next_info); 1507 (sum.wrapping_add_signed(isize::from(add)), info) 1508 }}, 1509 ) 1510 .0 1511}} 1512""" 1513 1514 return s 1515 1516 1517def emit_module( 1518 out_name: str, 1519 unicode_version: tuple[int, int, int], 1520 tables: list[Table], 1521 special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]], 1522 special_ranges_cjk: list[tuple[tuple[Codepoint, Codepoint], WidthState]], 1523 emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]], 1524 text_presentation_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]], 1525 emoji_modifier_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]], 1526 joining_group_lam: list[tuple[Codepoint, Codepoint]], 1527 non_transparent_zero_widths: list[tuple[Codepoint, Codepoint]], 1528 ligature_transparent: list[tuple[Codepoint, Codepoint]], 1529 solidus_transparent: list[tuple[Codepoint, Codepoint]], 1530 normalization_tests: list[tuple[str, str, str, str, str]], 1531): 1532 """Outputs a Rust module to `out_name` using table data from `tables`. 1533 If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. 1534 """ 1535 if os.path.exists(out_name): 1536 os.remove(out_name) 1537 with open(out_name, "w", newline="\n", encoding="utf-8") as module: 1538 module.write( 1539 """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT 1540// file at the top-level directory of this distribution and at 1541// http://rust-lang.org/COPYRIGHT. 1542// 1543// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 1544// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 1545// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 1546// option. This file may not be copied, modified, or distributed 1547// except according to those terms. 1548 1549// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly 1550 1551use core::cmp::Ordering; 1552 1553#[derive(Clone, Copy, Debug, PartialEq, Eq)] 1554struct WidthInfo(u16); 1555 1556impl WidthInfo { 1557 /// No special handling necessary 1558 const DEFAULT: Self = Self(0); 1559""" 1560 ) 1561 1562 for variant in WidthState: 1563 if variant.is_carried(): 1564 if variant.is_cjk_only(): 1565 module.write(' #[cfg(feature = "cjk")]\n') 1566 module.write( 1567 f" const {variant.name}: Self = Self(0b{variant.value:016b});\n" 1568 ) 1569 1570 module.write( 1571 f""" 1572 /// Whether this width mode is ligature_transparent 1573 /// (has 5th MSB set.) 1574 fn is_ligature_transparent(self) -> bool {{ 1575 (self.0 & 0b0000_1000_0000_0000) == 0b0000_1000_0000_0000 1576 }} 1577 1578 /// Sets 6th MSB. 1579 fn set_zwj_bit(self) -> Self {{ 1580 Self(self.0 | 0b0000_0100_0000_0000) 1581 }} 1582 1583 /// Has top bit set 1584 fn is_emoji_presentation(self) -> bool {{ 1585 (self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000 1586 }} 1587 1588 /// Has top bit set 1589 fn is_zwj_emoji_presentation(self) -> bool {{ 1590 (self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000 1591 }} 1592 1593 /// Set top bit 1594 fn set_emoji_presentation(self) -> Self {{ 1595 if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 1596 || (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000 1597 {{ 1598 Self(self.0 | 0b1000_0000_0000_0000) 1599 }} else {{ 1600 Self::VARIATION_SELECTOR_16 1601 }} 1602 }} 1603 1604 /// Clear top bit 1605 fn unset_emoji_presentation(self) -> Self {{ 1606 if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{ 1607 Self(self.0 & 0b0111_1111_1111_1111) 1608 }} else {{ 1609 Self::DEFAULT 1610 }} 1611 }} 1612 1613 /// Has 2nd bit set 1614 fn is_text_presentation(self) -> bool {{ 1615 (self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000 1616 }} 1617 1618 /// Set 2nd bit 1619 fn set_text_presentation(self) -> Self {{ 1620 if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{ 1621 Self(self.0 | 0b0100_0000_0000_0000) 1622 }} else {{ 1623 Self(0b0100_0000_0000_0000) 1624 }} 1625 }} 1626 1627 /// Clear 2nd bit 1628 fn unset_text_presentation(self) -> Self {{ 1629 Self(self.0 & 0b1011_1111_1111_1111) 1630 }} 1631}} 1632 1633/// The version of [Unicode](http://www.unicode.org/) 1634/// that this version of unicode-width is based on. 1635pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version}; 1636""" 1637 ) 1638 1639 module.write(lookup_fns(False, special_ranges, joining_group_lam)) 1640 module.write(lookup_fns(True, special_ranges_cjk, joining_group_lam)) 1641 1642 emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table 1643 text_presentation_idx, text_presentation_leaves = text_presentation_table 1644 emoji_modifier_idx, emoji_modifier_leaves = emoji_modifier_table 1645 1646 module.write( 1647 """ 1648/// Whether this character is a zero-width character with 1649/// `Joining_Type=Transparent`. Used by the Alef-Lamed ligatures. 1650/// See also [`is_ligature_transparent`], a near-subset of this (only ZWJ is excepted) 1651/// which is transparent for non-Arabic ligatures. 1652fn is_transparent_zero_width(c: char) -> bool { 1653 if lookup_width(c).0 != 0 { 1654 // Not zero-width 1655 false 1656 } else { 1657 let cp: u32 = c.into(); 1658 NON_TRANSPARENT_ZERO_WIDTHS 1659 .binary_search_by(|&(lo, hi)| { 1660 let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]); 1661 let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]); 1662 if cp < lo { 1663 Ordering::Greater 1664 } else if cp > hi { 1665 Ordering::Less 1666 } else { 1667 Ordering::Equal 1668 } 1669 }) 1670 .is_err() 1671 } 1672} 1673 1674/// Whether this character is a default-ignorable combining mark 1675/// or ZWJ. These characters won't interrupt non-Arabic ligatures. 1676fn is_ligature_transparent(c: char) -> bool { 1677 matches!(c, """ 1678 ) 1679 1680 tail = False 1681 for lo, hi in ligature_transparent: 1682 if tail: 1683 module.write(" | ") 1684 tail = True 1685 module.write(f"'\\u{{{lo:X}}}'") 1686 if hi != lo: 1687 module.write(f"..='\\u{{{hi:X}}}'") 1688 1689 module.write( 1690 """) 1691} 1692 1693/// Whether this character is transparent wrt the effect of 1694/// U+0338 COMBINING LONG SOLIDUS OVERLAY 1695/// on its base character. 1696#[cfg(feature = "cjk")] 1697fn is_solidus_transparent(c: char) -> bool { 1698 let cp: u32 = c.into(); 1699 is_ligature_transparent(c) 1700 || SOLIDUS_TRANSPARENT 1701 .binary_search_by(|&(lo, hi)| { 1702 let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]); 1703 let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]); 1704 if cp < lo { 1705 Ordering::Greater 1706 } else if cp > hi { 1707 Ordering::Less 1708 } else { 1709 Ordering::Equal 1710 } 1711 }) 1712 .is_ok() 1713} 1714 1715/// Whether this character forms an [emoji presentation sequence] 1716/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) 1717/// when followed by `'\\u{FEOF}'`. 1718/// Emoji presentation sequences are considered to have width 2. 1719#[inline] 1720pub fn starts_emoji_presentation_seq(c: char) -> bool { 1721 let cp: u32 = c.into(); 1722 // First level of lookup uses all but 10 LSB 1723 let top_bits = cp >> 10; 1724 let idx_of_leaf: usize = match top_bits { 1725""" 1726 ) 1727 1728 for msbs, i in emoji_presentation_idx: 1729 module.write(f" 0x{msbs:X} => {i},\n") 1730 1731 module.write( 1732 """ _ => return false, 1733 }; 1734 // Extract the 3-9th (0-indexed) least significant bits of `cp`, 1735 // and use them to index into `leaf_row`. 1736 let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); 1737 let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; 1738 // Use the 3 LSB of `cp` to index into `leaf_byte`. 1739 ((leaf_byte >> (cp & 7)) & 1) == 1 1740} 1741 1742/// Returns `true` if `c` has default emoji presentation, but forms a [text presentation sequence] 1743/// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence) 1744/// when followed by `'\\u{FEOE}'`, and is not ideographic. 1745/// Such sequences are considered to have width 1. 1746#[inline] 1747pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool { 1748 let cp: u32 = c.into(); 1749 // First level of lookup uses all but 8 LSB 1750 let top_bits = cp >> 8; 1751 let leaf: &[(u8, u8)] = match top_bits { 1752""" 1753 ) 1754 1755 for msbs, i in text_presentation_idx: 1756 module.write(f" 0x{msbs:X} => &TEXT_PRESENTATION_LEAF_{i},\n") 1757 1758 module.write( 1759 """ _ => return false, 1760 }; 1761 1762 let bottom_bits = (cp & 0xFF) as u8; 1763 leaf.binary_search_by(|&(lo, hi)| { 1764 if bottom_bits < lo { 1765 Ordering::Greater 1766 } else if bottom_bits > hi { 1767 Ordering::Less 1768 } else { 1769 Ordering::Equal 1770 } 1771 }) 1772 .is_ok() 1773} 1774 1775/// Returns `true` if `c` is an `Emoji_Modifier_Base`. 1776#[inline] 1777pub fn is_emoji_modifier_base(c: char) -> bool { 1778 let cp: u32 = c.into(); 1779 // First level of lookup uses all but 8 LSB 1780 let top_bits = cp >> 8; 1781 let leaf: &[(u8, u8)] = match top_bits { 1782""" 1783 ) 1784 1785 for msbs, i in emoji_modifier_idx: 1786 module.write(f" 0x{msbs:X} => &EMOJI_MODIFIER_LEAF_{i},\n") 1787 1788 module.write( 1789 """ _ => return false, 1790 }; 1791 1792 let bottom_bits = (cp & 0xFF) as u8; 1793 leaf.binary_search_by(|&(lo, hi)| { 1794 if bottom_bits < lo { 1795 Ordering::Greater 1796 } else if bottom_bits > hi { 1797 Ordering::Less 1798 } else { 1799 Ordering::Equal 1800 } 1801 }) 1802 .is_ok() 1803} 1804 1805#[repr(align(32))] 1806struct Align32<T>(T); 1807 1808#[repr(align(64))] 1809struct Align64<T>(T); 1810 1811#[repr(align(128))] 1812struct Align128<T>(T); 1813""" 1814 ) 1815 1816 subtable_count = 1 1817 for i, table in enumerate(tables): 1818 new_subtable_count = len(table.buckets()) 1819 if i == len(tables) - 1: 1820 table.indices_to_widths() # for the last table, indices == widths 1821 byte_array = table.to_bytes() 1822 1823 if table.bytes_per_row is None: 1824 module.write( 1825 f"/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.)\n" 1826 ) 1827 if table.cfged: 1828 module.write('#[cfg(feature = "cjk")]\n') 1829 module.write( 1830 f"static {table.name}: Align{table.align}<[u8; {len(byte_array)}]> = Align{table.align}([" 1831 ) 1832 for j, byte in enumerate(byte_array): 1833 # Add line breaks for every 15th entry (chosen to match what rustfmt does) 1834 if j % 16 == 0: 1835 module.write("\n ") 1836 module.write(f" 0x{byte:02X},") 1837 module.write("\n") 1838 else: 1839 num_rows = len(byte_array) // table.bytes_per_row 1840 num_primary_rows = ( 1841 table.primary_len 1842 // (8 // int(table.offset_type)) 1843 // table.bytes_per_row 1844 ) 1845 module.write( 1846 f""" 1847#[cfg(feature = "cjk")] 1848const {table.name}_LEN: usize = {num_rows}; 1849#[cfg(not(feature = "cjk"))] 1850const {table.name}_LEN: usize = {num_primary_rows}; 1851/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info. 1852static {table.name}: Align{table.align}<[[u8; {table.bytes_per_row}]; {table.name}_LEN]> = Align{table.align}([\n""" 1853 ) 1854 for row_num in range(0, num_rows): 1855 if row_num >= num_primary_rows: 1856 module.write(' #[cfg(feature = "cjk")]\n') 1857 module.write(" [\n") 1858 row = byte_array[ 1859 row_num 1860 * table.bytes_per_row : (row_num + 1) 1861 * table.bytes_per_row 1862 ] 1863 for subrow in batched(row, 15): 1864 module.write(" ") 1865 for entry in subrow: 1866 module.write(f" 0x{entry:02X},") 1867 module.write("\n") 1868 module.write(" ],\n") 1869 module.write("]);\n") 1870 subtable_count = new_subtable_count 1871 1872 # non transparent zero width table 1873 1874 module.write( 1875 f""" 1876/// Sorted list of codepoint ranges (inclusive) 1877/// that are zero-width but not `Joining_Type=Transparent` 1878/// FIXME: can we get better compression? 1879static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); {len(non_transparent_zero_widths)}] = [ 1880""" 1881 ) 1882 1883 for lo, hi in non_transparent_zero_widths: 1884 module.write( 1885 f" ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n" 1886 ) 1887 1888 # solidus transparent table 1889 1890 module.write( 1891 f"""]; 1892 1893/// Sorted list of codepoint ranges (inclusive) 1894/// that don't affect how the combining solidus applies 1895/// (mostly ccc > 1). 1896/// FIXME: can we get better compression? 1897#[cfg(feature = "cjk")] 1898static SOLIDUS_TRANSPARENT: [([u8; 3], [u8; 3]); {len(solidus_transparent)}] = [ 1899""" 1900 ) 1901 1902 for lo, hi in solidus_transparent: 1903 module.write( 1904 f" ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n" 1905 ) 1906 1907 # emoji table 1908 1909 module.write( 1910 f"""]; 1911 1912/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint 1913/// to get whether it can start an emoji presentation sequence. 1914static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([ 1915""" 1916 ) 1917 for leaf in emoji_presentation_leaves: 1918 module.write(" [\n") 1919 for row in batched(leaf, 15): 1920 module.write(" ") 1921 for entry in row: 1922 module.write(f" 0x{entry:02X},") 1923 module.write("\n") 1924 module.write(" ],\n") 1925 1926 module.write("]);\n") 1927 1928 # text table 1929 1930 for leaf_idx, leaf in enumerate(text_presentation_leaves): 1931 module.write( 1932 f""" 1933#[rustfmt::skip] 1934static TEXT_PRESENTATION_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [ 1935""" 1936 ) 1937 for lo, hi in leaf: 1938 module.write(f" (0x{lo:02X}, 0x{hi:02X}),\n") 1939 module.write(f"];\n") 1940 1941 # emoji modifier table 1942 1943 for leaf_idx, leaf in enumerate(emoji_modifier_leaves): 1944 module.write( 1945 f""" 1946#[rustfmt::skip] 1947static EMOJI_MODIFIER_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [ 1948""" 1949 ) 1950 for lo, hi in leaf: 1951 module.write(f" (0x{lo:02X}, 0x{hi:02X}),\n") 1952 module.write(f"];\n") 1953 1954 test_width_variants = [] 1955 test_width_variants_cjk = [] 1956 for variant in WidthState: 1957 if variant.is_carried(): 1958 if not variant.is_cjk_only(): 1959 test_width_variants.append(variant) 1960 if not variant.is_non_cjk_only(): 1961 test_width_variants_cjk.append(variant) 1962 1963 module.write( 1964 f""" 1965#[cfg(test)] 1966mod tests {{ 1967 use super::*; 1968 1969 fn str_width_test(s: &str, init: WidthInfo) -> isize {{ 1970 s.chars() 1971 .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{ 1972 let (add, info) = width_in_str(c, next_info); 1973 (sum.checked_add(isize::from(add)).unwrap(), info) 1974 }}) 1975 .0 1976 }} 1977 1978 #[cfg(feature = "cjk")] 1979 fn str_width_test_cjk(s: &str, init: WidthInfo) -> isize {{ 1980 s.chars() 1981 .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{ 1982 let (add, info) = width_in_str_cjk(c, next_info); 1983 (sum.checked_add(isize::from(add)).unwrap(), info) 1984 }}) 1985 .0 1986 }} 1987 1988 #[test] 1989 fn test_normalization() {{ 1990 for &(orig, nfc, nfd, nfkc, nfkd) in &NORMALIZATION_TEST {{ 1991 for init in NORMALIZATION_TEST_WIDTHS {{ 1992 assert_eq!( 1993 str_width_test(orig, init), 1994 str_width_test(nfc, init), 1995 "width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}", 1996 ); 1997 assert_eq!( 1998 str_width_test(orig, init), 1999 str_width_test(nfd, init), 2000 "width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}", 2001 ); 2002 assert_eq!( 2003 str_width_test(nfkc, init), 2004 str_width_test(nfkd, init), 2005 "width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:X?}}", 2006 ); 2007 }} 2008 2009 #[cfg(feature = "cjk")] 2010 for init in NORMALIZATION_TEST_WIDTHS_CJK {{ 2011 assert_eq!( 2012 str_width_test_cjk(orig, init), 2013 str_width_test_cjk(nfc, init), 2014 "CJK width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}", 2015 ); 2016 assert_eq!( 2017 str_width_test_cjk(orig, init), 2018 str_width_test_cjk(nfd, init), 2019 "CJK width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}", 2020 ); 2021 assert_eq!( 2022 str_width_test_cjk(nfkc, init), 2023 str_width_test_cjk(nfkd, init), 2024 "CJK width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:?}}", 2025 ); 2026 }} 2027 }} 2028 }} 2029 2030 static NORMALIZATION_TEST_WIDTHS: [WidthInfo; {len(test_width_variants) + 1}] = [ 2031 WidthInfo::DEFAULT,\n""" 2032 ) 2033 2034 for variant in WidthState: 2035 if variant.is_carried() and not variant.is_cjk_only(): 2036 module.write(f" WidthInfo::{variant.name},\n") 2037 2038 module.write( 2039 f""" ]; 2040 2041 #[cfg(feature = "cjk")] 2042 static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; {len(test_width_variants_cjk) + 1}] = [ 2043 WidthInfo::DEFAULT,\n""" 2044 ) 2045 2046 for variant in WidthState: 2047 if variant.is_carried() and not variant.is_non_cjk_only(): 2048 module.write(f" WidthInfo::{variant.name},\n") 2049 2050 module.write( 2051 f""" ]; 2052 2053 #[rustfmt::skip] 2054 static NORMALIZATION_TEST: [(&str, &str, &str, &str, &str); {len(normalization_tests)}] = [\n""" 2055 ) 2056 for orig, nfc, nfd, nfkc, nfkd in normalization_tests: 2057 module.write( 2058 f' (r#"{orig}"#, r#"{nfc}"#, r#"{nfd}"#, r#"{nfkc}"#, r#"{nfkd}"#),\n' 2059 ) 2060 2061 module.write(" ];\n}\n") 2062 2063 2064def main(module_path: str): 2065 """Obtain character data from the latest version of Unicode, transform it into a multi-level 2066 lookup table for character width, and write a Rust module utilizing that table to 2067 `module_filename`. 2068 2069 See `lib.rs` for documentation of the exact width rules. 2070 """ 2071 version = load_unicode_version() 2072 print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}") 2073 2074 (width_map, cjk_width_map) = load_width_maps() 2075 2076 tables = make_tables(width_map, cjk_width_map) 2077 2078 special_ranges = make_special_ranges(width_map) 2079 cjk_special_ranges = make_special_ranges(cjk_width_map) 2080 2081 emoji_presentations = load_emoji_presentation_sequences() 2082 emoji_presentation_table = make_presentation_sequence_table(emoji_presentations) 2083 2084 text_presentations = load_text_presentation_sequences() 2085 text_presentation_table = make_ranges_table(text_presentations) 2086 2087 emoji_modifier_bases = load_emoji_modifier_bases() 2088 emoji_modifier_table = make_ranges_table(emoji_modifier_bases) 2089 2090 joining_group_lam = load_joining_group_lam() 2091 non_transparent_zero_widths = load_non_transparent_zero_widths(width_map) 2092 ligature_transparent = load_ligature_transparent() 2093 solidus_transparent = load_solidus_transparent(ligature_transparent, cjk_width_map) 2094 2095 normalization_tests = load_normalization_tests() 2096 2097 fetch_open("emoji-test.txt", "../tests", emoji=True) 2098 2099 print("------------------------") 2100 total_size = 0 2101 for i, table in enumerate(tables): 2102 size_bytes = len(table.to_bytes()) 2103 print(f"Table {i} size: {size_bytes} bytes") 2104 total_size += size_bytes 2105 2106 for s, table in [ 2107 ("Emoji presentation", emoji_presentation_table), 2108 ]: 2109 index_size = len(table[0]) * (math.ceil(math.log(table[0][-1][0], 256)) + 8) 2110 print(f"{s} index size: {index_size} bytes") 2111 total_size += index_size 2112 leaves_size = len(table[1]) * len(table[1][0]) 2113 print(f"{s} leaves size: {leaves_size} bytes") 2114 total_size += leaves_size 2115 2116 for s, table in [ 2117 ("Text presentation", text_presentation_table), 2118 ("Emoji modifier", emoji_modifier_table), 2119 ]: 2120 index_size = len(table[0]) * (math.ceil(math.log(table[0][-1][0], 256)) + 16) 2121 print(f"{s} index size: {index_size} bytes") 2122 total_size += index_size 2123 leaves_size = 2 * sum(map(len, table[1])) 2124 print(f"{s} leaves size: {leaves_size} bytes") 2125 total_size += leaves_size 2126 2127 for s, table in [ 2128 ("Non transparent zero width", non_transparent_zero_widths), 2129 ("Solidus transparent", solidus_transparent), 2130 ]: 2131 table_size = 6 * len(table) 2132 print(f"{s} table size: {table_size} bytes") 2133 total_size += table_size 2134 print("------------------------") 2135 print(f" Total size: {total_size} bytes") 2136 2137 emit_module( 2138 out_name=module_path, 2139 unicode_version=version, 2140 tables=tables, 2141 special_ranges=special_ranges, 2142 special_ranges_cjk=cjk_special_ranges, 2143 emoji_presentation_table=emoji_presentation_table, 2144 text_presentation_table=text_presentation_table, 2145 emoji_modifier_table=emoji_modifier_table, 2146 joining_group_lam=joining_group_lam, 2147 non_transparent_zero_widths=non_transparent_zero_widths, 2148 ligature_transparent=ligature_transparent, 2149 solidus_transparent=solidus_transparent, 2150 normalization_tests=normalization_tests, 2151 ) 2152 print(f'Wrote to "{module_path}"') 2153 2154 2155if __name__ == "__main__": 2156 main(MODULE_PATH) 2157