# Copyright 2013-2014 The rust-url developers.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt

import collections
import itertools
import sys

# Banner written at the top of the generated Rust file.
HEADER = '''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_idna_table.py
'''

# Input file used when no path is given on the command line.
DEFAULT_TABLE_PATH = "IdnaMappingTable.txt"

# High bit set on a TABLE index when its block holds exactly one range.
SINGLE_MARKER = 1 << 15

# Mapping types that carry a replacement string and so are never merged.
DATA_MAPPINGS = ('Mapped', 'Deviation', 'DisallowedStd3Mapped')
# Mapping types without associated data; adjacent runs of these are merged.
PLAIN_MAPPINGS = ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')

# Interned replacement strings: string -> (byte offset, UTF-8 byte length),
# kept in insertion order so offsets match the emitted STRING_TABLE.
strtab = collections.OrderedDict()
strtab_offset = 0


def escape_char(c):
    """Return the Rust ``\\u{...}`` escape for the first character of *c*."""
    return "\\u{%x}" % ord(c[0])


def char(s):
    """Return the character whose code point is the hexadecimal string *s*."""
    return chr(int(s, 16))


def strtab_slice(s):
    """Intern *s* in the string table and return its ``(offset, length)``.

    Both values are measured in UTF-8 bytes. A string seen before gets the
    slice that was assigned the first time.
    """
    global strtab_offset

    cached = strtab.get(s)
    if cached is not None:
        return cached

    utf8_len = len(s.encode('utf8'))
    entry = (strtab_offset, utf8_len)
    strtab[s] = entry
    strtab_offset += utf8_len
    return entry


def rust_slice(s):
    """Format an ``(offset, length)`` pair as a Rust ``StringTableSlice``.

    The offset is split into low and high bytes; the high byte and the
    length must each fit in one byte, otherwise an AssertionError is raised.
    """
    start, length = s[0], s[1]
    start_lo = start & 0xff
    start_hi = start >> 8
    assert length <= 255
    assert start_hi <= 255
    return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)


def parse_ranges(lines):
    """Parse IdnaMappingTable lines into ``(first, last, mapping, string)``.

    ``first`` and ``last`` are hexadecimal code point strings, ``mapping`` is
    the status converted to CamelCase, and ``string`` is the replacement text
    or ``None`` when the line has none.
    """
    ranges = []
    for line in lines:
        # Remove comments, then skip lines with nothing left.
        line, _, _ = line.partition('#')
        if not line.strip():
            continue

        fields = line.split(';')
        codepoints = fields[0].strip()
        if codepoints == 'D800..DFFF':
            continue  # Surrogates don't occur in Rust strings.

        first, _, last = codepoints.partition('..')
        if not last:
            last = first

        # e.g. "disallowed_STD3_valid" -> "DisallowedStd3Valid"
        mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')

        unicode_str = None
        if len(fields) > 2:
            target = fields[2].strip()
            if target:
                unicode_str = ''.join(char(c) for c in target.split(' '))
            elif mapping == "Deviation":
                # A deviation with an empty target maps to the empty string.
                unicode_str = ''

        if len(fields) > 3:
            assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
            assert mapping == 'Valid', mapping
            mapping = 'DisallowedIdna2008'

        ranges.append((first, last, mapping, unicode_str))
    return ranges


def mergeable_key(r):
    """Return the ``groupby`` key deciding which adjacent ranges may merge.

    Ranges whose mapping has associated data use the whole tuple as key, so
    they stay separate. All other ranges are keyed by mapping name alone.
    """
    mapping = r[2]

    # These types have associated data, so we should not merge them.
    if mapping in DATA_MAPPINGS:
        return r
    assert mapping in PLAIN_MAPPINGS
    return mapping


def merge_adjacent_ranges(ranges):
    """Collapse consecutive ranges that share a data-free mapping.

    Returns a new list in which each run of mergeable ranges is replaced by
    one range spanning from the run's first to its last code point.
    """
    merged = []
    for _, members in itertools.groupby(ranges, key=mergeable_key):
        group = list(members)
        if len(group) == 1:
            merged.append(group[0])
            continue

        # Nothing in the group may have an associated unicode string: only
        # the first member's data survives the merge.
        for member in group:
            assert member[3] is None, member

        # Consecutive members must not leave gaps in the codepoint space.
        for earlier, later in zip(group, group[1:]):
            last_char = int(earlier[1], 16)
            next_char = int(later[0], 16)
            if last_char + 1 == next_char:
                continue
            # There's a gap where surrogates would appear, but we don't have
            # to worry about that gap, as surrogates never appear in Rust
            # strings. Assert we're seeing the surrogate case here.
            assert last_char == 0xd7ff
            assert next_char == 0xe000

        merged.append((group[0][0], group[-1][1]) + group[0][2:])
    return merged


def is_single_char_range(r):
    """Return True when range *r* covers exactly one code point."""
    (first, last, _, _) = r
    return first == last


# We can reduce the size of the character range table and the index table to
# about 1/4 by merging runs of single character ranges and using character
# offsets from the start of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    """Yield lists of ranges, grouping runs of single-character ranges.

    A multi-character range always forms a block of its own; consecutive
    single-character ranges share one block. Nothing is yielded for empty
    input.
    """
    current = []
    for r in ranges:
        extends_run = (
            current
            and is_single_char_range(current[-1])
            and is_single_char_range(r)
        )
        if current and not extends_run:
            yield current
            current = []
        current.append(r)
    if current:
        yield current


def escape_str(s):
    """Return the Rust escape of every character of *s*, as a list."""
    return [escape_char(c) for c in s]


def main(argv=None):
    """Read the IDNA mapping table and print the generated Rust tables.

    *argv* defaults to ``sys.argv``. ``argv[1]``, when present, is the path
    of the mapping table; otherwise ``IdnaMappingTable.txt`` in the current
    directory is read.
    """
    if argv is None:
        argv = sys.argv
    path = argv[1] if len(argv) > 1 else DEFAULT_TABLE_PATH

    print(HEADER)

    with open(path, encoding="utf-8") as txt:
        ranges = parse_ranges(txt)

    blocks = list(merge_single_char_ranges(merge_adjacent_ranges(ranges)))

    print("static TABLE: &[(char, u16)] = &[")

    # `offset` is the position in MAPPING_TABLE of the block's first entry.
    offset = 0
    for block in blocks:
        assert offset < SINGLE_MARKER

        block_len = len(block)
        single = SINGLE_MARKER if block_len == 1 else 0
        index = offset | single
        offset += block_len

        start = escape_char(char(block[0][0]))
        print(" ('%s', %s)," % (start, index))

    print("];\n")

    print("static MAPPING_TABLE: &[Mapping] = &[")

    for block in blocks:
        for (_first, _last, mapping, unicode_str) in block:
            if unicode_str is not None:
                mapping += rust_slice(strtab_slice(unicode_str))
            print(" %s," % mapping)

    print("];\n")

    # Emitted last: the string table is only complete once every mapping
    # above has interned its replacement string.
    print("static STRING_TABLE: &str = \"%s\";"
          % '\\\n '.join(itertools.chain.from_iterable(escape_str(s) for s in strtab)))


if __name__ == "__main__":
    main()