1 //! Character inclusion in binary or General_Category value Unicode sets.
2 //!
3 //! We rely on dead code elimination to remove the tables that aren't needed.
4
5 #![allow(bad_style)]
6 #![allow(clippy::all)]
7
8 use alloc::boxed::Box;
9
/// Declares the table module `$tables` and, for every listed property, a
/// public `fn PROPERTY(c: char) -> bool` that calls `contains_char` on the
/// same-named item inside that module.
///
/// Also emits `$names`, a public static slice containing each listed property
/// identifier as a string (via `stringify!`), in the order given.
macro_rules! property_functions {
    ($tables:ident, $names:ident, [$($property:ident,)*]) => {
        // The module may hold tables that none of the generated functions
        // reference, so silence the unused warnings for it.
        #[allow(unused)]
        mod $tables;

        pub static $names: &[&str] = &[$(stringify!($property),)*];

        $(
            // Called as e.g. `unicode::ALPHABETIC('a')`.
            pub fn $property(c: char) -> bool {
                self::$tables::$property.contains_char(c)
            }
        )*
    };
}
26
/// Front end for `property_functions!`.
///
/// Accepts any number of `mod <module>; static <NAMES> = [...];` groups and
/// forwards each group to `property_functions!`. The list may be written in
/// one of two forms, described on the arms below.
macro_rules! char_property_functions {
    // Form 1: bare identifiers, for a hand-picked list of property names.
    {$(
        mod $tables:ident;
        static $names:ident = [$($property:ident,)*];
    )*} => {
        $(property_functions!($tables, $names, [$($property,)*]);)*
    };
    // Form 2: `("Name", IDENT)` pairs copied from the `BY_NAME` values that
    // `ucd-generate` produces. Only the identifier of each pair is forwarded;
    // the first element is matched and then discarded.
    {$(
        mod $tables:ident;
        static $names:ident = [$(($_label:tt, $property:ident),)*];
    )*} => {
        $(property_functions!($tables, $names, [$($property,)*]);)*
    };
}
51
52 char_property_functions! {
53 mod binary;
54 static BINARY_PROPERTY_NAMES = [
55 // ASCII_HEX_DIGIT, // let this one be stripped out -- the full trie is wasteful for ASCII
56 ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED,
57 CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED,
58 CHANGES_WHEN_UPPERCASED, DASH, DEFAULT_IGNORABLE_CODE_POINT, DEPRECATED, DIACRITIC,
59 EXTENDER, GRAPHEME_BASE, GRAPHEME_EXTEND, GRAPHEME_LINK, HEX_DIGIT, HYPHEN,
60 IDS_BINARY_OPERATOR, IDS_TRINARY_OPERATOR, ID_CONTINUE, ID_START, IDEOGRAPHIC, JOIN_CONTROL,
61 LOGICAL_ORDER_EXCEPTION, LOWERCASE, MATH, NONCHARACTER_CODE_POINT, OTHER_ALPHABETIC,
62 OTHER_DEFAULT_IGNORABLE_CODE_POINT, OTHER_GRAPHEME_EXTEND, OTHER_ID_CONTINUE,
63 OTHER_ID_START, OTHER_LOWERCASE, OTHER_MATH, OTHER_UPPERCASE, PATTERN_SYNTAX,
64 PATTERN_WHITE_SPACE, PREPENDED_CONCATENATION_MARK, QUOTATION_MARK, RADICAL,
65 REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH,
66 UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START,
67 ];
68 }
69
70 char_property_functions! {
71 mod category;
72 // Copy from category::BY_NAME
73 static CATEGORY_PROPERTY_NAMES = [
74 ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
75 ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL),
76 ("Currency_Symbol", CURRENCY_SYMBOL),
77 ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER),
78 ("Enclosing_Mark", ENCLOSING_MARK),
79 ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT),
80 ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER),
81 ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR),
82 ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK),
83 ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER),
84 ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK),
85 ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION),
86 ("Other", OTHER), ("Other_Letter", OTHER_LETTER),
87 ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION),
88 ("Other_Symbol", OTHER_SYMBOL),
89 ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE),
90 ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR),
91 ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK),
92 ("Surrogate", SURROGATE), ("Symbol", SYMBOL),
93 ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED),
94 ("Uppercase_Letter", UPPERCASE_LETTER),
95 ];
96
97 mod script;
98 // Copy from script::BY_NAME
99 static SCRIPT_PROPERTY_NAMES = [
100 ("Adlam", ADLAM),
101 ("Ahom", AHOM),
102 ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
103 ("Arabic", ARABIC),
104 ("Armenian", ARMENIAN),
105 ("Avestan", AVESTAN),
106 ("Balinese", BALINESE),
107 ("Bamum", BAMUM),
108 ("Bassa_Vah", BASSA_VAH),
109 ("Batak", BATAK),
110 ("Bengali", BENGALI),
111 ("Bhaiksuki", BHAIKSUKI),
112 ("Bopomofo", BOPOMOFO),
113 ("Brahmi", BRAHMI),
114 ("Braille", BRAILLE),
115 ("Buginese", BUGINESE),
116 ("Buhid", BUHID),
117 ("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
118 ("Carian", CARIAN),
119 ("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
120 ("Chakma", CHAKMA),
121 ("Cham", CHAM),
122 ("Cherokee", CHEROKEE),
123 ("Chorasmian", CHORASMIAN),
124 ("Common", COMMON),
125 ("Coptic", COPTIC),
126 ("Cuneiform", CUNEIFORM),
127 ("Cypriot", CYPRIOT),
128 ("Cypro_Minoan", CYPRO_MINOAN),
129 ("Cyrillic", CYRILLIC),
130 ("Deseret", DESERET),
131 ("Devanagari", DEVANAGARI),
132 ("Dives_Akuru", DIVES_AKURU),
133 ("Dogra", DOGRA),
134 ("Duployan", DUPLOYAN),
135 ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
136 ("Elbasan", ELBASAN),
137 ("Elymaic", ELYMAIC),
138 ("Ethiopic", ETHIOPIC),
139 ("Georgian", GEORGIAN),
140 ("Glagolitic", GLAGOLITIC),
141 ("Gothic", GOTHIC),
142 ("Grantha", GRANTHA),
143 ("Greek", GREEK),
144 ("Gujarati", GUJARATI),
145 ("Gunjala_Gondi", GUNJALA_GONDI),
146 ("Gurmukhi", GURMUKHI),
147 ("Han", HAN),
148 ("Hangul", HANGUL),
149 ("Hanifi_Rohingya", HANIFI_ROHINGYA),
150 ("Hanunoo", HANUNOO),
151 ("Hatran", HATRAN),
152 ("Hebrew", HEBREW),
153 ("Hiragana", HIRAGANA),
154 ("Imperial_Aramaic", IMPERIAL_ARAMAIC),
155 ("Inherited", INHERITED),
156 ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
157 ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
158 ("Javanese", JAVANESE),
159 ("Kaithi", KAITHI),
160 ("Kannada", KANNADA),
161 ("Katakana", KATAKANA),
162 ("Kawi", KAWI),
163 ("Kayah_Li", KAYAH_LI),
164 ("Kharoshthi", KHAROSHTHI),
165 ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
166 ("Khmer", KHMER),
167 ("Khojki", KHOJKI),
168 ("Khudawadi", KHUDAWADI),
169 ("Lao", LAO),
170 ("Latin", LATIN),
171 ("Lepcha", LEPCHA),
172 ("Limbu", LIMBU),
173 ("Linear_A", LINEAR_A),
174 ("Linear_B", LINEAR_B),
175 ("Lisu", LISU),
176 ("Lycian", LYCIAN),
177 ("Lydian", LYDIAN),
178 ("Mahajani", MAHAJANI),
179 ("Makasar", MAKASAR),
180 ("Malayalam", MALAYALAM),
181 ("Mandaic", MANDAIC),
182 ("Manichaean", MANICHAEAN),
183 ("Marchen", MARCHEN),
184 ("Masaram_Gondi", MASARAM_GONDI),
185 ("Medefaidrin", MEDEFAIDRIN),
186 ("Meetei_Mayek", MEETEI_MAYEK),
187 ("Mende_Kikakui", MENDE_KIKAKUI),
188 ("Meroitic_Cursive", MEROITIC_CURSIVE),
189 ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
190 ("Miao", MIAO),
191 ("Modi", MODI),
192 ("Mongolian", MONGOLIAN),
193 ("Mro", MRO),
194 ("Multani", MULTANI),
195 ("Myanmar", MYANMAR),
196 ("Nabataean", NABATAEAN),
197 ("Nag_Mundari", NAG_MUNDARI),
198 ("Nandinagari", NANDINAGARI),
199 ("New_Tai_Lue", NEW_TAI_LUE),
200 ("Newa", NEWA),
201 ("Nko", NKO),
202 ("Nushu", NUSHU),
203 ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
204 ("Ogham", OGHAM),
205 ("Ol_Chiki", OL_CHIKI),
206 ("Old_Hungarian", OLD_HUNGARIAN),
207 ("Old_Italic", OLD_ITALIC),
208 ("Old_North_Arabian", OLD_NORTH_ARABIAN),
209 ("Old_Permic", OLD_PERMIC),
210 ("Old_Persian", OLD_PERSIAN),
211 ("Old_Sogdian", OLD_SOGDIAN),
212 ("Old_South_Arabian", OLD_SOUTH_ARABIAN),
213 ("Old_Turkic", OLD_TURKIC),
214 ("Old_Uyghur", OLD_UYGHUR),
215 ("Oriya", ORIYA),
216 ("Osage", OSAGE),
217 ("Osmanya", OSMANYA),
218 ("Pahawh_Hmong", PAHAWH_HMONG),
219 ("Palmyrene", PALMYRENE),
220 ("Pau_Cin_Hau", PAU_CIN_HAU),
221 ("Phags_Pa", PHAGS_PA),
222 ("Phoenician", PHOENICIAN),
223 ("Psalter_Pahlavi", PSALTER_PAHLAVI),
224 ("Rejang", REJANG),
225 ("Runic", RUNIC),
226 ("Samaritan", SAMARITAN),
227 ("Saurashtra", SAURASHTRA),
228 ("Sharada", SHARADA),
229 ("Shavian", SHAVIAN),
230 ("Siddham", SIDDHAM),
231 ("SignWriting", SIGNWRITING),
232 ("Sinhala", SINHALA),
233 ("Sogdian", SOGDIAN),
234 ("Sora_Sompeng", SORA_SOMPENG),
235 ("Soyombo", SOYOMBO),
236 ("Sundanese", SUNDANESE),
237 ("Syloti_Nagri", SYLOTI_NAGRI),
238 ("Syriac", SYRIAC),
239 ("Tagalog", TAGALOG),
240 ("Tagbanwa", TAGBANWA),
241 ("Tai_Le", TAI_LE),
242 ("Tai_Tham", TAI_THAM),
243 ("Tai_Viet", TAI_VIET),
244 ("Takri", TAKRI),
245 ("Tamil", TAMIL),
246 ("Tangsa", TANGSA),
247 ("Tangut", TANGUT),
248 ("Telugu", TELUGU),
249 ("Thaana", THAANA),
250 ("Thai", THAI),
251 ("Tibetan", TIBETAN),
252 ("Tifinagh", TIFINAGH),
253 ("Tirhuta", TIRHUTA),
254 ("Toto", TOTO),
255 ("Ugaritic", UGARITIC),
256 ("Vai", VAI),
257 ("Vithkuqi", VITHKUQI),
258 ("Wancho", WANCHO),
259 ("Warang_Citi", WARANG_CITI),
260 ("Yezidi", YEZIDI),
261 ("Yi", YI),
262 ("Zanabazar_Square", ZANABAZAR_SQUARE),
263 ];
264 }
265
266 /// Return all available unicode property names
unicode_property_names() -> Box<dyn Iterator<Item = &'static str>>267 pub fn unicode_property_names() -> Box<dyn Iterator<Item = &'static str>> {
268 Box::new(
269 BINARY_PROPERTY_NAMES
270 .iter()
271 .map(|name| *name)
272 .chain(CATEGORY_PROPERTY_NAMES.iter().map(|name| *name))
273 .chain(SCRIPT_PROPERTY_NAMES.iter().map(|name| *name)),
274 )
275 }
276
by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>>277 pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
278 for property in binary::BY_NAME {
279 if name == property.0.to_uppercase() {
280 return Some(Box::new(move |c| property.1.contains_char(c)));
281 }
282 }
283
284 for property in category::BY_NAME {
285 if name == property.0.to_uppercase() {
286 return Some(Box::new(move |c| property.1.contains_char(c)));
287 }
288 }
289
290 for property in script::BY_NAME {
291 if name == property.0.to_uppercase() {
292 return Some(Box::new(move |c| property.1.contains_char(c)));
293 }
294 }
295
296 None
297 }
298