//! Character inclusion in binary or General_Category value Unicode sets.
//!
//! We rely on dead code elimination to remove the tables that aren't needed.
4 
5 #![allow(bad_style)]
6 #![allow(clippy::all)]
7 
8 use alloc::boxed::Box;
9 
10 macro_rules! property_functions {
11     ($module:ident, $property_names:ident, [$(
12         $prop:ident,
13     )*]) => {
14         #[allow(unused)]
15         mod $module;
16         // unicode::ALPHABETIC('a')
17         $(pub fn $prop(c: char) -> bool {
18             self::$module::$prop.contains_char(c)
19         })*
20 
21         pub static $property_names: &[&str] = &[
22             $(stringify!($prop),)*
23         ];
24     };
25 }
26 
27 macro_rules! char_property_functions {
28     // For define custom property names
29     {$(
30         mod $module:ident;
31         static $property_names:ident = [$(
32             $prop:ident,
33         )*];
34     )*} => {$(
35         property_functions!($module, $property_names, [$(
36             $prop,
37         )*]);
38     )*};
39     // For define property by copy BY_NAME values from `ucd-generate` generated.
40     {$(
41         mod $module:ident;
42         static $property_names:ident = [$(
43             ($_name:tt, $prop:ident),
44         )*];
45     )*} => {$(
46         property_functions!($module, $property_names, [$(
47             $prop,
48         )*]);
49     )*};
50 }
51 
52 char_property_functions! {
53     mod binary;
54     static BINARY_PROPERTY_NAMES = [
55         // ASCII_HEX_DIGIT, // let this one be stripped out -- the full trie is wasteful for ASCII
56         ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED,
57         CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED,
58         CHANGES_WHEN_UPPERCASED, DASH, DEFAULT_IGNORABLE_CODE_POINT, DEPRECATED, DIACRITIC,
59         EXTENDER, GRAPHEME_BASE, GRAPHEME_EXTEND, GRAPHEME_LINK, HEX_DIGIT, HYPHEN,
60         IDS_BINARY_OPERATOR, IDS_TRINARY_OPERATOR, ID_CONTINUE, ID_START, IDEOGRAPHIC, JOIN_CONTROL,
61         LOGICAL_ORDER_EXCEPTION, LOWERCASE, MATH, NONCHARACTER_CODE_POINT, OTHER_ALPHABETIC,
62         OTHER_DEFAULT_IGNORABLE_CODE_POINT, OTHER_GRAPHEME_EXTEND, OTHER_ID_CONTINUE,
63         OTHER_ID_START, OTHER_LOWERCASE, OTHER_MATH, OTHER_UPPERCASE, PATTERN_SYNTAX,
64         PATTERN_WHITE_SPACE, PREPENDED_CONCATENATION_MARK, QUOTATION_MARK, RADICAL,
65         REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH,
66         UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START,
67     ];
68 }
69 
70 char_property_functions! {
71     mod category;
72     // Copy from category::BY_NAME
73     static CATEGORY_PROPERTY_NAMES = [
74         ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
75         ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL),
76         ("Currency_Symbol", CURRENCY_SYMBOL),
77         ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER),
78         ("Enclosing_Mark", ENCLOSING_MARK),
79         ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT),
80         ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER),
81         ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR),
82         ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK),
83         ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER),
84         ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK),
85         ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION),
86         ("Other", OTHER), ("Other_Letter", OTHER_LETTER),
87         ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION),
88         ("Other_Symbol", OTHER_SYMBOL),
89         ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE),
90         ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR),
91         ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK),
92         ("Surrogate", SURROGATE), ("Symbol", SYMBOL),
93         ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED),
94         ("Uppercase_Letter", UPPERCASE_LETTER),
95     ];
96 
97     mod script;
98     // Copy from script::BY_NAME
99     static SCRIPT_PROPERTY_NAMES = [
100         ("Adlam", ADLAM),
101         ("Ahom", AHOM),
102         ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
103         ("Arabic", ARABIC),
104         ("Armenian", ARMENIAN),
105         ("Avestan", AVESTAN),
106         ("Balinese", BALINESE),
107         ("Bamum", BAMUM),
108         ("Bassa_Vah", BASSA_VAH),
109         ("Batak", BATAK),
110         ("Bengali", BENGALI),
111         ("Bhaiksuki", BHAIKSUKI),
112         ("Bopomofo", BOPOMOFO),
113         ("Brahmi", BRAHMI),
114         ("Braille", BRAILLE),
115         ("Buginese", BUGINESE),
116         ("Buhid", BUHID),
117         ("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
118         ("Carian", CARIAN),
119         ("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
120         ("Chakma", CHAKMA),
121         ("Cham", CHAM),
122         ("Cherokee", CHEROKEE),
123         ("Chorasmian", CHORASMIAN),
124         ("Common", COMMON),
125         ("Coptic", COPTIC),
126         ("Cuneiform", CUNEIFORM),
127         ("Cypriot", CYPRIOT),
128         ("Cypro_Minoan", CYPRO_MINOAN),
129         ("Cyrillic", CYRILLIC),
130         ("Deseret", DESERET),
131         ("Devanagari", DEVANAGARI),
132         ("Dives_Akuru", DIVES_AKURU),
133         ("Dogra", DOGRA),
134         ("Duployan", DUPLOYAN),
135         ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
136         ("Elbasan", ELBASAN),
137         ("Elymaic", ELYMAIC),
138         ("Ethiopic", ETHIOPIC),
139         ("Georgian", GEORGIAN),
140         ("Glagolitic", GLAGOLITIC),
141         ("Gothic", GOTHIC),
142         ("Grantha", GRANTHA),
143         ("Greek", GREEK),
144         ("Gujarati", GUJARATI),
145         ("Gunjala_Gondi", GUNJALA_GONDI),
146         ("Gurmukhi", GURMUKHI),
147         ("Han", HAN),
148         ("Hangul", HANGUL),
149         ("Hanifi_Rohingya", HANIFI_ROHINGYA),
150         ("Hanunoo", HANUNOO),
151         ("Hatran", HATRAN),
152         ("Hebrew", HEBREW),
153         ("Hiragana", HIRAGANA),
154         ("Imperial_Aramaic", IMPERIAL_ARAMAIC),
155         ("Inherited", INHERITED),
156         ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
157         ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
158         ("Javanese", JAVANESE),
159         ("Kaithi", KAITHI),
160         ("Kannada", KANNADA),
161         ("Katakana", KATAKANA),
162         ("Kawi", KAWI),
163         ("Kayah_Li", KAYAH_LI),
164         ("Kharoshthi", KHAROSHTHI),
165         ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
166         ("Khmer", KHMER),
167         ("Khojki", KHOJKI),
168         ("Khudawadi", KHUDAWADI),
169         ("Lao", LAO),
170         ("Latin", LATIN),
171         ("Lepcha", LEPCHA),
172         ("Limbu", LIMBU),
173         ("Linear_A", LINEAR_A),
174         ("Linear_B", LINEAR_B),
175         ("Lisu", LISU),
176         ("Lycian", LYCIAN),
177         ("Lydian", LYDIAN),
178         ("Mahajani", MAHAJANI),
179         ("Makasar", MAKASAR),
180         ("Malayalam", MALAYALAM),
181         ("Mandaic", MANDAIC),
182         ("Manichaean", MANICHAEAN),
183         ("Marchen", MARCHEN),
184         ("Masaram_Gondi", MASARAM_GONDI),
185         ("Medefaidrin", MEDEFAIDRIN),
186         ("Meetei_Mayek", MEETEI_MAYEK),
187         ("Mende_Kikakui", MENDE_KIKAKUI),
188         ("Meroitic_Cursive", MEROITIC_CURSIVE),
189         ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
190         ("Miao", MIAO),
191         ("Modi", MODI),
192         ("Mongolian", MONGOLIAN),
193         ("Mro", MRO),
194         ("Multani", MULTANI),
195         ("Myanmar", MYANMAR),
196         ("Nabataean", NABATAEAN),
197         ("Nag_Mundari", NAG_MUNDARI),
198         ("Nandinagari", NANDINAGARI),
199         ("New_Tai_Lue", NEW_TAI_LUE),
200         ("Newa", NEWA),
201         ("Nko", NKO),
202         ("Nushu", NUSHU),
203         ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
204         ("Ogham", OGHAM),
205         ("Ol_Chiki", OL_CHIKI),
206         ("Old_Hungarian", OLD_HUNGARIAN),
207         ("Old_Italic", OLD_ITALIC),
208         ("Old_North_Arabian", OLD_NORTH_ARABIAN),
209         ("Old_Permic", OLD_PERMIC),
210         ("Old_Persian", OLD_PERSIAN),
211         ("Old_Sogdian", OLD_SOGDIAN),
212         ("Old_South_Arabian", OLD_SOUTH_ARABIAN),
213         ("Old_Turkic", OLD_TURKIC),
214         ("Old_Uyghur", OLD_UYGHUR),
215         ("Oriya", ORIYA),
216         ("Osage", OSAGE),
217         ("Osmanya", OSMANYA),
218         ("Pahawh_Hmong", PAHAWH_HMONG),
219         ("Palmyrene", PALMYRENE),
220         ("Pau_Cin_Hau", PAU_CIN_HAU),
221         ("Phags_Pa", PHAGS_PA),
222         ("Phoenician", PHOENICIAN),
223         ("Psalter_Pahlavi", PSALTER_PAHLAVI),
224         ("Rejang", REJANG),
225         ("Runic", RUNIC),
226         ("Samaritan", SAMARITAN),
227         ("Saurashtra", SAURASHTRA),
228         ("Sharada", SHARADA),
229         ("Shavian", SHAVIAN),
230         ("Siddham", SIDDHAM),
231         ("SignWriting", SIGNWRITING),
232         ("Sinhala", SINHALA),
233         ("Sogdian", SOGDIAN),
234         ("Sora_Sompeng", SORA_SOMPENG),
235         ("Soyombo", SOYOMBO),
236         ("Sundanese", SUNDANESE),
237         ("Syloti_Nagri", SYLOTI_NAGRI),
238         ("Syriac", SYRIAC),
239         ("Tagalog", TAGALOG),
240         ("Tagbanwa", TAGBANWA),
241         ("Tai_Le", TAI_LE),
242         ("Tai_Tham", TAI_THAM),
243         ("Tai_Viet", TAI_VIET),
244         ("Takri", TAKRI),
245         ("Tamil", TAMIL),
246         ("Tangsa", TANGSA),
247         ("Tangut", TANGUT),
248         ("Telugu", TELUGU),
249         ("Thaana", THAANA),
250         ("Thai", THAI),
251         ("Tibetan", TIBETAN),
252         ("Tifinagh", TIFINAGH),
253         ("Tirhuta", TIRHUTA),
254         ("Toto", TOTO),
255         ("Ugaritic", UGARITIC),
256         ("Vai", VAI),
257         ("Vithkuqi", VITHKUQI),
258         ("Wancho", WANCHO),
259         ("Warang_Citi", WARANG_CITI),
260         ("Yezidi", YEZIDI),
261         ("Yi", YI),
262         ("Zanabazar_Square", ZANABAZAR_SQUARE),
263     ];
264 }
265 
266 /// Return all available unicode property names
unicode_property_names() -> Box<dyn Iterator<Item = &'static str>>267 pub fn unicode_property_names() -> Box<dyn Iterator<Item = &'static str>> {
268     Box::new(
269         BINARY_PROPERTY_NAMES
270             .iter()
271             .map(|name| *name)
272             .chain(CATEGORY_PROPERTY_NAMES.iter().map(|name| *name))
273             .chain(SCRIPT_PROPERTY_NAMES.iter().map(|name| *name)),
274     )
275 }
276 
by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>>277 pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
278     for property in binary::BY_NAME {
279         if name == property.0.to_uppercase() {
280             return Some(Box::new(move |c| property.1.contains_char(c)));
281         }
282     }
283 
284     for property in category::BY_NAME {
285         if name == property.0.to_uppercase() {
286             return Some(Box::new(move |c| property.1.contains_char(c)));
287         }
288     }
289 
290     for property in script::BY_NAME {
291         if name == property.0.to_uppercase() {
292             return Some(Box::new(move |c| property.1.contains_char(c)));
293         }
294     }
295 
296     None
297 }
298