• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 mat!(uni_literal, r"☃", "☃", Some((0, 3)));
2 mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3)));
3 mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3)));
4 mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3)));
5 mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)));
6 mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
7 mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)));
8 mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)));
9 mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)));
10 mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
11 mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
12 mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
13 mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
14 mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
15 
16 // Test the Unicode friendliness of Perl character classes.
17 mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
18 mat!(uni_perl_w_not, r"\w+", "⥡", None);
19 mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)));
20 mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
21 mat!(uni_perl_d_not, r"\d+", "Ⅱ", None);
22 mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)));
23 mat!(uni_perl_s, r"\s+", " ", Some((0, 3)));
24 mat!(uni_perl_s_not, r"\s+", "☃", None);
25 mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)));
26 
27 // And do the same for word boundaries.
28 mat!(uni_boundary_none, r"\d\b", "6δ", None);
29 mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)));
30 mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1)));
31 mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
32 
33 // Test general categories.
34 //
35 // We should test more, but there's a lot. Write a script to generate more of
36 // these tests.
37 mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3)));
38 mat!(
39     uni_class_gencat_close_punctuation,
40     r"\p{Close_Punctuation}",
41     "❯",
42     Some((0, 3))
43 );
44 mat!(
45     uni_class_gencat_connector_punctuation,
46     r"\p{Connector_Punctuation}",
47     "⁀",
48     Some((0, 3))
49 );
50 mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2)));
51 mat!(
52     uni_class_gencat_currency_symbol,
53     r"\p{Currency_Symbol}",
54     "£",
55     Some((0, 3))
56 );
57 mat!(
58     uni_class_gencat_dash_punctuation,
59     r"\p{Dash_Punctuation}",
60     "〰",
61     Some((0, 3))
62 );
63 mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "��", Some((0, 4)));
64 mat!(
65     uni_class_gencat_enclosing_mark,
66     r"\p{Enclosing_Mark}",
67     "\u{A672}",
68     Some((0, 3))
69 );
70 mat!(
71     uni_class_gencat_final_punctuation,
72     r"\p{Final_Punctuation}",
73     "⸡",
74     Some((0, 3))
75 );
76 mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
77 // See: https://github.com/rust-lang/regex/issues/719
78 mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4)));
79 mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
80 mat!(
81     uni_class_gencat_initial_punctuation,
82     r"\p{Initial_Punctuation}",
83     "⸜",
84     Some((0, 3))
85 );
86 mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2)));
87 mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "ↂ", Some((0, 3)));
88 mat!(
89     uni_class_gencat_line_separator,
90     r"\p{Line_Separator}",
91     "\u{2028}",
92     Some((0, 3))
93 );
94 mat!(
95     uni_class_gencat_lowercase_letter,
96     r"\p{Lowercase_Letter}",
97     "ϛ",
98     Some((0, 2))
99 );
100 mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4)));
101 mat!(uni_class_gencat_math, r"\p{Math}", "⋿", Some((0, 3)));
102 mat!(
103     uni_class_gencat_modifier_letter,
104     r"\p{Modifier_Letter}",
105     "��",
106     Some((0, 4))
107 );
108 mat!(
109     uni_class_gencat_modifier_symbol,
110     r"\p{Modifier_Symbol}",
111     "��",
112     Some((0, 4))
113 );
114 mat!(
115     uni_class_gencat_nonspacing_mark,
116     r"\p{Nonspacing_Mark}",
117     "\u{1E94A}",
118     Some((0, 4))
119 );
120 mat!(uni_class_gencat_number, r"\p{Number}", "⓿", Some((0, 3)));
121 mat!(
122     uni_class_gencat_open_punctuation,
123     r"\p{Open_Punctuation}",
124     "⦅",
125     Some((0, 3))
126 );
127 mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3)));
128 mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "ꓷ", Some((0, 3)));
129 mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "㉏", Some((0, 3)));
130 mat!(
131     uni_class_gencat_other_punctuation,
132     r"\p{Other_Punctuation}",
133     "��",
134     Some((0, 4))
135 );
136 mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "⅌", Some((0, 3)));
137 mat!(
138     uni_class_gencat_paragraph_separator,
139     r"\p{Paragraph_Separator}",
140     "\u{2029}",
141     Some((0, 3))
142 );
143 mat!(
144     uni_class_gencat_private_use,
145     r"\p{Private_Use}",
146     "\u{10FFFD}",
147     Some((0, 4))
148 );
149 mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "��", Some((0, 4)));
150 mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3)));
151 mat!(
152     uni_class_gencat_space_separator,
153     r"\p{Space_Separator}",
154     "\u{205F}",
155     Some((0, 3))
156 );
157 mat!(
158     uni_class_gencat_spacing_mark,
159     r"\p{Spacing_Mark}",
160     "\u{16F7E}",
161     Some((0, 4))
162 );
163 mat!(uni_class_gencat_symbol, r"\p{Symbol}", "⯈", Some((0, 3)));
164 mat!(
165     uni_class_gencat_titlecase_letter,
166     r"\p{Titlecase_Letter}",
167     "ῼ",
168     Some((0, 3))
169 );
170 mat!(
171     uni_class_gencat_unassigned,
172     r"\p{Unassigned}",
173     "\u{10FFFF}",
174     Some((0, 4))
175 );
176 mat!(
177     uni_class_gencat_uppercase_letter,
178     r"\p{Uppercase_Letter}",
179     "Ꝋ",
180     Some((0, 3))
181 );
182 
183 // Test a smattering of properties.
184 mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3)));
185 mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4)));
186 mat!(
187     uni_class_prop_picto1,
188     r"\p{extendedpictographic}",
189     "\u{1FA6E}",
190     Some((0, 4))
191 );
192 mat!(
193     uni_class_prop_picto2,
194     r"\p{extendedpictographic}",
195     "\u{1FFFD}",
196     Some((0, 4))
197 );
198 
199 // grapheme_cluster_break
200 mat!(
201     uni_class_gcb_prepend,
202     r"\p{grapheme_cluster_break=prepend}",
203     "\u{11D46}",
204     Some((0, 4))
205 );
206 mat!(
207     uni_class_gcb_ri1,
208     r"\p{gcb=regional_indicator}",
209     "\u{1F1E6}",
210     Some((0, 4))
211 );
212 mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4)));
213 mat!(
214     uni_class_gcb_ri3,
215     r"\p{gcb=regionalindicator}",
216     "\u{1F1FF}",
217     Some((0, 4))
218 );
219 mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3)));
220 mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3)));
221 
222 // word_break
223 mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3)));
224 mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3)));
225 mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3)));
226 mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3)));
227 mat!(uni_class_wb5, r"\p{wb=numeric}", "\u{1E950}", Some((0, 4)));
228 
229 // sentence_break
230 mat!(uni_class_sb1, r"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2)));
231 mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2)));
232 mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3)));
233 mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4)));
234 mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3)));
235 
236 // Test 'Vithkuqi' support, which was added in Unicode 14.
237 // See: https://github.com/rust-lang/regex/issues/877
238 mat!(
239     uni_vithkuqi_literal_upper,
240     r"(?i)^\u{10570}$",
241     "\u{10570}",
242     Some((0, 4))
243 );
244 mat!(
245     uni_vithkuqi_literal_lower,
246     r"(?i)^\u{10570}$",
247     "\u{10597}",
248     Some((0, 4))
249 );
250 mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4)));
251 mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4)));
252