1 mat!(uni_literal, r"☃", "☃", Some((0, 3))); 2 mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3))); 3 mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3))); 4 mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3))); 5 mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))); 6 mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))); 7 mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))); 8 mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))); 9 mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))); 10 mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))); 11 mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))); 12 mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))); 13 mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))); 14 mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))); 15 16 // Test the Unicode friendliness of Perl character classes. 17 mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))); 18 mat!(uni_perl_w_not, r"\w+", "⥡", None); 19 mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))); 20 mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))); 21 mat!(uni_perl_d_not, r"\d+", "Ⅱ", None); 22 mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))); 23 mat!(uni_perl_s, r"\s+", " ", Some((0, 3))); 24 mat!(uni_perl_s_not, r"\s+", "☃", None); 25 mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))); 26 27 // And do the same for word boundaries. 28 mat!(uni_boundary_none, r"\d\b", "6δ", None); 29 mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))); 30 mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1))); 31 mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); 32 33 // Test general categories. 34 // 35 // We should test more, but there's a lot. Write a script to generate more of 36 // these tests. 37 mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3))); 38 mat!( 39 uni_class_gencat_close_punctuation, 40 r"\p{Close_Punctuation}", 41 "❯", 42 Some((0, 3)) 43 ); 44 mat!( 45 uni_class_gencat_connector_punctuation, 46 r"\p{Connector_Punctuation}", 47 "⁀", 48 Some((0, 3)) 49 ); 50 mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2))); 51 mat!( 52 uni_class_gencat_currency_symbol, 53 r"\p{Currency_Symbol}", 54 "£", 55 Some((0, 3)) 56 ); 57 mat!( 58 uni_class_gencat_dash_punctuation, 59 r"\p{Dash_Punctuation}", 60 "〰", 61 Some((0, 3)) 62 ); 63 mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "��", Some((0, 4))); 64 mat!( 65 uni_class_gencat_enclosing_mark, 66 r"\p{Enclosing_Mark}", 67 "\u{A672}", 68 Some((0, 3)) 69 ); 70 mat!( 71 uni_class_gencat_final_punctuation, 72 r"\p{Final_Punctuation}", 73 "⸡", 74 Some((0, 3)) 75 ); 76 mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); 77 // See: https://github.com/rust-lang/regex/issues/719 78 mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); 79 mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); 80 mat!( 81 uni_class_gencat_initial_punctuation, 82 r"\p{Initial_Punctuation}", 83 "⸜", 84 Some((0, 3)) 85 ); 86 mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2))); 87 mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "ↂ", Some((0, 3))); 88 mat!( 89 uni_class_gencat_line_separator, 90 r"\p{Line_Separator}", 91 "\u{2028}", 92 Some((0, 3)) 93 ); 94 mat!( 95 uni_class_gencat_lowercase_letter, 96 r"\p{Lowercase_Letter}", 97 "ϛ", 98 Some((0, 2)) 99 ); 100 mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4))); 101 mat!(uni_class_gencat_math, r"\p{Math}", "⋿", Some((0, 3))); 102 mat!( 103 uni_class_gencat_modifier_letter, 104 r"\p{Modifier_Letter}", 105 "��", 106 Some((0, 4)) 107 ); 108 mat!( 109 uni_class_gencat_modifier_symbol, 110 r"\p{Modifier_Symbol}", 111 "��", 112 Some((0, 4)) 113 ); 114 mat!( 115 uni_class_gencat_nonspacing_mark, 116 r"\p{Nonspacing_Mark}", 117 "\u{1E94A}", 118 Some((0, 4)) 119 ); 120 mat!(uni_class_gencat_number, r"\p{Number}", "⓿", Some((0, 3))); 121 mat!( 122 uni_class_gencat_open_punctuation, 123 r"\p{Open_Punctuation}", 124 "⦅", 125 Some((0, 3)) 126 ); 127 mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3))); 128 mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "ꓷ", Some((0, 3))); 129 mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "㉏", Some((0, 3))); 130 mat!( 131 uni_class_gencat_other_punctuation, 132 r"\p{Other_Punctuation}", 133 "��", 134 Some((0, 4)) 135 ); 136 mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "⅌", Some((0, 3))); 137 mat!( 138 uni_class_gencat_paragraph_separator, 139 r"\p{Paragraph_Separator}", 140 "\u{2029}", 141 Some((0, 3)) 142 ); 143 mat!( 144 uni_class_gencat_private_use, 145 r"\p{Private_Use}", 146 "\u{10FFFD}", 147 Some((0, 4)) 148 ); 149 mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "��", Some((0, 4))); 150 mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3))); 151 mat!( 152 uni_class_gencat_space_separator, 153 r"\p{Space_Separator}", 154 "\u{205F}", 155 Some((0, 3)) 156 ); 157 mat!( 158 uni_class_gencat_spacing_mark, 159 r"\p{Spacing_Mark}", 160 "\u{16F7E}", 161 Some((0, 4)) 162 ); 163 mat!(uni_class_gencat_symbol, r"\p{Symbol}", "⯈", Some((0, 3))); 164 mat!( 165 uni_class_gencat_titlecase_letter, 166 r"\p{Titlecase_Letter}", 167 "ῼ", 168 Some((0, 3)) 169 ); 170 mat!( 171 uni_class_gencat_unassigned, 172 r"\p{Unassigned}", 173 "\u{10FFFF}", 174 Some((0, 4)) 175 ); 176 mat!( 177 uni_class_gencat_uppercase_letter, 178 r"\p{Uppercase_Letter}", 179 "Ꝋ", 180 Some((0, 3)) 181 ); 182 183 // Test a smattering of properties. 184 mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3))); 185 mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4))); 186 mat!( 187 uni_class_prop_picto1, 188 r"\p{extendedpictographic}", 189 "\u{1FA6E}", 190 Some((0, 4)) 191 ); 192 mat!( 193 uni_class_prop_picto2, 194 r"\p{extendedpictographic}", 195 "\u{1FFFD}", 196 Some((0, 4)) 197 ); 198 199 // grapheme_cluster_break 200 mat!( 201 uni_class_gcb_prepend, 202 r"\p{grapheme_cluster_break=prepend}", 203 "\u{11D46}", 204 Some((0, 4)) 205 ); 206 mat!( 207 uni_class_gcb_ri1, 208 r"\p{gcb=regional_indicator}", 209 "\u{1F1E6}", 210 Some((0, 4)) 211 ); 212 mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4))); 213 mat!( 214 uni_class_gcb_ri3, 215 r"\p{gcb=regionalindicator}", 216 "\u{1F1FF}", 217 Some((0, 4)) 218 ); 219 mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3))); 220 mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3))); 221 222 // word_break 223 mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3))); 224 mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3))); 225 mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3))); 226 mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3))); 227 mat!(uni_class_wb5, r"\p{wb=numeric}", "\u{1E950}", Some((0, 4))); 228 229 // sentence_break 230 mat!(uni_class_sb1, r"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2))); 231 mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2))); 232 mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3))); 233 mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4))); 234 mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3))); 235 236 // Test 'Vithkuqi' support, which was added in Unicode 14. 237 // See: https://github.com/rust-lang/regex/issues/877 238 mat!( 239 uni_vithkuqi_literal_upper, 240 r"(?i)^\u{10570}$", 241 "\u{10570}", 242 Some((0, 4)) 243 ); 244 mat!( 245 uni_vithkuqi_literal_lower, 246 r"(?i)^\u{10570}$", 247 "\u{10597}", 248 Some((0, 4)) 249 ); 250 mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4))); 251 mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4))); 252