• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use std::error;
2 use std::fmt;
3 use std::result;
4 
5 use crate::hir;
6 
/// A type alias for errors specific to Unicode handling of classes.
///
/// The error half is always this module's `Error` type.
pub type Result<T> = result::Result<T, Error>;

/// A sequence of inclusive codepoint ranges from a generated file (hence the
/// static lifetime).
///
/// Each element is a `(start, end)` pair of scalar values.
type Range = &'static [(char, char)];
13 
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// A property name could not be resolved, or the data needed to resolve
    /// or build it is not compiled in.
    PropertyNotFound,
    /// A property value could not be resolved for a known property, or the
    /// property value data is not compiled in.
    PropertyValueNotFound,
    /// The data backing one of the Perl classes (`\w`, `\s`, `\d`) is not
    /// compiled in.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26 
/// The result type used by the Unicode case folding routines.
pub type FoldResult<T> = result::Result<T, CaseFoldError>;

/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// Case folding needs the Unicode case mapping tables. Those tables are only
/// missing when the `unicode-case` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
49 
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The Unicode aware Perl character class `\w` needs its data tables. Those
/// tables are only missing when the `unicode-perl` feature is disabled. (The
/// feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
69 
70 /// Return an iterator over the equivalence class of simple case mappings
71 /// for the given codepoint. The equivalence class does not include the
72 /// given codepoint.
73 ///
74 /// If the equivalence class is empty, then this returns the next scalar
75 /// value that has a non-empty equivalence class, if it exists. If no such
76 /// scalar value exists, then `None` is returned. The point of this behavior
77 /// is to permit callers to avoid calling `simple_fold` more than they need
78 /// to, since there is some cost to fetching the equivalence class.
79 ///
80 /// This returns an error if the Unicode case folding tables are not available.
simple_fold( c: char, ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>81 pub fn simple_fold(
82     c: char,
83 ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
84     #[cfg(not(feature = "unicode-case"))]
85     fn imp(
86         _: char,
87     ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
88     {
89         use std::option::IntoIter;
90         Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
91     }
92 
93     #[cfg(feature = "unicode-case")]
94     fn imp(
95         c: char,
96     ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
97     {
98         use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
99 
100         Ok(CASE_FOLDING_SIMPLE
101             .binary_search_by_key(&c, |&(c1, _)| c1)
102             .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied())
103             .map_err(|i| {
104                 if i >= CASE_FOLDING_SIMPLE.len() {
105                     None
106                 } else {
107                     Some(CASE_FOLDING_SIMPLE[i].0)
108                 }
109             }))
110     }
111 
112     imp(c)
113 }
114 
115 /// Returns true if and only if the given (inclusive) range contains at least
116 /// one Unicode scalar value that has a non-empty non-trivial simple case
117 /// mapping.
118 ///
119 /// This function panics if `end < start`.
120 ///
121 /// This returns an error if the Unicode case folding tables are not available.
contains_simple_case_mapping( start: char, end: char, ) -> FoldResult<bool>122 pub fn contains_simple_case_mapping(
123     start: char,
124     end: char,
125 ) -> FoldResult<bool> {
126     #[cfg(not(feature = "unicode-case"))]
127     fn imp(_: char, _: char) -> FoldResult<bool> {
128         Err(CaseFoldError(()))
129     }
130 
131     #[cfg(feature = "unicode-case")]
132     fn imp(start: char, end: char) -> FoldResult<bool> {
133         use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
134         use std::cmp::Ordering;
135 
136         assert!(start <= end);
137         Ok(CASE_FOLDING_SIMPLE
138             .binary_search_by(|&(c, _)| {
139                 if start <= c && c <= end {
140                     Ordering::Equal
141                 } else if c > end {
142                     Ordering::Greater
143                 } else {
144                     Ordering::Less
145                 }
146             })
147             .is_ok())
148     }
149 
150     imp(start, end)
151 }
152 
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter.
    ///
    /// The letter is resolved the same way as a `Binary` name, so general
    /// category and script values are accepted here too.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name. It does not need to be normalized by the caller.
        property_name: &'a str,
        /// A property value. It does not need to be normalized by the caller.
        property_value: &'a str,
    },
}
186 
187 impl<'a> ClassQuery<'a> {
canonicalize(&self) -> Result<CanonicalClassQuery>188     fn canonicalize(&self) -> Result<CanonicalClassQuery> {
189         match *self {
190             ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
191             ClassQuery::Binary(name) => self.canonical_binary(name),
192             ClassQuery::ByValue { property_name, property_value } => {
193                 let property_name = symbolic_name_normalize(property_name);
194                 let property_value = symbolic_name_normalize(property_value);
195 
196                 let canon_name = match canonical_prop(&property_name)? {
197                     None => return Err(Error::PropertyNotFound),
198                     Some(canon_name) => canon_name,
199                 };
200                 Ok(match canon_name {
201                     "General_Category" => {
202                         let canon = match canonical_gencat(&property_value)? {
203                             None => return Err(Error::PropertyValueNotFound),
204                             Some(canon) => canon,
205                         };
206                         CanonicalClassQuery::GeneralCategory(canon)
207                     }
208                     "Script" => {
209                         let canon = match canonical_script(&property_value)? {
210                             None => return Err(Error::PropertyValueNotFound),
211                             Some(canon) => canon,
212                         };
213                         CanonicalClassQuery::Script(canon)
214                     }
215                     _ => {
216                         let vals = match property_values(canon_name)? {
217                             None => return Err(Error::PropertyValueNotFound),
218                             Some(vals) => vals,
219                         };
220                         let canon_val =
221                             match canonical_value(vals, &property_value) {
222                                 None => {
223                                     return Err(Error::PropertyValueNotFound)
224                                 }
225                                 Some(canon_val) => canon_val,
226                             };
227                         CanonicalClassQuery::ByValue {
228                             property_name: canon_name,
229                             property_value: canon_val,
230                         }
231                     }
232                 })
233             }
234         }
235     }
236 
canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery>237     fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
238         let norm = symbolic_name_normalize(name);
239 
240         // This is a special case where 'cf' refers to the 'Format' general
241         // category, but where the 'cf' abbreviation is also an abbreviation
242         // for the 'Case_Folding' property. But we want to treat it as
243         // a general category. (Currently, we don't even support the
244         // 'Case_Folding' property. But if we do in the future, users will be
245         // required to spell it out.)
246         if norm != "cf" {
247             if let Some(canon) = canonical_prop(&norm)? {
248                 return Ok(CanonicalClassQuery::Binary(canon));
249             }
250         }
251         if let Some(canon) = canonical_gencat(&norm)? {
252             return Ok(CanonicalClassQuery::GeneralCategory(canon));
253         }
254         if let Some(canon) = canonical_script(&norm)? {
255             return Ok(CanonicalClassQuery::Script(canon));
256         }
257         Err(Error::PropertyNotFound)
258     }
259 }
260 
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
///
/// Values of this type are produced by `ClassQuery::canonicalize` and
/// consumed by `class`.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
285 
286 /// Looks up a Unicode class given a query. If one doesn't exist, then
287 /// `None` is returned.
class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode>288 pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> {
289     use self::CanonicalClassQuery::*;
290 
291     match query.canonicalize()? {
292         Binary(name) => bool_property(name),
293         GeneralCategory(name) => gencat(name),
294         Script(name) => script(name),
295         ByValue { property_name: "Age", property_value } => {
296             let mut class = hir::ClassUnicode::empty();
297             for set in ages(property_value)? {
298                 class.union(&hir_class(set));
299             }
300             Ok(class)
301         }
302         ByValue { property_name: "Script_Extensions", property_value } => {
303             script_extension(property_value)
304         }
305         ByValue {
306             property_name: "Grapheme_Cluster_Break",
307             property_value,
308         } => gcb(property_value),
309         ByValue { property_name: "Sentence_Break", property_value } => {
310             sb(property_value)
311         }
312         ByValue { property_name: "Word_Break", property_value } => {
313             wb(property_value)
314         }
315         _ => {
316             // What else should we support?
317             Err(Error::PropertyNotFound)
318         }
319     }
320 }
321 
322 /// Returns a Unicode aware class for \w.
323 ///
324 /// This returns an error if the data is not available for \w.
perl_word() -> Result<hir::ClassUnicode>325 pub fn perl_word() -> Result<hir::ClassUnicode> {
326     #[cfg(not(feature = "unicode-perl"))]
327     fn imp() -> Result<hir::ClassUnicode> {
328         Err(Error::PerlClassNotFound)
329     }
330 
331     #[cfg(feature = "unicode-perl")]
332     fn imp() -> Result<hir::ClassUnicode> {
333         use crate::unicode_tables::perl_word::PERL_WORD;
334         Ok(hir_class(PERL_WORD))
335     }
336 
337     imp()
338 }
339 
340 /// Returns a Unicode aware class for \s.
341 ///
342 /// This returns an error if the data is not available for \s.
perl_space() -> Result<hir::ClassUnicode>343 pub fn perl_space() -> Result<hir::ClassUnicode> {
344     #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
345     fn imp() -> Result<hir::ClassUnicode> {
346         Err(Error::PerlClassNotFound)
347     }
348 
349     #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
350     fn imp() -> Result<hir::ClassUnicode> {
351         use crate::unicode_tables::perl_space::WHITE_SPACE;
352         Ok(hir_class(WHITE_SPACE))
353     }
354 
355     #[cfg(feature = "unicode-bool")]
356     fn imp() -> Result<hir::ClassUnicode> {
357         use crate::unicode_tables::property_bool::WHITE_SPACE;
358         Ok(hir_class(WHITE_SPACE))
359     }
360 
361     imp()
362 }
363 
364 /// Returns a Unicode aware class for \d.
365 ///
366 /// This returns an error if the data is not available for \d.
perl_digit() -> Result<hir::ClassUnicode>367 pub fn perl_digit() -> Result<hir::ClassUnicode> {
368     #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
369     fn imp() -> Result<hir::ClassUnicode> {
370         Err(Error::PerlClassNotFound)
371     }
372 
373     #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
374     fn imp() -> Result<hir::ClassUnicode> {
375         use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
376         Ok(hir_class(DECIMAL_NUMBER))
377     }
378 
379     #[cfg(feature = "unicode-gencat")]
380     fn imp() -> Result<hir::ClassUnicode> {
381         use crate::unicode_tables::general_category::DECIMAL_NUMBER;
382         Ok(hir_class(DECIMAL_NUMBER))
383     }
384 
385     imp()
386 }
387 
388 /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode389 pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
390     let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
391         .iter()
392         .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
393         .collect();
394     hir::ClassUnicode::new(hir_ranges)
395 }
396 
397 /// Returns true only if the given codepoint is in the `\w` character class.
398 ///
399 /// If the `unicode-perl` feature is not enabled, then this returns an error.
is_word_character(c: char) -> result::Result<bool, UnicodeWordError>400 pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
401     #[cfg(not(feature = "unicode-perl"))]
402     fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
403         Err(UnicodeWordError(()))
404     }
405 
406     #[cfg(feature = "unicode-perl")]
407     fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
408         use crate::is_word_byte;
409         use crate::unicode_tables::perl_word::PERL_WORD;
410         use std::cmp::Ordering;
411 
412         if c <= 0x7F as char && is_word_byte(c as u8) {
413             return Ok(true);
414         }
415         Ok(PERL_WORD
416             .binary_search_by(|&(start, end)| {
417                 if start <= c && c <= end {
418                     Ordering::Equal
419                 } else if start > c {
420                     Ordering::Greater
421                 } else {
422                     Ordering::Less
423                 }
424             })
425             .is_ok())
426     }
427 
428     imp(c)
429 }
430 
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
///
/// `canonical_value` binary searches this table by its first element.
type PropertyValues = &'static [(&'static str, &'static str)];
437 
canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>>438 fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
439     Ok(match normalized_value {
440         "any" => Some("Any"),
441         "assigned" => Some("Assigned"),
442         "ascii" => Some("ASCII"),
443         _ => {
444             let gencats = property_values("General_Category")?.unwrap();
445             canonical_value(gencats, normalized_value)
446         }
447     })
448 }
449 
canonical_script(normalized_value: &str) -> Result<Option<&'static str>>450 fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
451     let scripts = property_values("Script")?.unwrap();
452     Ok(canonical_value(scripts, normalized_value))
453 }
454 
455 /// Find the canonical property name for the given normalized property name.
456 ///
457 /// If no such property exists, then `None` is returned.
458 ///
459 /// The normalized property name must have been normalized according to
460 /// UAX44 LM3, which can be done using `symbolic_name_normalize`.
461 ///
462 /// If the property names data is not available, then an error is returned.
canonical_prop(normalized_name: &str) -> Result<Option<&'static str>>463 fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
464     #[cfg(not(any(
465         feature = "unicode-age",
466         feature = "unicode-bool",
467         feature = "unicode-gencat",
468         feature = "unicode-perl",
469         feature = "unicode-script",
470         feature = "unicode-segment",
471     )))]
472     fn imp(_: &str) -> Result<Option<&'static str>> {
473         Err(Error::PropertyNotFound)
474     }
475 
476     #[cfg(any(
477         feature = "unicode-age",
478         feature = "unicode-bool",
479         feature = "unicode-gencat",
480         feature = "unicode-perl",
481         feature = "unicode-script",
482         feature = "unicode-segment",
483     ))]
484     fn imp(name: &str) -> Result<Option<&'static str>> {
485         use crate::unicode_tables::property_names::PROPERTY_NAMES;
486 
487         Ok(PROPERTY_NAMES
488             .binary_search_by_key(&name, |&(n, _)| n)
489             .ok()
490             .map(|i| PROPERTY_NAMES[i].1))
491     }
492 
493     imp(normalized_name)
494 }
495 
496 /// Find the canonical property value for the given normalized property
497 /// value.
498 ///
499 /// The given property values should correspond to the values for the property
500 /// under question, which can be found using `property_values`.
501 ///
502 /// If no such property value exists, then `None` is returned.
503 ///
504 /// The normalized property value must have been normalized according to
505 /// UAX44 LM3, which can be done using `symbolic_name_normalize`.
canonical_value( vals: PropertyValues, normalized_value: &str, ) -> Option<&'static str>506 fn canonical_value(
507     vals: PropertyValues,
508     normalized_value: &str,
509 ) -> Option<&'static str> {
510     vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
511         .ok()
512         .map(|i| vals[i].1)
513 }
514 
515 /// Return the table of property values for the given property name.
516 ///
517 /// If the property values data is not available, then an error is returned.
property_values( canonical_property_name: &'static str, ) -> Result<Option<PropertyValues>>518 fn property_values(
519     canonical_property_name: &'static str,
520 ) -> Result<Option<PropertyValues>> {
521     #[cfg(not(any(
522         feature = "unicode-age",
523         feature = "unicode-bool",
524         feature = "unicode-gencat",
525         feature = "unicode-perl",
526         feature = "unicode-script",
527         feature = "unicode-segment",
528     )))]
529     fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
530         Err(Error::PropertyValueNotFound)
531     }
532 
533     #[cfg(any(
534         feature = "unicode-age",
535         feature = "unicode-bool",
536         feature = "unicode-gencat",
537         feature = "unicode-perl",
538         feature = "unicode-script",
539         feature = "unicode-segment",
540     ))]
541     fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
542         use crate::unicode_tables::property_values::PROPERTY_VALUES;
543 
544         Ok(PROPERTY_VALUES
545             .binary_search_by_key(&name, |&(n, _)| n)
546             .ok()
547             .map(|i| PROPERTY_VALUES[i].1))
548     }
549 
550     imp(canonical_property_name)
551 }
552 
553 // This is only used in some cases, but small enough to just let it be dead
554 // instead of figuring out (and maintaining) the right set of features.
555 #[allow(dead_code)]
property_set( name_map: &'static [(&'static str, Range)], canonical: &'static str, ) -> Option<Range>556 fn property_set(
557     name_map: &'static [(&'static str, Range)],
558     canonical: &'static str,
559 ) -> Option<Range> {
560     name_map
561         .binary_search_by_key(&canonical, |x| x.0)
562         .ok()
563         .map(|i| name_map[i].1)
564 }
565 
566 /// Returns an iterator over Unicode Age sets. Each item corresponds to a set
567 /// of codepoints that were added in a particular revision of Unicode. The
568 /// iterator yields items in chronological order.
569 ///
570 /// If the given age value isn't valid or if the data isn't available, then an
571 /// error is returned instead.
ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>>572 fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
573     #[cfg(not(feature = "unicode-age"))]
574     fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
575         use std::option::IntoIter;
576         Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
577     }
578 
579     #[cfg(feature = "unicode-age")]
580     fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
581         use crate::unicode_tables::age;
582 
583         const AGES: &[(&str, Range)] = &[
584             ("V1_1", age::V1_1),
585             ("V2_0", age::V2_0),
586             ("V2_1", age::V2_1),
587             ("V3_0", age::V3_0),
588             ("V3_1", age::V3_1),
589             ("V3_2", age::V3_2),
590             ("V4_0", age::V4_0),
591             ("V4_1", age::V4_1),
592             ("V5_0", age::V5_0),
593             ("V5_1", age::V5_1),
594             ("V5_2", age::V5_2),
595             ("V6_0", age::V6_0),
596             ("V6_1", age::V6_1),
597             ("V6_2", age::V6_2),
598             ("V6_3", age::V6_3),
599             ("V7_0", age::V7_0),
600             ("V8_0", age::V8_0),
601             ("V9_0", age::V9_0),
602             ("V10_0", age::V10_0),
603             ("V11_0", age::V11_0),
604             ("V12_0", age::V12_0),
605             ("V12_1", age::V12_1),
606             ("V13_0", age::V13_0),
607             ("V14_0", age::V14_0),
608             ("V15_0", age::V15_0),
609         ];
610         assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
611 
612         let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
613         match pos {
614             None => Err(Error::PropertyValueNotFound),
615             Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),
616         }
617     }
618 
619     imp(canonical_age)
620 }
621 
622 /// Returns the Unicode HIR class corresponding to the given general category.
623 ///
624 /// Name canonicalization is assumed to be performed by the caller.
625 ///
626 /// If the given general category could not be found, or if the general
627 /// category data is not available, then an error is returned.
gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode>628 fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
629     #[cfg(not(feature = "unicode-gencat"))]
630     fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
631         Err(Error::PropertyNotFound)
632     }
633 
634     #[cfg(feature = "unicode-gencat")]
635     fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
636         use crate::unicode_tables::general_category::BY_NAME;
637         match name {
638             "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
639             "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
640             "Assigned" => {
641                 let mut cls = gencat("Unassigned")?;
642                 cls.negate();
643                 Ok(cls)
644             }
645             name => property_set(BY_NAME, name)
646                 .map(hir_class)
647                 .ok_or(Error::PropertyValueNotFound),
648         }
649     }
650 
651     match canonical_name {
652         "Decimal_Number" => perl_digit(),
653         name => imp(name),
654     }
655 }
656 
657 /// Returns the Unicode HIR class corresponding to the given script.
658 ///
659 /// Name canonicalization is assumed to be performed by the caller.
660 ///
661 /// If the given script could not be found, or if the script data is not
662 /// available, then an error is returned.
script(canonical_name: &'static str) -> Result<hir::ClassUnicode>663 fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
664     #[cfg(not(feature = "unicode-script"))]
665     fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
666         Err(Error::PropertyNotFound)
667     }
668 
669     #[cfg(feature = "unicode-script")]
670     fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
671         use crate::unicode_tables::script::BY_NAME;
672         property_set(BY_NAME, name)
673             .map(hir_class)
674             .ok_or(Error::PropertyValueNotFound)
675     }
676 
677     imp(canonical_name)
678 }
679 
680 /// Returns the Unicode HIR class corresponding to the given script extension.
681 ///
682 /// Name canonicalization is assumed to be performed by the caller.
683 ///
684 /// If the given script extension could not be found, or if the script data is
685 /// not available, then an error is returned.
script_extension( canonical_name: &'static str, ) -> Result<hir::ClassUnicode>686 fn script_extension(
687     canonical_name: &'static str,
688 ) -> Result<hir::ClassUnicode> {
689     #[cfg(not(feature = "unicode-script"))]
690     fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
691         Err(Error::PropertyNotFound)
692     }
693 
694     #[cfg(feature = "unicode-script")]
695     fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
696         use crate::unicode_tables::script_extension::BY_NAME;
697         property_set(BY_NAME, name)
698             .map(hir_class)
699             .ok_or(Error::PropertyValueNotFound)
700     }
701 
702     imp(canonical_name)
703 }
704 
705 /// Returns the Unicode HIR class corresponding to the given Unicode boolean
706 /// property.
707 ///
708 /// Name canonicalization is assumed to be performed by the caller.
709 ///
710 /// If the given boolean property could not be found, or if the boolean
711 /// property data is not available, then an error is returned.
bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode>712 fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
713     #[cfg(not(feature = "unicode-bool"))]
714     fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
715         Err(Error::PropertyNotFound)
716     }
717 
718     #[cfg(feature = "unicode-bool")]
719     fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
720         use crate::unicode_tables::property_bool::BY_NAME;
721         property_set(BY_NAME, name)
722             .map(hir_class)
723             .ok_or(Error::PropertyNotFound)
724     }
725 
726     match canonical_name {
727         "Decimal_Number" => perl_digit(),
728         "White_Space" => perl_space(),
729         name => imp(name),
730     }
731 }
732 
733 /// Returns the Unicode HIR class corresponding to the given grapheme cluster
734 /// break property.
735 ///
736 /// Name canonicalization is assumed to be performed by the caller.
737 ///
738 /// If the given property could not be found, or if the corresponding data is
739 /// not available, then an error is returned.
gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode>740 fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
741     #[cfg(not(feature = "unicode-segment"))]
742     fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
743         Err(Error::PropertyNotFound)
744     }
745 
746     #[cfg(feature = "unicode-segment")]
747     fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
748         use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
749         property_set(BY_NAME, name)
750             .map(hir_class)
751             .ok_or(Error::PropertyValueNotFound)
752     }
753 
754     imp(canonical_name)
755 }
756 
757 /// Returns the Unicode HIR class corresponding to the given word break
758 /// property.
759 ///
760 /// Name canonicalization is assumed to be performed by the caller.
761 ///
762 /// If the given property could not be found, or if the corresponding data is
763 /// not available, then an error is returned.
wb(canonical_name: &'static str) -> Result<hir::ClassUnicode>764 fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
765     #[cfg(not(feature = "unicode-segment"))]
766     fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
767         Err(Error::PropertyNotFound)
768     }
769 
770     #[cfg(feature = "unicode-segment")]
771     fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
772         use crate::unicode_tables::word_break::BY_NAME;
773         property_set(BY_NAME, name)
774             .map(hir_class)
775             .ok_or(Error::PropertyValueNotFound)
776     }
777 
778     imp(canonical_name)
779 }
780 
781 /// Returns the Unicode HIR class corresponding to the given sentence
782 /// break property.
783 ///
784 /// Name canonicalization is assumed to be performed by the caller.
785 ///
786 /// If the given property could not be found, or if the corresponding data is
787 /// not available, then an error is returned.
sb(canonical_name: &'static str) -> Result<hir::ClassUnicode>788 fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
789     #[cfg(not(feature = "unicode-segment"))]
790     fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
791         Err(Error::PropertyNotFound)
792     }
793 
794     #[cfg(feature = "unicode-segment")]
795     fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
796         use crate::unicode_tables::sentence_break::BY_NAME;
797         property_set(BY_NAME, name)
798             .map(hir_class)
799             .ok_or(Error::PropertyValueNotFound)
800     }
801 
802     imp(canonical_name)
803 }
804 
805 /// Like symbolic_name_normalize_bytes, but operates on a string.
symbolic_name_normalize(x: &str) -> String806 fn symbolic_name_normalize(x: &str) -> String {
807     let mut tmp = x.as_bytes().to_vec();
808     let len = symbolic_name_normalize_bytes(&mut tmp).len();
809     tmp.truncate(len);
810     // This should always succeed because `symbolic_name_normalize_bytes`
811     // guarantees that `&tmp[..len]` is always valid UTF-8.
812     //
813     // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
814     // to be worth skipping the additional safety check. A benchmark must
815     // justify it first.
816     String::from_utf8(tmp).unwrap()
817 }
818 
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// Normalization skips a leading "is" prefix (in any ASCII case), removes
/// spaces, underscores and hyphens, lowercases ASCII letters and drops every
/// non-ASCII byte. The surviving bytes are compacted to the front of `slice`
/// and returned as a sub-slice; bytes beyond the returned length are
/// leftovers and carry no meaning.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, whatever its case. `get` yields `None` for
    // inputs shorter than two bytes, so no explicit length check is needed.
    let starts_with_is = slice
        .get(..2)
        .map_or(false, |prefix| prefix.eq_ignore_ascii_case(b"is"));
    let start = if starts_with_is { 2 } else { 0 };

    // `next_write` never exceeds the read index `i`, so compacting in place
    // cannot overwrite a byte that has not been read yet.
    let mut next_write = 0;
    for i in start..slice.len() {
        let b = slice[i];
        // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        if b == b' ' || b == b'_' || b == b'-' || !b.is_ascii() {
            continue;
        }
        slice[next_write] = b.to_ascii_lowercase();
        next_write += 1;
    }
    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, 'isc' would otherwise be reduced to 'c', which is
    // an alias for the 'Other' general category rather than ISO_Comment. So
    // restore the full 'isc' spelling.
    //
    // The writes below are in bounds: `starts_with_is` implies two prefix
    // bytes and `next_write == 1` implies at least one more byte was read, so
    // the slice has length of at least three.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[0] = b'i';
        slice[1] = b's';
        slice[2] = b'c';
        next_write = 3;
    }
    &mut slice[..next_write]
}
873 
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Returns the iterator of case-folded characters for `c`, panicking if
    /// folding is unavailable or if `c` has no mapping.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Returns the payload of the inner `Err` produced by `simple_fold` for
    /// `c`, panicking if an iterator is returned instead.
    ///
    /// NOTE(review): judging by the expectations below, the payload looks
    /// like the next character that does have a mapping; confirm against
    /// `simple_fold`.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwrapping shorthand for `contains_simple_case_mapping`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A is KELVIN SIGN, which is visually indistinguishable from an
        // ASCII 'K' in most fonts. It is spelled as an escape so that the
        // three cases below cannot be confused with one another.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    // The one-letter class `C` must canonicalize to the `Other` general
    // category.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    // A non-ASCII byte in the input must be dropped from the result.
    #[test]
    fn valid_utf8_symbolic() {
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
1002