• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use core::iter::FusedIterator;
2 
3 use crate::lookups::{
4     canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
5     stream_safe_trailing_nonstarters,
6 };
7 use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
8 use crate::tables::stream_safe_leading_nonstarters;
9 
/// Longest run of consecutive non-starters that `StreamSafe` lets through
/// before it inserts a joiner.
pub(crate) const MAX_NONSTARTERS: usize = 30;

/// U+034F, the character `StreamSafe` emits to break up an over-long run of
/// non-starters.
const COMBINING_GRAPHEME_JOINER: char = '\u{34F}';
12 
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
///
/// Every character of the wrapped iterator is passed through unchanged; the
/// only characters ever added are the joiners.
///
/// The adaptor is `Clone` and `Debug` whenever the wrapped iterator is, in line
/// with the iterator adaptors of the standard library.
#[derive(Clone, Debug)]
pub struct StreamSafe<I> {
    /// The wrapped source of characters.
    iter: I,
    /// Number of consecutive non-starters (counted in NFKD) emitted since the
    /// last starter.
    nonstarter_count: usize,
    /// A character that was pulled from `iter` but held back because a joiner
    /// had to be emitted first; it is yielded on the following call.
    buffer: Option<char>,
}
21 
22 impl<I> StreamSafe<I> {
new(iter: I) -> Self23     pub(crate) fn new(iter: I) -> Self {
24         Self {
25             iter,
26             nonstarter_count: 0,
27             buffer: None,
28         }
29     }
30 }
31 
32 impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
33     type Item = char;
34 
35     #[inline]
next(&mut self) -> Option<char>36     fn next(&mut self) -> Option<char> {
37         let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
38             None => return None,
39             Some(c) => c,
40         };
41         let d = classify_nonstarters(next_ch);
42         if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
43             // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
44             // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
45             // iterator (via `self.buffer`), and we'll reclassify it next iteration.
46             self.nonstarter_count = 0;
47             self.buffer = Some(next_ch);
48             return Some(COMBINING_GRAPHEME_JOINER);
49         }
50 
51         // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
52         // nonstarters in NKFD.
53         if d.leading_nonstarters == d.decomposition_len {
54             self.nonstarter_count += d.decomposition_len;
55         }
56         // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
57         else {
58             self.nonstarter_count = d.trailing_nonstarters;
59         }
60         Some(next_ch)
61     }
62 }
63 
64 impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
65 
/// Summary of where the non-starters sit in a character's NFKD decomposition,
/// as produced by `classify_nonstarters`.
///
/// This is a small plain-data value, so it is `Copy` and comparable, which
/// lets callers and tests assert on whole values.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct Decomposition {
    /// Number of non-starters at the beginning of the decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Number of non-starters at the end of the decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
72 
73 #[inline]
classify_nonstarters(c: char) -> Decomposition74 pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
75     // As usual, fast path for ASCII (which is always a starter)
76     if c <= '\x7f' {
77         return Decomposition {
78             leading_nonstarters: 0,
79             trailing_nonstarters: 0,
80             decomposition_len: 1,
81         };
82     }
83     // Next, special case Hangul, since it's not handled by our tables.
84     if is_hangul_syllable(c) {
85         return Decomposition {
86             leading_nonstarters: 0,
87             trailing_nonstarters: 0,
88             decomposition_len: hangul_decomposition_length(c),
89         };
90     }
91     let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
92     match decomp {
93         Some(decomp) => Decomposition {
94             leading_nonstarters: stream_safe_leading_nonstarters(c),
95             trailing_nonstarters: stream_safe_trailing_nonstarters(c),
96             decomposition_len: decomp.len(),
97         },
98         None => {
99             let is_nonstarter = canonical_combining_class(c) != 0;
100             let nonstarter = if is_nonstarter { 1 } else { 0 };
101             Decomposition {
102                 leading_nonstarters: nonstarter,
103                 trailing_nonstarters: nonstarter,
104                 decomposition_len: 1,
105             }
106         }
107     }
108 }
109 
110 #[cfg(test)]
111 mod tests {
112     use super::{classify_nonstarters, StreamSafe};
113     use crate::lookups::canonical_combining_class;
114     use crate::normalize::decompose_compatible;
115 
116     #[cfg(not(feature = "std"))]
117     use alloc::{string::String, vec::Vec};
118 
119     use core::char;
120 
stream_safe(s: &str) -> String121     fn stream_safe(s: &str) -> String {
122         StreamSafe::new(s.chars()).collect()
123     }
124 
125     #[test]
test_simple()126     fn test_simple() {
127         let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
128         assert_eq!(stream_safe(technically_okay), technically_okay);
129 
130         let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
131         let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
132         assert_eq!(stream_safe(too_much), fixed_it);
133 
134         let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
135         let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
136         assert_eq!(stream_safe(woah_nelly), its_cool);
137     }
138 
139     #[test]
test_all_nonstarters()140     fn test_all_nonstarters() {
141         let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
142         let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
143         assert_eq!(stream_safe(s), expected);
144     }
145 
146     #[test]
test_classify_nonstarters()147     fn test_classify_nonstarters() {
148         // Highest character in the `compat_fully_decomp` table is 2FA1D
149         for ch in 0..0x2FA1E {
150             let ch = match char::from_u32(ch) {
151                 Some(c) => c,
152                 None => continue,
153             };
154             let c = classify_nonstarters(ch);
155             let mut s = Vec::new();
156             decompose_compatible(ch, |c| s.push(c));
157 
158             assert_eq!(s.len(), c.decomposition_len);
159 
160             let num_leading = s
161                 .iter()
162                 .take_while(|&c| canonical_combining_class(*c) != 0)
163                 .count();
164             let num_trailing = s
165                 .iter()
166                 .rev()
167                 .take_while(|&c| canonical_combining_class(*c) != 0)
168                 .count();
169 
170             assert_eq!(num_leading, c.leading_nonstarters);
171             assert_eq!(num_trailing, c.trailing_nonstarters);
172         }
173     }
174 }
175