• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 pub mod errors;
6 mod langid;
7 mod locale;
8 
9 pub use errors::ParseError;
10 pub use langid::*;
11 
12 pub use locale::*;
13 
// Returns the part of `slice` that precedes the first `-` separator, or the
// whole slice when it contains no separator.
//
// Safety-usable invariant: returns a prefix of `slice`
const fn skip_before_separator(slice: &[u8]) -> &[u8] {
    let len = slice.len();
    let mut cut = 0;
    // Invariant: cut ≤ len, since cut starts at 0 and len is nonnegative

    #[allow(clippy::indexing_slicing)] // index is guarded by the loop condition
    while cut < len {
        // Inside the loop cut < len, so the index below is in bounds.
        if slice[cut] == b'-' {
            // Found the separator: everything before `cut` is the subtag.
            break;
        }
        cut += 1;
        // Invariant after the increment: cut ≤ len
    }

    // Notice: the returned prefix is empty for inputs that start with a
    // separator, e.g. the tails of `"en-"` or `"en--US"`.
    // SAFETY: cut ≤ len is maintained by the loop above
    // Safety-usable invariant upheld: the result is a prefix of `slice`
    unsafe { slice.split_at_unchecked(cut).0 }
}
32 
// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// It is quite extraordinary due to focus on performance and Rust limitations for `const`
// functions.
//
// The iterator is eager: the upcoming subtag is computed ahead of time and stored in
// `current`, which is what makes peeking free.
//
// The iterator itself performs no validation. Input is split on `-`, and empty subtags are
// yielded as empty slices, so `"-"`, `"-en"` and `"en-"` all produce an empty subtag.
//
// The iterator provides methods usable in `const` contexts - `next_const` and `peek` -
// as well as the regular `Iterator` API (`next`).
//
// `next` and `peek` return an `Option<&[u8]>`; `next_const` returns the advanced iterator
// together with that `Option`.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
    // Input that has not been consumed yet; it begins with the current subtag.
    remaining: &'a [u8],
    // The subtag that the next call to `next`/`next_const` yields, `None` once exhausted.
    // Safety invariant: current is a prefix of remaining
    current: Option<&'a [u8]>,
}
51 
52 impl<'a> SubtagIterator<'a> {
new(rest: &'a [u8]) -> Self53     pub const fn new(rest: &'a [u8]) -> Self {
54         Self {
55             remaining: rest,
56             // Safety invariant upheld: skip_before_separator() returns a prefix of `rest`
57             current: Some(skip_before_separator(rest)),
58         }
59     }
60 
next_const(mut self) -> (Self, Option<&'a [u8]>)61     pub const fn next_const(mut self) -> (Self, Option<&'a [u8]>) {
62         let Some(result) = self.current else {
63             return (self, None);
64         };
65 
66         self.current = if result.len() < self.remaining.len() {
67             // If there is more after `result`, by construction `current` starts with a separator
68             // SAFETY: `self.remaining` is strictly longer than `result` due to `result` being a prefix (from the safety invariant)
69             self.remaining = unsafe { self.remaining.split_at_unchecked(result.len() + 1).1 };
70             // Safety invariant upheld: skip_before_separator() returns a prefix of `rest`, and we don't
71             // mutate self.remaining after this
72             Some(skip_before_separator(self.remaining))
73         } else {
74             None
75         };
76         (self, Some(result))
77     }
78 
peek(&self) -> Option<&'a [u8]>79     pub const fn peek(&self) -> Option<&'a [u8]> {
80         self.current
81     }
82 }
83 
84 impl<'a> Iterator for SubtagIterator<'a> {
85     type Item = &'a [u8];
86 
next(&mut self) -> Option<Self::Item>87     fn next(&mut self) -> Option<Self::Item> {
88         let (s, res) = self.next_const();
89         *self = s;
90         res
91     }
92 }
93 
#[cfg(test)]
mod test {
    use super::*;

    // Test inputs are ASCII literals, so decoding them as UTF-8 cannot fail.
    fn slice_to_str(input: &[u8]) -> &str {
        std::str::from_utf8(input).unwrap()
    }

    // Iterates over the subtags of `input`, decoded to `&str` for readable assertions.
    fn subtags<'a>(input: &'a str) -> impl Iterator<Item = &'a str> {
        SubtagIterator::new(input.as_bytes()).map(slice_to_str)
    }

    #[test]
    fn subtag_iterator_peek_test() {
        let mut si = SubtagIterator::new(b"de-at-u-ca-foobar");

        for expected in ["de", "at"] {
            // Peeking repeatedly must not advance the iterator...
            for _ in 0..2 {
                assert_eq!(si.peek().map(slice_to_str), Some(expected));
            }
            // ...and `next` must yield exactly what was peeked.
            assert_eq!(si.next().map(slice_to_str), Some(expected));
        }
    }

    #[test]
    fn subtag_iterator_test() {
        // For these inputs only the first subtag is inspected.
        for input in ["", "-"] {
            assert_eq!(subtags(input).next(), Some(""), "input: {:?}", input);
        }

        // For these inputs the complete subtag sequence is checked.
        let exhaustive: [(&str, &[&str]); 6] = [
            ("-en", &["", "en"]),
            ("en", &["en"]),
            ("en-", &["en", ""]),
            ("--", &["", "", ""]),
            ("-en-", &["", "en", ""]),
            ("de-at-u-ca-foobar", &["de", "at", "u", "ca", "foobar"]),
        ];
        for (input, expected) in exhaustive {
            assert_eq!(
                subtags(input).collect::<Vec<_>>(),
                expected,
                "input: {:?}",
                input
            );
        }
    }

    #[test]
    fn skip_before_separator_test() {
        // (input, expected prefix before the first separator)
        let cases: [(&[u8], &[u8]); 7] = [
            (b"", b""),
            (b"en", b"en"),
            (b"en-", b"en"),
            (b"en--US", b"en"),
            (b"-US", b""),
            (b"US", b"US"),
            (b"-", b""),
        ];
        for (input, expected) in cases {
            assert_eq!(skip_before_separator(input), expected);
        }
    }
}
186