• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use regex_automata::DFA;
2 
3 use crate::{
4     ext_slice::ByteSlice,
5     unicode::fsm::{
6         simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
7     },
8     utf8,
9 };
10 
11 /// An iterator over words in a byte string.
12 ///
13 /// This iterator is typically constructed by
14 /// [`ByteSlice::words`](trait.ByteSlice.html#method.words).
15 ///
16 /// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html)
17 /// iterator, except it only returns elements that contain a "word" character.
18 /// A word character is defined by UTS #18 (Annex C) to be the combination
19 /// of the `Alphabetic` and `Join_Control` properties, along with the
20 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
21 ///
22 /// Since words are made up of one or more codepoints, this iterator yields
23 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
24 /// are [substituted](index.html#handling-of-invalid-utf-8).
25 ///
26 /// This iterator yields words in accordance with the default word boundary
27 /// rules specified in
28 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
29 /// In particular, this may not be suitable for Japanese and Chinese scripts
30 /// that do not use spaces between words.
31 #[derive(Clone, Debug)]
32 pub struct Words<'a>(WordsWithBreaks<'a>);
33 
34 impl<'a> Words<'a> {
new(bs: &'a [u8]) -> Words<'a>35     pub(crate) fn new(bs: &'a [u8]) -> Words<'a> {
36         Words(WordsWithBreaks::new(bs))
37     }
38 
39     /// View the underlying data as a subslice of the original data.
40     ///
41     /// The slice returned has the same lifetime as the original slice, and so
42     /// the iterator can continue to be used while this exists.
43     ///
44     /// # Examples
45     ///
46     /// ```
47     /// use bstr::ByteSlice;
48     ///
49     /// let mut it = b"foo bar baz".words();
50     ///
51     /// assert_eq!(b"foo bar baz", it.as_bytes());
52     /// it.next();
53     /// it.next();
54     /// assert_eq!(b" baz", it.as_bytes());
55     /// it.next();
56     /// assert_eq!(b"", it.as_bytes());
57     /// ```
58     #[inline]
as_bytes(&self) -> &'a [u8]59     pub fn as_bytes(&self) -> &'a [u8] {
60         self.0.as_bytes()
61     }
62 }
63 
64 impl<'a> Iterator for Words<'a> {
65     type Item = &'a str;
66 
67     #[inline]
next(&mut self) -> Option<&'a str>68     fn next(&mut self) -> Option<&'a str> {
69         while let Some(word) = self.0.next() {
70             if SIMPLE_WORD_FWD.is_match(word.as_bytes()) {
71                 return Some(word);
72             }
73         }
74         None
75     }
76 }
77 
78 /// An iterator over words in a byte string and their byte index positions.
79 ///
80 /// This iterator is typically constructed by
81 /// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices).
82 ///
83 /// This is similar to the
84 /// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator,
85 /// except it only returns elements that contain a "word" character. A
86 /// word character is defined by UTS #18 (Annex C) to be the combination
87 /// of the `Alphabetic` and `Join_Control` properties, along with the
88 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
89 ///
90 /// Since words are made up of one or more codepoints, this iterator
91 /// yields `&str` elements (along with their start and end byte offsets).
92 /// When invalid UTF-8 is encountered, replacement codepoints are
93 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
94 /// indices yielded by this iterator may not correspond to the length of the
95 /// word yielded with those indices. For example, when this iterator encounters
96 /// `\xFF` in the byte string, then it will yield a pair of indices ranging
97 /// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
98 /// which is three bytes in length. However, when given only valid UTF-8, then
99 /// all indices are in exact correspondence with their paired word.
100 ///
101 /// This iterator yields words in accordance with the default word boundary
102 /// rules specified in
103 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
104 /// In particular, this may not be suitable for Japanese and Chinese scripts
105 /// that do not use spaces between words.
106 #[derive(Clone, Debug)]
107 pub struct WordIndices<'a>(WordsWithBreakIndices<'a>);
108 
109 impl<'a> WordIndices<'a> {
new(bs: &'a [u8]) -> WordIndices<'a>110     pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> {
111         WordIndices(WordsWithBreakIndices::new(bs))
112     }
113 
114     /// View the underlying data as a subslice of the original data.
115     ///
116     /// The slice returned has the same lifetime as the original slice, and so
117     /// the iterator can continue to be used while this exists.
118     ///
119     /// # Examples
120     ///
121     /// ```
122     /// use bstr::ByteSlice;
123     ///
124     /// let mut it = b"foo bar baz".word_indices();
125     ///
126     /// assert_eq!(b"foo bar baz", it.as_bytes());
127     /// it.next();
128     /// it.next();
129     /// assert_eq!(b" baz", it.as_bytes());
130     /// it.next();
131     /// it.next();
132     /// assert_eq!(b"", it.as_bytes());
133     /// ```
134     #[inline]
as_bytes(&self) -> &'a [u8]135     pub fn as_bytes(&self) -> &'a [u8] {
136         self.0.as_bytes()
137     }
138 }
139 
140 impl<'a> Iterator for WordIndices<'a> {
141     type Item = (usize, usize, &'a str);
142 
143     #[inline]
next(&mut self) -> Option<(usize, usize, &'a str)>144     fn next(&mut self) -> Option<(usize, usize, &'a str)> {
145         while let Some((start, end, word)) = self.0.next() {
146             if SIMPLE_WORD_FWD.is_match(word.as_bytes()) {
147                 return Some((start, end, word));
148             }
149         }
150         None
151     }
152 }
153 
154 /// An iterator over all word breaks in a byte string.
155 ///
156 /// This iterator is typically constructed by
157 /// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
158 ///
159 /// This iterator yields not only all words, but the content that comes between
160 /// words. In particular, if all elements yielded by this iterator are
161 /// concatenated, then the result is the original string (subject to Unicode
162 /// replacement codepoint substitutions).
163 ///
164 /// Since words are made up of one or more codepoints, this iterator yields
165 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
166 /// are [substituted](index.html#handling-of-invalid-utf-8).
167 ///
168 /// This iterator yields words in accordance with the default word boundary
169 /// rules specified in
170 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
171 /// In particular, this may not be suitable for Japanese and Chinese scripts
172 /// that do not use spaces between words.
173 #[derive(Clone, Debug)]
174 pub struct WordsWithBreaks<'a> {
175     bs: &'a [u8],
176 }
177 
178 impl<'a> WordsWithBreaks<'a> {
new(bs: &'a [u8]) -> WordsWithBreaks<'a>179     pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> {
180         WordsWithBreaks { bs }
181     }
182 
183     /// View the underlying data as a subslice of the original data.
184     ///
185     /// The slice returned has the same lifetime as the original slice, and so
186     /// the iterator can continue to be used while this exists.
187     ///
188     /// # Examples
189     ///
190     /// ```
191     /// use bstr::ByteSlice;
192     ///
193     /// let mut it = b"foo bar baz".words_with_breaks();
194     ///
195     /// assert_eq!(b"foo bar baz", it.as_bytes());
196     /// it.next();
197     /// assert_eq!(b" bar baz", it.as_bytes());
198     /// it.next();
199     /// it.next();
200     /// assert_eq!(b" baz", it.as_bytes());
201     /// it.next();
202     /// it.next();
203     /// assert_eq!(b"", it.as_bytes());
204     /// ```
205     #[inline]
as_bytes(&self) -> &'a [u8]206     pub fn as_bytes(&self) -> &'a [u8] {
207         self.bs
208     }
209 }
210 
211 impl<'a> Iterator for WordsWithBreaks<'a> {
212     type Item = &'a str;
213 
214     #[inline]
next(&mut self) -> Option<&'a str>215     fn next(&mut self) -> Option<&'a str> {
216         let (word, size) = decode_word(self.bs);
217         if size == 0 {
218             return None;
219         }
220         self.bs = &self.bs[size..];
221         Some(word)
222     }
223 }
224 
225 /// An iterator over all word breaks in a byte string, along with their byte
226 /// index positions.
227 ///
228 /// This iterator is typically constructed by
229 /// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
230 ///
231 /// This iterator yields not only all words, but the content that comes between
232 /// words. In particular, if all elements yielded by this iterator are
233 /// concatenated, then the result is the original string (subject to Unicode
234 /// replacement codepoint substitutions).
235 ///
236 /// Since words are made up of one or more codepoints, this iterator
237 /// yields `&str` elements (along with their start and end byte offsets).
238 /// When invalid UTF-8 is encountered, replacement codepoints are
239 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
240 /// indices yielded by this iterator may not correspond to the length of the
241 /// word yielded with those indices. For example, when this iterator encounters
242 /// `\xFF` in the byte string, then it will yield a pair of indices ranging
243 /// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
244 /// which is three bytes in length. However, when given only valid UTF-8, then
245 /// all indices are in exact correspondence with their paired word.
246 ///
247 /// This iterator yields words in accordance with the default word boundary
248 /// rules specified in
249 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
250 /// In particular, this may not be suitable for Japanese and Chinese scripts
251 /// that do not use spaces between words.
252 #[derive(Clone, Debug)]
253 pub struct WordsWithBreakIndices<'a> {
254     bs: &'a [u8],
255     forward_index: usize,
256 }
257 
258 impl<'a> WordsWithBreakIndices<'a> {
new(bs: &'a [u8]) -> WordsWithBreakIndices<'a>259     pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
260         WordsWithBreakIndices { bs, forward_index: 0 }
261     }
262 
263     /// View the underlying data as a subslice of the original data.
264     ///
265     /// The slice returned has the same lifetime as the original slice, and so
266     /// the iterator can continue to be used while this exists.
267     ///
268     /// # Examples
269     ///
270     /// ```
271     /// use bstr::ByteSlice;
272     ///
273     /// let mut it = b"foo bar baz".words_with_break_indices();
274     ///
275     /// assert_eq!(b"foo bar baz", it.as_bytes());
276     /// it.next();
277     /// assert_eq!(b" bar baz", it.as_bytes());
278     /// it.next();
279     /// it.next();
280     /// assert_eq!(b" baz", it.as_bytes());
281     /// it.next();
282     /// it.next();
283     /// assert_eq!(b"", it.as_bytes());
284     /// ```
285     #[inline]
as_bytes(&self) -> &'a [u8]286     pub fn as_bytes(&self) -> &'a [u8] {
287         self.bs
288     }
289 }
290 
291 impl<'a> Iterator for WordsWithBreakIndices<'a> {
292     type Item = (usize, usize, &'a str);
293 
294     #[inline]
next(&mut self) -> Option<(usize, usize, &'a str)>295     fn next(&mut self) -> Option<(usize, usize, &'a str)> {
296         let index = self.forward_index;
297         let (word, size) = decode_word(self.bs);
298         if size == 0 {
299             return None;
300         }
301         self.bs = &self.bs[size..];
302         self.forward_index += size;
303         Some((index, index + size, word))
304     }
305 }
306 
decode_word(bs: &[u8]) -> (&str, usize)307 fn decode_word(bs: &[u8]) -> (&str, usize) {
308     if bs.is_empty() {
309         ("", 0)
310     } else if let Some(end) = WORD_BREAK_FWD.find(bs) {
311         // Safe because a match can only occur for valid UTF-8.
312         let word = unsafe { bs[..end].to_str_unchecked() };
313         (word, word.len())
314     } else {
315         const INVALID: &'static str = "\u{FFFD}";
316         // No match on non-empty bytes implies we found invalid UTF-8.
317         let (_, size) = utf8::decode_lossy(bs);
318         (INVALID, size)
319     }
320 }
321 
322 #[cfg(all(test, feature = "std"))]
323 mod tests {
324     #[cfg(not(miri))]
325     use ucd_parse::WordBreakTest;
326 
327     use crate::ext_slice::ByteSlice;
328 
329     #[test]
330     #[cfg(not(miri))]
forward_ucd()331     fn forward_ucd() {
332         for (i, test) in ucdtests().into_iter().enumerate() {
333             let given = test.words.concat();
334             let got = words(given.as_bytes());
335             assert_eq!(
336                 test.words,
337                 got,
338                 "\n\nword forward break test {} failed:\n\
339                  given:    {:?}\n\
340                  expected: {:?}\n\
341                  got:      {:?}\n",
342                 i,
343                 given,
344                 strs_to_bstrs(&test.words),
345                 strs_to_bstrs(&got),
346             );
347         }
348     }
349 
350     // Some additional tests that don't seem to be covered by the UCD tests.
351     //
352     // It's pretty amazing that the UCD tests miss these cases. I only found
353     // them by running this crate's segmenter and ICU's segmenter on the same
354     // text and comparing the output.
355     #[test]
forward_additional()356     fn forward_additional() {
357         assert_eq!(vec!["a", ".", "  ", "Y"], words(b"a.  Y"));
358         assert_eq!(vec!["r", ".", "  ", "Yo"], words(b"r.  Yo"));
359         assert_eq!(
360             vec!["whatsoever", ".", "  ", "You", " ", "may"],
361             words(b"whatsoever.  You may")
362         );
363         assert_eq!(
364             vec!["21stcentury'syesterday"],
365             words(b"21stcentury'syesterday")
366         );
367 
368         assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
369         assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
370         assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
371         assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
372         assert_eq!(vec!["_123'4"], words(b"_123'4"));
373         assert_eq!(vec!["__12'345"], words(b"__12'345"));
374 
375         assert_eq!(
376             vec!["tomorrowat4", ":", "00", ","],
377             words(b"tomorrowat4:00,")
378         );
379         assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
380         assert_eq!(vec!["X38"], words(b"X38"));
381 
382         assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
383         assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
384         assert_eq!(vec!["1XY"], words(b"1XY"));
385 
386         assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));
387 
388         // Tests that Vithkuqi works, which was introduced in Unicode 14.
389         // This test fails prior to Unicode 14.
390         assert_eq!(
391             vec!["\u{10570}\u{10597}"],
392             words("\u{10570}\u{10597}".as_bytes())
393         );
394     }
395 
words(bytes: &[u8]) -> Vec<&str>396     fn words(bytes: &[u8]) -> Vec<&str> {
397         bytes.words_with_breaks().collect()
398     }
399 
400     #[cfg(not(miri))]
strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]>401     fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
402         strs.iter().map(|s| s.as_ref().as_bytes()).collect()
403     }
404 
405     /// Return all of the UCD for word breaks.
406     #[cfg(not(miri))]
ucdtests() -> Vec<WordBreakTest>407     fn ucdtests() -> Vec<WordBreakTest> {
408         const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
409 
410         let mut tests = vec![];
411         for mut line in TESTDATA.lines() {
412             line = line.trim();
413             if line.starts_with("#") || line.contains("surrogate") {
414                 continue;
415             }
416             tests.push(line.parse().unwrap());
417         }
418         tests
419     }
420 }
421