• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use regex_automata::DFA;
2 
3 use crate::ext_slice::ByteSlice;
4 use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD;
5 use crate::utf8;
6 
/// An iterator over sentences in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct Sentences<'a> {
    /// The bytes that have not yet been yielded as sentences. The iterator
    /// advances this slice past each sentence it returns.
    bs: &'a [u8],
}
25 
26 impl<'a> Sentences<'a> {
new(bs: &'a [u8]) -> Sentences<'a>27     pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
28         Sentences { bs }
29     }
30 
31     /// View the underlying data as a subslice of the original data.
32     ///
33     /// The slice returned has the same lifetime as the original slice, and so
34     /// the iterator can continue to be used while this exists.
35     ///
36     /// # Examples
37     ///
38     /// ```
39     /// use bstr::ByteSlice;
40     ///
41     /// let mut it = b"I want this. Not that. Right now.".sentences();
42     ///
43     /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
44     /// it.next();
45     /// assert_eq!(b"Not that. Right now.", it.as_bytes());
46     /// it.next();
47     /// it.next();
48     /// assert_eq!(b"", it.as_bytes());
49     /// ```
50     #[inline]
as_bytes(&self) -> &'a [u8]51     pub fn as_bytes(&self) -> &'a [u8] {
52         self.bs
53     }
54 }
55 
56 impl<'a> Iterator for Sentences<'a> {
57     type Item = &'a str;
58 
59     #[inline]
next(&mut self) -> Option<&'a str>60     fn next(&mut self) -> Option<&'a str> {
61         let (sentence, size) = decode_sentence(self.bs);
62         if size == 0 {
63             return None;
64         }
65         self.bs = &self.bs[size..];
66         Some(sentence)
67     }
68 }
69 
/// An iterator over sentences in a byte string, along with their byte offsets.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// sentence yielded with those indices. For example, when this iterator
/// encounters `\xFF` in the byte string, then it will yield a pair of indices
/// ranging over a single byte, but will provide an `&str` equivalent to
/// `"\u{FFFD}"`, which is three bytes in length. However, when given only
/// valid UTF-8, then all indices are in exact correspondence with their paired
/// sentence.
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not yet been yielded as sentences.
    bs: &'a [u8],
    /// Total number of bytes consumed so far; it starts at zero and grows by
    /// the size of each yielded sentence, so it is the start offset of the
    /// next sentence.
    forward_index: usize,
}
97 
98 impl<'a> SentenceIndices<'a> {
new(bs: &'a [u8]) -> SentenceIndices<'a>99     pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
100         SentenceIndices { bs: bs, forward_index: 0 }
101     }
102 
103     /// View the underlying data as a subslice of the original data.
104     ///
105     /// The slice returned has the same lifetime as the original slice, and so
106     /// the iterator can continue to be used while this exists.
107     ///
108     /// # Examples
109     ///
110     /// ```
111     /// use bstr::ByteSlice;
112     ///
113     /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
114     ///
115     /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
116     /// it.next();
117     /// assert_eq!(b"Not that. Right now.", it.as_bytes());
118     /// it.next();
119     /// it.next();
120     /// assert_eq!(b"", it.as_bytes());
121     /// ```
122     #[inline]
as_bytes(&self) -> &'a [u8]123     pub fn as_bytes(&self) -> &'a [u8] {
124         self.bs
125     }
126 }
127 
128 impl<'a> Iterator for SentenceIndices<'a> {
129     type Item = (usize, usize, &'a str);
130 
131     #[inline]
next(&mut self) -> Option<(usize, usize, &'a str)>132     fn next(&mut self) -> Option<(usize, usize, &'a str)> {
133         let index = self.forward_index;
134         let (word, size) = decode_sentence(self.bs);
135         if size == 0 {
136             return None;
137         }
138         self.bs = &self.bs[size..];
139         self.forward_index += size;
140         Some((index, index + size, word))
141     }
142 }
143 
decode_sentence(bs: &[u8]) -> (&str, usize)144 fn decode_sentence(bs: &[u8]) -> (&str, usize) {
145     if bs.is_empty() {
146         ("", 0)
147     } else if let Some(end) = SENTENCE_BREAK_FWD.find(bs) {
148         // Safe because a match can only occur for valid UTF-8.
149         let sentence = unsafe { bs[..end].to_str_unchecked() };
150         (sentence, sentence.len())
151     } else {
152         const INVALID: &'static str = "\u{FFFD}";
153         // No match on non-empty bytes implies we found invalid UTF-8.
154         let (_, size) = utf8::decode_lossy(bs);
155         (INVALID, size)
156     }
157 }
158 
#[cfg(test)]
mod tests {
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Checks the forward iterator against every applicable UCD sentence
    /// break test: the concatenation of the expected sentences must split
    /// back into exactly those sentences.
    #[test]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.sentences.concat();
            let got = sentences(given.as_bytes());
            assert_eq!(
                test.sentences,
                got,
                "\n\nsentence forward break test {} failed:\n\
                 given:    {:?}\n\
                 expected: {:?}\n\
                 got:      {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.sentences),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        /// Asserts that `input` splits into exactly the `expected` sentences.
        fn assert_sentences(expected: &[&str], input: &[u8]) {
            assert_eq!(expected.to_vec(), sentences(input));
        }

        // Two trailing periods: a break only before an uppercase letter.
        assert_sentences(&["a.. ", "A"], b"a.. A");
        assert_sentences(&["a.. a"], b"a.. a");

        // Three trailing periods behave the same way.
        assert_sentences(&["a... ", "A"], b"a... A");
        assert_sentences(&["a... a"], b"a... a");

        assert_sentences(&["a...,..., a"], b"a...,..., a");
    }

    /// Collects every sentence yielded for `bytes` into a vector.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        bytes.sentences().collect::<Vec<_>>()
    }

    /// Views each string as its raw bytes, for use in failure messages.
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        let mut out = Vec::with_capacity(strs.len());
        for s in strs {
            out.push(s.as_ref().as_bytes());
        }
        out
    }

    /// Parses the bundled UCD sentence break test data, skipping comment
    /// lines and any line that mentions surrogates.
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        TESTDATA
            .lines()
            .map(str::trim)
            .filter(|line| !(line.starts_with("#") || line.contains("surrogate")))
            .map(|line| line.parse::<SentenceBreakTest>().unwrap())
            .collect()
    }
}
221