• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use core::cmp;
12 use core::iter::Filter;
13 
14 // All of the logic for forward iteration over sentences
15 mod fwd {
16     use crate::tables::sentence::SentenceCat;
17     use core::cmp;
18 
19     // Describe a parsed part of source string as described in this table:
20     // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21     #[derive(Debug, Clone, Copy, PartialEq, Eq)]
22     enum StatePart {
23         Sot,
24         Eot,
25         Other,
26         CR,
27         LF,
28         Sep,
29         ATerm,
30         UpperLower,
31         ClosePlus,
32         SpPlus,
33         STerm,
34     }
35 
36     #[derive(Debug, Clone, PartialEq, Eq)]
37     struct SentenceBreaksState(pub [StatePart; 4]);
38 
39     const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40         StatePart::Sot,
41         StatePart::Sot,
42         StatePart::Sot,
43         StatePart::Sot,
44     ]);
45 
46     #[derive(Debug, Clone)]
47     pub struct SentenceBreaks<'a> {
48         pub string: &'a str,
49         pos: usize,
50         state: SentenceBreaksState,
51     }
52 
53     impl SentenceBreaksState {
54         // Attempt to advance the internal state by one part
55         // Whitespace and some punctutation will be collapsed
next(&self, cat: SentenceCat) -> SentenceBreaksState56         fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57             let &SentenceBreaksState(parts) = self;
58             let parts = match (parts[3], cat) {
59                 (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60                 (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61                 _ => [
62                     parts[1],
63                     parts[2],
64                     parts[3],
65                     match cat {
66                         SentenceCat::SC_CR => StatePart::CR,
67                         SentenceCat::SC_LF => StatePart::LF,
68                         SentenceCat::SC_Sep => StatePart::Sep,
69                         SentenceCat::SC_ATerm => StatePart::ATerm,
70                         SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
71                         SentenceCat::SC_Close => StatePart::ClosePlus,
72                         SentenceCat::SC_Sp => StatePart::SpPlus,
73                         SentenceCat::SC_STerm => StatePart::STerm,
74                         _ => StatePart::Other,
75                     },
76                 ],
77             };
78             SentenceBreaksState(parts)
79         }
80 
end(&self) -> SentenceBreaksState81         fn end(&self) -> SentenceBreaksState {
82             let &SentenceBreaksState(parts) = self;
83             SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
84         }
85 
86         // Helper function to check if state head matches a single `StatePart`
match1(&self, part: StatePart) -> bool87         fn match1(&self, part: StatePart) -> bool {
88             let &SentenceBreaksState(parts) = self;
89             part == parts[3]
90         }
91 
92         // Helper function to check if first two `StateParts` in state match
93         // the given two
match2(&self, part1: StatePart, part2: StatePart) -> bool94         fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
95             let &SentenceBreaksState(parts) = self;
96             part1 == parts[2] && part2 == parts[3]
97         }
98     }
99 
100     // https://unicode.org/reports/tr29/#SB8
101     // TODO cache this, it is currently quadratic
match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool102     fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
103         let &SentenceBreaksState(parts) = state;
104         let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
105         if parts[idx] == StatePart::ClosePlus {
106             idx -= 1
107         }
108 
109         if parts[idx] == StatePart::ATerm {
110             use crate::tables::sentence as se;
111 
112             for next_char in ahead.chars() {
113                 //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
114                 match se::sentence_category(next_char).2 {
115                     se::SC_Lower => return true,
116                     se::SC_OLetter
117                     | se::SC_Upper
118                     | se::SC_Sep
119                     | se::SC_CR
120                     | se::SC_LF
121                     | se::SC_STerm
122                     | se::SC_ATerm => return false,
123                     _ => continue,
124                 }
125             }
126         }
127 
128         false
129     }
130 
131     // https://unicode.org/reports/tr29/#SB8a
match_sb8a(state: &SentenceBreaksState) -> bool132     fn match_sb8a(state: &SentenceBreaksState) -> bool {
133         // SATerm Close* Sp*
134         let &SentenceBreaksState(parts) = state;
135         let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
136         if parts[idx] == StatePart::ClosePlus {
137             idx -= 1
138         }
139         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
140     }
141 
142     // https://unicode.org/reports/tr29/#SB9
match_sb9(state: &SentenceBreaksState) -> bool143     fn match_sb9(state: &SentenceBreaksState) -> bool {
144         // SATerm Close*
145         let &SentenceBreaksState(parts) = state;
146         let idx = if parts[3] == StatePart::ClosePlus {
147             2
148         } else {
149             3
150         };
151         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
152     }
153 
154     // https://unicode.org/reports/tr29/#SB11
match_sb11(state: &SentenceBreaksState) -> bool155     fn match_sb11(state: &SentenceBreaksState) -> bool {
156         // SATerm Close* Sp* ParaSep?
157         let &SentenceBreaksState(parts) = state;
158         let mut idx = match parts[3] {
159             StatePart::Sep | StatePart::CR | StatePart::LF => 2,
160             _ => 3,
161         };
162 
163         if parts[idx] == StatePart::SpPlus {
164             idx -= 1
165         }
166         if parts[idx] == StatePart::ClosePlus {
167             idx -= 1
168         }
169 
170         parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
171     }
172 
173     impl<'a> Iterator for SentenceBreaks<'a> {
174         // Returns the index of the character which follows a break
175         type Item = usize;
176 
177         #[inline]
size_hint(&self) -> (usize, Option<usize>)178         fn size_hint(&self) -> (usize, Option<usize>) {
179             let slen = self.string.len();
180             // A sentence could be one character
181             (cmp::min(slen, 2), Some(slen + 1))
182         }
183 
184         #[inline]
next(&mut self) -> Option<usize>185         fn next(&mut self) -> Option<usize> {
186             use crate::tables::sentence as se;
187 
188             for next_char in self.string[self.pos..].chars() {
189                 let position_before = self.pos;
190                 let state_before = self.state.clone();
191 
192                 let next_cat = se::sentence_category(next_char).2;
193 
194                 self.pos += next_char.len_utf8();
195                 self.state = self.state.next(next_cat);
196 
197                 match next_cat {
198                     // SB1 https://unicode.org/reports/tr29/#SB1
199                     _ if state_before.match1(StatePart::Sot) => return Some(position_before),
200 
201                     // SB2 is handled when inner iterator (chars) is finished
202 
203                     // SB3 https://unicode.org/reports/tr29/#SB3
204                     SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
205 
206                     // SB4 https://unicode.org/reports/tr29/#SB4
207                     _ if state_before.match1(StatePart::Sep)
208                         || state_before.match1(StatePart::CR)
209                         || state_before.match1(StatePart::LF) =>
210                     {
211                         return Some(position_before)
212                     }
213 
214                     // SB5 https://unicode.org/reports/tr29/#SB5
215                     SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
216 
217                     // SB6 https://unicode.org/reports/tr29/#SB6
218                     SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
219 
220                     // SB7 https://unicode.org/reports/tr29/#SB7
221                     SentenceCat::SC_Upper
222                         if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
223                     {
224                         continue
225                     }
226 
227                     // SB8 https://unicode.org/reports/tr29/#SB8
228                     _ if match_sb8(&state_before, &self.string[position_before..]) => continue,
229 
230                     // SB8a https://unicode.org/reports/tr29/#SB8a
231                     SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
232                         if match_sb8a(&state_before) =>
233                     {
234                         continue
235                     }
236 
237                     // SB9 https://unicode.org/reports/tr29/#SB9
238                     SentenceCat::SC_Close
239                     | SentenceCat::SC_Sp
240                     | SentenceCat::SC_Sep
241                     | SentenceCat::SC_CR
242                     | SentenceCat::SC_LF
243                         if match_sb9(&state_before) =>
244                     {
245                         continue
246                     }
247 
248                     // SB10 https://unicode.org/reports/tr29/#SB10
249                     SentenceCat::SC_Sp
250                     | SentenceCat::SC_Sep
251                     | SentenceCat::SC_CR
252                     | SentenceCat::SC_LF
253                         if match_sb8a(&state_before) =>
254                     {
255                         continue
256                     }
257 
258                     // SB11 https://unicode.org/reports/tr29/#SB11
259                     _ if match_sb11(&state_before) => return Some(position_before),
260 
261                     // SB998 https://unicode.org/reports/tr29/#SB998
262                     _ => continue,
263                 }
264             }
265 
266             // SB2 https://unicode.org/reports/tr29/#SB2
267             if self.state.match1(StatePart::Sot) || self.state.match1(StatePart::Eot) {
268                 None
269             } else {
270                 self.state = self.state.end();
271                 Some(self.pos)
272             }
273         }
274     }
275 
new_sentence_breaks(source: &str) -> SentenceBreaks<'_>276     pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> {
277         SentenceBreaks {
278             string: source,
279             pos: 0,
280             state: INITIAL_STATE,
281         }
282     }
283 }
284 
285 /// An iterator over the substrings of a string which, after splitting the string on
286 /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
287 /// contain any characters with the
288 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
289 /// property, or with
290 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
291 ///
292 /// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
293 /// trait. See its documentation for more.
294 ///
295 /// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
296 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
297 #[derive(Debug, Clone)]
298 pub struct UnicodeSentences<'a> {
299     inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
300 }
301 
302 /// External iterator for a string's
303 /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
304 ///
305 /// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
306 /// trait. See its documentation for more.
307 ///
308 /// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
309 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
310 #[derive(Debug, Clone)]
311 pub struct USentenceBounds<'a> {
312     iter: fwd::SentenceBreaks<'a>,
313     sentence_start: Option<usize>,
314 }
315 
316 /// External iterator for sentence boundaries and byte offsets.
317 ///
318 /// This struct is created by the [`split_sentence_bound_indices`] method on the
319 /// [`UnicodeSegmentation`] trait. See its documentation for more.
320 ///
321 /// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
322 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
323 #[derive(Debug, Clone)]
324 pub struct USentenceBoundIndices<'a> {
325     start_offset: usize,
326     iter: USentenceBounds<'a>,
327 }
328 
329 #[inline]
new_sentence_bounds(source: &str) -> USentenceBounds<'_>330 pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> {
331     USentenceBounds {
332         iter: fwd::new_sentence_breaks(source),
333         sentence_start: None,
334     }
335 }
336 
337 #[inline]
new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_>338 pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
339     USentenceBoundIndices {
340         start_offset: source.as_ptr() as usize,
341         iter: new_sentence_bounds(source),
342     }
343 }
344 
345 #[inline]
new_unicode_sentences(s: &str) -> UnicodeSentences<'_>346 pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
347     use super::UnicodeSegmentation;
348     use crate::tables::util::is_alphanumeric;
349 
350     fn has_alphanumeric(s: &&str) -> bool {
351         s.chars().any(is_alphanumeric)
352     }
353     let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
354 
355     UnicodeSentences {
356         inner: s.split_sentence_bounds().filter(has_alphanumeric),
357     }
358 }
359 
360 impl<'a> Iterator for UnicodeSentences<'a> {
361     type Item = &'a str;
362 
363     #[inline]
next(&mut self) -> Option<&'a str>364     fn next(&mut self) -> Option<&'a str> {
365         self.inner.next()
366     }
367 
368     #[inline]
size_hint(&self) -> (usize, Option<usize>)369     fn size_hint(&self) -> (usize, Option<usize>) {
370         self.inner.size_hint()
371     }
372 }
373 
374 impl<'a> Iterator for USentenceBounds<'a> {
375     type Item = &'a str;
376 
377     #[inline]
size_hint(&self) -> (usize, Option<usize>)378     fn size_hint(&self) -> (usize, Option<usize>) {
379         let (lower, upper) = self.iter.size_hint();
380         (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
381     }
382 
383     #[inline]
next(&mut self) -> Option<&'a str>384     fn next(&mut self) -> Option<&'a str> {
385         if self.sentence_start.is_none() {
386             if let Some(start_pos) = self.iter.next() {
387                 self.sentence_start = Some(start_pos)
388             } else {
389                 return None;
390             }
391         }
392 
393         if let Some(break_pos) = self.iter.next() {
394             let start_pos = self.sentence_start.unwrap();
395             let sentence = &self.iter.string[start_pos..break_pos];
396             self.sentence_start = Some(break_pos);
397             Some(sentence)
398         } else {
399             None
400         }
401     }
402 }
403 
404 impl<'a> Iterator for USentenceBoundIndices<'a> {
405     type Item = (usize, &'a str);
406 
407     #[inline]
next(&mut self) -> Option<(usize, &'a str)>408     fn next(&mut self) -> Option<(usize, &'a str)> {
409         self.iter
410             .next()
411             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
412     }
413 
414     #[inline]
size_hint(&self) -> (usize, Option<usize>)415     fn size_hint(&self) -> (usize, Option<usize>) {
416         self.iter.size_hint()
417     }
418 }
419