• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use core::cmp;
12 use core::iter::Filter;
13 
14 use crate::tables::word::WordCat;
15 
16 /// An iterator over the substrings of a string which, after splitting the string on
17 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18 /// contain any characters with the
19 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20 /// property, or with
21 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22 ///
23 /// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24 /// its documentation for more.
25 ///
26 /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
28 pub struct UnicodeWords<'a> {
29     inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
30 }
31 
32 impl<'a> Iterator for UnicodeWords<'a> {
33     type Item = &'a str;
34 
35     #[inline]
next(&mut self) -> Option<&'a str>36     fn next(&mut self) -> Option<&'a str> {
37         self.inner.next()
38     }
39 }
40 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
41     #[inline]
next_back(&mut self) -> Option<&'a str>42     fn next_back(&mut self) -> Option<&'a str> {
43         self.inner.next_back()
44     }
45 }
46 
47 /// An iterator over the substrings of a string which, after splitting the string on
48 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
49 /// contain any characters with the
50 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
51 /// property, or with
52 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
53 /// This iterator also provides the byte offsets for each substring.
54 ///
55 /// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
56 /// its documentation for more.
57 ///
58 /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
59 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60 pub struct UnicodeWordIndices<'a> {
61     inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
62 }
63 
64 impl<'a> Iterator for UnicodeWordIndices<'a> {
65     type Item = (usize, &'a str);
66 
67     #[inline]
next(&mut self) -> Option<(usize, &'a str)>68     fn next(&mut self) -> Option<(usize, &'a str)> {
69         self.inner.next()
70     }
71 }
72 impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
73     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>74     fn next_back(&mut self) -> Option<(usize, &'a str)> {
75         self.inner.next_back()
76     }
77 }
78 
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
    // The part of the input that has not been yielded yet; `next` trims it
    // from the front and `next_back` trims it from the back.
    string: &'a str,
    // Word category cached by `next` for the first character of `string`, so
    // the following call does not have to look it up again.
    cat: Option<WordCat>,
    // Same cache for `next_back`: the category of the last character of `string`.
    catb: Option<WordCat>,
}
93 
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string (`s.as_ptr() as usize`).
    // It is subtracted from each segment's address to obtain its byte offset.
    start_offset: usize,
    // The underlying segment iterator that does the actual boundary detection.
    iter: UWordBounds<'a>,
}
106 
107 impl<'a> UWordBoundIndices<'a> {
108     #[inline]
109     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
110     ///
111     /// ```rust
112     /// # use unicode_segmentation::UnicodeSegmentation;
113     /// let mut iter = "Hello world".split_word_bound_indices();
114     /// assert_eq!(iter.as_str(), "Hello world");
115     /// iter.next();
116     /// assert_eq!(iter.as_str(), " world");
117     /// iter.next();
118     /// assert_eq!(iter.as_str(), "world");
119     /// ```
as_str(&self) -> &'a str120     pub fn as_str(&self) -> &'a str {
121         self.iter.as_str()
122     }
123 }
124 
125 impl<'a> Iterator for UWordBoundIndices<'a> {
126     type Item = (usize, &'a str);
127 
128     #[inline]
next(&mut self) -> Option<(usize, &'a str)>129     fn next(&mut self) -> Option<(usize, &'a str)> {
130         self.iter
131             .next()
132             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
133     }
134 
135     #[inline]
size_hint(&self) -> (usize, Option<usize>)136     fn size_hint(&self) -> (usize, Option<usize>) {
137         self.iter.size_hint()
138     }
139 }
140 
141 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
142     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>143     fn next_back(&mut self) -> Option<(usize, &'a str)> {
144         self.iter
145             .next_back()
146             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
147     }
148 }
149 
// State machine for the word boundary rules. Both `next` and `next_back`
// start each segment in `Start` and move between these states as they
// classify one character at a time.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // Nothing has been classified yet for the segment being built.
    Start,
    // The last significant character was an ALetter.
    Letter,
    // The last significant character was a Hebrew_Letter.
    HLetter,
    // The last significant character was Numeric.
    Numeric,
    // The last significant character was Katakana.
    Katakana,
    // The last significant character was ExtendNumLet.
    ExtendNumLet,
    // Inside a run of Regional_Indicator characters (rule WB13c).
    Regional(RegionalState),
    // Waiting on, or scanning over, Format/Extend/ZWJ or "mid" characters;
    // the payload says what may legally follow.
    FormatExtend(FormatExtendType),
    // Forward: the segment started with a ZWJ. Backward: an emoji was just
    // consumed and a preceding ZWJ would join to it (rule WB3c).
    Zwj,
    // An emoji was just joined to a preceding ZWJ (rule WB3c).
    Emoji,
    // Inside a run of WSegSpace characters (rule WB3d).
    WSegSpace,
}
165 
// Subtypes for the FormatExtend state in UWordBoundsState. Each one records
// what the iterator is looking for after the character that put it into
// FormatExtend.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Only produced by `next_back`, where it is handled exactly like `Start`.
    AcceptAny,
    // A run of Format/Extend/ZWJ characters is being collected and is emitted
    // as-is once the run ends.
    AcceptNone,
    // A MidLetter/MidNumLet/Single_Quote was seen next to a letter; a letter
    // must follow, otherwise the iterator rewinds to the saved position.
    RequireLetter,
    // A Double_Quote was seen next to a Hebrew letter; another Hebrew letter
    // must follow, otherwise the iterator rewinds.
    RequireHLetter,
    // A Single_Quote was seen next to (forward) or as the end of (backward) a
    // possible Hebrew-letter word (rule WB7a).
    AcceptQLetter,
    // A MidNum/MidNumLet/Single_Quote was seen next to a digit; a digit must
    // follow, otherwise the iterator rewinds.
    RequireNumeric,
}
176 
// Progress through a pair of Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One regional indicator of a pair has been consumed.
    Half,
    // Both regional indicators of a pair have been consumed.
    Full,
    // Used by `next_back` only: a regional indicator was consumed but the
    // parity of the run before it has not been counted yet.
    Unknown,
}
183 
is_emoji(ch: char) -> bool184 fn is_emoji(ch: char) -> bool {
185     use crate::tables::emoji;
186     emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
187 }
188 
189 impl<'a> Iterator for UWordBounds<'a> {
190     type Item = &'a str;
191 
192     #[inline]
size_hint(&self) -> (usize, Option<usize>)193     fn size_hint(&self) -> (usize, Option<usize>) {
194         let slen = self.string.len();
195         (cmp::min(slen, 1), Some(slen))
196     }
197 
198     #[inline]
next(&mut self) -> Option<&'a str>199     fn next(&mut self) -> Option<&'a str> {
200         use self::FormatExtendType::*;
201         use self::UWordBoundsState::*;
202         use crate::tables::word as wd;
203         if self.string.len() == 0 {
204             return None;
205         }
206 
207         let mut take_curr = true;
208         let mut take_cat = true;
209         let mut idx = 0;
210         let mut saveidx = 0;
211         let mut state = Start;
212         let mut cat = wd::WC_Any;
213         let mut savecat = wd::WC_Any;
214 
215         // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
216         let mut skipped_format_extend = false;
217         for (curr, ch) in self.string.char_indices() {
218             idx = curr;
219             // Whether or not the previous category was ZWJ
220             // ZWJs get collapsed, so this handles precedence of WB3c over WB4
221             let prev_zwj = cat == wd::WC_ZWJ;
222             // if there's a category cached, grab it
223             cat = match self.cat {
224                 None => wd::word_category(ch).2,
225                 _ => self.cat.take().unwrap(),
226             };
227             take_cat = true;
228 
229             // handle rule WB4
230             // just skip all format, extend, and zwj chars
231             // note that Start is a special case: if there's a bunch of Format | Extend
232             // characters at the beginning of a block of text, dump them out as one unit.
233             //
234             // (This is not obvious from the wording of UAX#29, but if you look at the
235             // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
236             // then the "correct" interpretation of WB4 becomes apparent.)
237             if state != Start {
238                 match cat {
239                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
240                         skipped_format_extend = true;
241                         continue;
242                     }
243                     _ => {}
244                 }
245             }
246 
247             // rule WB3c
248             // WB4 makes all ZWJs collapse into the previous state
249             // but you can still be in a Zwj state if you started with Zwj
250             //
251             // This means that an EP + Zwj will collapse into EP, which is wrong,
252             // since EP+EP is not a boundary but EP+ZWJ+EP is
253             //
254             // Thus, we separately keep track of whether or not the last character
255             // was a ZWJ. This is an additional bit of state tracked outside of the
256             // state enum; the state enum represents the last non-zwj state encountered.
257             // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
258             // however we are in the previous state for the purposes of all other rules.
259             if prev_zwj {
260                 if is_emoji(ch) {
261                     state = Emoji;
262                     continue;
263                 }
264             }
265             // Don't use `continue` in this match without updating `cat`
266             state = match state {
267                 Start if cat == wd::WC_CR => {
268                     idx += match self.get_next_cat(idx) {
269                         Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
270                         _ => 0,
271                     };
272                     break; // rule WB3a
273                 }
274                 Start => match cat {
275                     wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
276                     wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
277                     wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
278                     wd::WC_Katakana => Katakana,         // rule WB13, WB13a
279                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
280                     wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
281                     wd::WC_LF | wd::WC_Newline => break, // rule WB3a
282                     wd::WC_ZWJ => Zwj,                   // rule WB3c
283                     wd::WC_WSegSpace => WSegSpace,       // rule WB3d
284                     _ => {
285                         if let Some(ncat) = self.get_next_cat(idx) {
286                             // rule WB4
287                             if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
288                             {
289                                 state = FormatExtend(AcceptNone);
290                                 self.cat = Some(ncat);
291                                 continue;
292                             }
293                         }
294                         break; // rule WB999
295                     }
296                 },
297                 WSegSpace => match cat {
298                     wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
299                     _ => {
300                         take_curr = false;
301                         break;
302                     }
303                 },
304                 Zwj => {
305                     // We already handle WB3c above.
306                     take_curr = false;
307                     break;
308                 }
309                 Letter | HLetter => match cat {
310                     wd::WC_ALetter => Letter,            // rule WB5
311                     wd::WC_Hebrew_Letter => HLetter,     // rule WB5
312                     wd::WC_Numeric => Numeric,           // rule WB9
313                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
314                     wd::WC_Double_Quote if state == HLetter => {
315                         savecat = cat;
316                         saveidx = idx;
317                         FormatExtend(RequireHLetter) // rule WB7b
318                     }
319                     wd::WC_Single_Quote if state == HLetter => {
320                         FormatExtend(AcceptQLetter) // rule WB7a
321                     }
322                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
323                         savecat = cat;
324                         saveidx = idx;
325                         FormatExtend(RequireLetter) // rule WB6
326                     }
327                     _ => {
328                         take_curr = false;
329                         break;
330                     }
331                 },
332                 Numeric => match cat {
333                     wd::WC_Numeric => Numeric,           // rule WB8
334                     wd::WC_ALetter => Letter,            // rule WB10
335                     wd::WC_Hebrew_Letter => HLetter,     // rule WB10
336                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
337                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
338                         savecat = cat;
339                         saveidx = idx;
340                         FormatExtend(RequireNumeric) // rule WB12
341                     }
342                     _ => {
343                         take_curr = false;
344                         break;
345                     }
346                 },
347                 Katakana => match cat {
348                     wd::WC_Katakana => Katakana,         // rule WB13
349                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
350                     _ => {
351                         take_curr = false;
352                         break;
353                     }
354                 },
355                 ExtendNumLet => match cat {
356                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
357                     wd::WC_ALetter => Letter,            // rule WB13b
358                     wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
359                     wd::WC_Numeric => Numeric,           // rule WB13b
360                     wd::WC_Katakana => Katakana,         // rule WB13b
361                     _ => {
362                         take_curr = false;
363                         break;
364                     }
365                 },
366                 Regional(RegionalState::Full) => {
367                     // if it reaches here we've gone too far,
368                     // a full flag can only compose with ZWJ/Extend/Format
369                     // proceeding it.
370                     take_curr = false;
371                     break;
372                 }
373                 Regional(RegionalState::Half) => match cat {
374                     wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
375                     _ => {
376                         take_curr = false;
377                         break;
378                     }
379                 },
380                 Regional(_) => {
381                     unreachable!("RegionalState::Unknown should not occur on forward iteration")
382                 }
383                 Emoji => {
384                     // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
385                     take_curr = false;
386                     break;
387                 }
388                 FormatExtend(t) => match t {
389                     // handle FormatExtends depending on what type
390                     RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
391                     RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
392                     RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
393                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
394                     AcceptNone | AcceptQLetter => {
395                         take_curr = false; // emit all the Format|Extend characters
396                         take_cat = false;
397                         break;
398                     }
399                     _ => break, // rewind (in if statement below)
400                 },
401             }
402         }
403 
404         if let FormatExtend(t) = state {
405             // we were looking for something and didn't find it; we have to back up
406             if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
407                 idx = saveidx;
408                 cat = savecat;
409                 take_curr = false;
410             }
411         }
412 
413         self.cat = if take_curr {
414             idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
415             None
416         } else if take_cat {
417             Some(cat)
418         } else {
419             None
420         };
421 
422         let retstr = &self.string[..idx];
423         self.string = &self.string[idx..];
424         Some(retstr)
425     }
426 }
427 
428 impl<'a> DoubleEndedIterator for UWordBounds<'a> {
429     #[inline]
next_back(&mut self) -> Option<&'a str>430     fn next_back(&mut self) -> Option<&'a str> {
431         use self::FormatExtendType::*;
432         use self::UWordBoundsState::*;
433         use crate::tables::word as wd;
434         if self.string.len() == 0 {
435             return None;
436         }
437 
438         let mut take_curr = true;
439         let mut take_cat = true;
440         let mut idx = self.string.len();
441         idx -= self.string.chars().next_back().unwrap().len_utf8();
442         let mut previdx = idx;
443         let mut saveidx = idx;
444         let mut state = Start;
445         let mut savestate = Start;
446         let mut cat = wd::WC_Any;
447 
448         let mut skipped_format_extend = false;
449 
450         for (curr, ch) in self.string.char_indices().rev() {
451             previdx = idx;
452             idx = curr;
453 
454             // if there's a category cached, grab it
455             cat = match self.catb {
456                 None => wd::word_category(ch).2,
457                 _ => self.catb.take().unwrap(),
458             };
459             take_cat = true;
460 
461             // backward iterator over word boundaries. Mostly the same as the forward
462             // iterator, with two weirdnesses:
463             // (1) If we encounter a single quote in the Start state, we have to check for a
464             //     Hebrew Letter immediately before it.
465             // (2) Format and Extend char handling takes some gymnastics.
466 
467             if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
468                 // WB3c has more priority so we should not
469                 // fold in that case
470                 if match state {
471                     FormatExtend(_) | Start => false,
472                     _ => true,
473                 } {
474                     saveidx = previdx;
475                     savestate = state;
476                     state = FormatExtend(AcceptNone);
477                 }
478 
479                 if state != Start {
480                     continue;
481                 }
482             } else if state == FormatExtend(AcceptNone) {
483                 // finished a scan of some Format|Extend chars, restore previous state
484                 state = savestate;
485                 previdx = saveidx;
486                 take_cat = false;
487                 skipped_format_extend = true;
488             }
489 
490             // Don't use `continue` in this match without updating `catb`
491             state = match state {
492                 Start | FormatExtend(AcceptAny) => match cat {
493                     _ if is_emoji(ch) => Zwj,
494                     wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
495                     wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
496                     wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
497                     wd::WC_Katakana => Katakana, // rule WB13, WB13b
498                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
499                     wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
500                     // rule WB4:
501                     wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
502                     wd::WC_Single_Quote => {
503                         saveidx = idx;
504                         FormatExtend(AcceptQLetter) // rule WB7a
505                     }
506                     wd::WC_WSegSpace => WSegSpace,
507                     wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
508                         if state == Start {
509                             if cat == wd::WC_LF {
510                                 idx -= match self.get_prev_cat(idx) {
511                                     Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
512                                     _ => 0,
513                                 };
514                             }
515                         } else {
516                             take_curr = false;
517                         }
518                         break; // rule WB3a
519                     }
520                     _ => break, // rule WB999
521                 },
522                 Zwj => match cat {
523                     // rule WB3c
524                     wd::WC_ZWJ => FormatExtend(AcceptAny),
525                     _ => {
526                         take_curr = false;
527                         break;
528                     }
529                 },
530                 WSegSpace => match cat {
531                     // rule WB3d
532                     wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
533                     _ => {
534                         take_curr = false;
535                         break;
536                     }
537                 },
538                 Letter | HLetter => match cat {
539                     wd::WC_ALetter => Letter,            // rule WB5
540                     wd::WC_Hebrew_Letter => HLetter,     // rule WB5
541                     wd::WC_Numeric => Numeric,           // rule WB10
542                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
543                     wd::WC_Double_Quote if state == HLetter => {
544                         saveidx = previdx;
545                         FormatExtend(RequireHLetter) // rule WB7c
546                     }
547                     wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
548                         saveidx = previdx;
549                         FormatExtend(RequireLetter) // rule WB7
550                     }
551                     _ => {
552                         take_curr = false;
553                         break;
554                     }
555                 },
556                 Numeric => match cat {
557                     wd::WC_Numeric => Numeric,           // rule WB8
558                     wd::WC_ALetter => Letter,            // rule WB9
559                     wd::WC_Hebrew_Letter => HLetter,     // rule WB9
560                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
561                     wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
562                         saveidx = previdx;
563                         FormatExtend(RequireNumeric) // rule WB11
564                     }
565                     _ => {
566                         take_curr = false;
567                         break;
568                     }
569                 },
570                 Katakana => match cat {
571                     wd::WC_Katakana => Katakana,         // rule WB13
572                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
573                     _ => {
574                         take_curr = false;
575                         break;
576                     }
577                 },
578                 ExtendNumLet => match cat {
579                     wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
580                     wd::WC_ALetter => Letter,            // rule WB13a
581                     wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
582                     wd::WC_Numeric => Numeric,           // rule WB13a
583                     wd::WC_Katakana => Katakana,         // rule WB13a
584                     _ => {
585                         take_curr = false;
586                         break;
587                     }
588                 },
589                 Regional(mut regional_state) => match cat {
590                     // rule WB13c
591                     wd::WC_Regional_Indicator => {
592                         if regional_state == RegionalState::Unknown {
593                             let count = self.string[..previdx]
594                                 .chars()
595                                 .rev()
596                                 .map(|c| wd::word_category(c).2)
597                                 .filter(|&c| {
598                                     !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
599                                 })
600                                 .take_while(|&c| c == wd::WC_Regional_Indicator)
601                                 .count();
602                             regional_state = if count % 2 == 0 {
603                                 RegionalState::Full
604                             } else {
605                                 RegionalState::Half
606                             };
607                         }
608                         if regional_state == RegionalState::Full {
609                             take_curr = false;
610                             break;
611                         } else {
612                             Regional(RegionalState::Full)
613                         }
614                     }
615                     _ => {
616                         take_curr = false;
617                         break;
618                     }
619                 },
620                 Emoji => {
621                     if is_emoji(ch) {
622                         // rule WB3c
623                         Zwj
624                     } else {
625                         take_curr = false;
626                         break;
627                     }
628                 }
629                 FormatExtend(t) => match t {
630                     RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
631                     RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
632                     RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
633                     AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
634                     RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
635                     _ => break,                                         // backtrack will happens
636                 },
637             }
638         }
639 
640         if let FormatExtend(t) = state {
641             // if we required something but didn't find it, backtrack
642             if t == RequireLetter
643                 || t == RequireHLetter
644                 || t == RequireNumeric
645                 || t == AcceptNone
646                 || t == AcceptQLetter
647             {
648                 previdx = saveidx;
649                 take_cat = false;
650                 take_curr = false;
651             }
652         }
653 
654         self.catb = if take_curr {
655             None
656         } else {
657             idx = previdx;
658             if take_cat {
659                 Some(cat)
660             } else {
661                 None
662             }
663         };
664 
665         let retstr = &self.string[idx..];
666         self.string = &self.string[..idx];
667         Some(retstr)
668     }
669 }
670 
671 impl<'a> UWordBounds<'a> {
672     #[inline]
673     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
674     ///
675     /// ```rust
676     /// # use unicode_segmentation::UnicodeSegmentation;
677     /// let mut iter = "Hello world".split_word_bounds();
678     /// assert_eq!(iter.as_str(), "Hello world");
679     /// iter.next();
680     /// assert_eq!(iter.as_str(), " world");
681     /// iter.next();
682     /// assert_eq!(iter.as_str(), "world");
683     /// ```
as_str(&self) -> &'a str684     pub fn as_str(&self) -> &'a str {
685         self.string
686     }
687 
688     #[inline]
get_next_cat(&self, idx: usize) -> Option<WordCat>689     fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
690         use crate::tables::word as wd;
691         let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
692         if nidx < self.string.len() {
693             let nch = self.string[nidx..].chars().next().unwrap();
694             Some(wd::word_category(nch).2)
695         } else {
696             None
697         }
698     }
699 
700     #[inline]
get_prev_cat(&self, idx: usize) -> Option<WordCat>701     fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
702         use crate::tables::word as wd;
703         if idx > 0 {
704             let nch = self.string[..idx].chars().next_back().unwrap();
705             Some(wd::word_category(nch).2)
706         } else {
707             None
708         }
709     }
710 }
711 
712 #[inline]
new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b>713 pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
714     UWordBounds {
715         string: s,
716         cat: None,
717         catb: None,
718     }
719 }
720 
721 #[inline]
new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b>722 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
723     UWordBoundIndices {
724         start_offset: s.as_ptr() as usize,
725         iter: new_word_bounds(s),
726     }
727 }
728 
729 #[inline]
has_alphanumeric(s: &&str) -> bool730 fn has_alphanumeric(s: &&str) -> bool {
731     use crate::tables::util::is_alphanumeric;
732 
733     s.chars().any(|c| is_alphanumeric(c))
734 }
735 
736 #[inline]
new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b>737 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
738     use super::UnicodeSegmentation;
739 
740     UnicodeWords {
741         inner: s.split_word_bounds().filter(has_alphanumeric),
742     }
743 }
744 
745 #[inline]
new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b>746 pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
747     use super::UnicodeSegmentation;
748 
749     UnicodeWordIndices {
750         inner: s
751             .split_word_bound_indices()
752             .filter(|(_, c)| has_alphanumeric(c)),
753     }
754 }
755