1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use core::cmp;
12 use core::iter::Filter;
13
14 use crate::tables::word::WordCat;
15
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// Segments that contain no such character are skipped entirely.
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
    // Word-boundary segments filtered through a predicate. The predicate is stored as a
    // plain function pointer so that the field type can be named.
    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
31
32 impl<'a> Iterator for UnicodeWords<'a> {
33 type Item = &'a str;
34
35 #[inline]
next(&mut self) -> Option<&'a str>36 fn next(&mut self) -> Option<&'a str> {
37 self.inner.next()
38 }
39 }
40 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
41 #[inline]
next_back(&mut self) -> Option<&'a str>42 fn next_back(&mut self) -> Option<&'a str> {
43 self.inner.next_back()
44 }
45 }
46
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// Each item is an `(offset, substring)` pair.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
    // `(offset, segment)` pairs filtered through a predicate. The predicate is stored as a
    // plain function pointer so that the field type can be named.
    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
63
64 impl<'a> Iterator for UnicodeWordIndices<'a> {
65 type Item = (usize, &'a str);
66
67 #[inline]
next(&mut self) -> Option<(usize, &'a str)>68 fn next(&mut self) -> Option<(usize, &'a str)> {
69 self.inner.next()
70 }
71 }
72 impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
73 #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>74 fn next_back(&mut self) -> Option<(usize, &'a str)> {
75 self.inner.next_back()
76 }
77 }
78
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// Segments can be taken from the front (`next`) and from the back (`next_back`).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
    // The part of the input that has not been yielded yet; shrinks from the front on
    // `next` and from the back on `next_back`.
    string: &'a str,
    // Word category cached by `next` for the first character it examines on the
    // following call, so that character does not need a second table lookup.
    cat: Option<WordCat>,
    // Same as `cat`, but cached by `next_back` for backward iteration.
    catb: Option<WordCat>,
}
93
/// External iterator for word boundaries and byte offsets.
///
/// Each item is an `(offset, segment)` pair, where `offset` is the byte position of
/// `segment` relative to the start of the string the iterator was created from.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string, stored as an integer. It is
    // subtracted from each segment's address to obtain that segment's byte offset.
    start_offset: usize,
    // The underlying segment iterator.
    iter: UWordBounds<'a>,
}
106
107 impl<'a> UWordBoundIndices<'a> {
108 #[inline]
109 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
110 ///
111 /// ```rust
112 /// # use unicode_segmentation::UnicodeSegmentation;
113 /// let mut iter = "Hello world".split_word_bound_indices();
114 /// assert_eq!(iter.as_str(), "Hello world");
115 /// iter.next();
116 /// assert_eq!(iter.as_str(), " world");
117 /// iter.next();
118 /// assert_eq!(iter.as_str(), "world");
119 /// ```
as_str(&self) -> &'a str120 pub fn as_str(&self) -> &'a str {
121 self.iter.as_str()
122 }
123 }
124
125 impl<'a> Iterator for UWordBoundIndices<'a> {
126 type Item = (usize, &'a str);
127
128 #[inline]
next(&mut self) -> Option<(usize, &'a str)>129 fn next(&mut self) -> Option<(usize, &'a str)> {
130 self.iter
131 .next()
132 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
133 }
134
135 #[inline]
size_hint(&self) -> (usize, Option<usize>)136 fn size_hint(&self) -> (usize, Option<usize>) {
137 self.iter.size_hint()
138 }
139 }
140
141 impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
142 #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>143 fn next_back(&mut self) -> Option<(usize, &'a str)> {
144 self.iter
145 .next_back()
146 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
147 }
148 }
149
// State machine for the word boundary rules.
//
// Both the forward and the backward iterator begin every segment in `Start` and
// move between the remaining states as characters are classified. Most states are
// named after the word category of the last significant character seen.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // No character of the current segment has been classified yet.
    Start,
    // Last significant character was an ALetter.
    Letter,
    // Last significant character was a Hebrew_Letter.
    HLetter,
    // Last significant character was Numeric.
    Numeric,
    // Last significant character was Katakana.
    Katakana,
    // Last significant character was ExtendNumLet.
    ExtendNumLet,
    // Inside a run of Regional_Indicator characters; the payload tracks pairing.
    Regional(RegionalState),
    // A character was seen whose effect depends on what comes next (or, in `Start`,
    // a run of Format/Extend/ZWJ is being collected); the payload says what is
    // accepted or required.
    FormatExtend(FormatExtendType),
    // Forward: the segment started with a ZWJ. Backward: an Extended_Pictographic
    // character was just seen, so a ZWJ before it attaches to it (rule WB3c).
    Zwj,
    // Forward: an Extended_Pictographic character was joined after a ZWJ (rule WB3c).
    Emoji,
    // Last significant character was WSegSpace (rule WB3d).
    WSegSpace,
}
165
// Subtypes for the FormatExtend state in UWordBoundsState. Each one records what the
// iterator is waiting for after the character that put it into FormatExtend.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Used by the backward iterator only: any category may follow and is classified
    // exactly as it would be from the Start state.
    AcceptAny,
    // Nothing further is joined: the collected Format/Extend/ZWJ run is emitted as is.
    AcceptNone,
    // A letter (ALetter or Hebrew_Letter) must come next, otherwise the iterator
    // rewinds to the saved position (rules WB6/WB7).
    RequireLetter,
    // A Hebrew_Letter must come next, otherwise the iterator rewinds (rules WB7b/WB7c).
    RequireHLetter,
    // A single quote next to a Hebrew_Letter (rule WB7a): a letter is joined if
    // present, but its absence does not invalidate the quote on forward iteration.
    AcceptQLetter,
    // A Numeric must come next, otherwise the iterator rewinds (rules WB11/WB12).
    RequireNumeric,
}
176
// Pairing progress inside a run of Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator of a pair has been seen; one more may be joined.
    Half,
    // A complete pair has been seen; a further indicator starts a new segment.
    Full,
    // Backward iteration only: pairing is not known yet and is resolved by counting
    // the indicators that precede the current position.
    Unknown,
}
183
is_emoji(ch: char) -> bool184 fn is_emoji(ch: char) -> bool {
185 use crate::tables::emoji;
186 emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
187 }
188
// Forward iteration over word-boundary segments.
impl<'a> Iterator for UWordBounds<'a> {
    type Item = &'a str;

    // Lower bound: 1 for a non-empty remainder, 0 otherwise.
    // Upper bound: the number of bytes left in the remainder.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        (cmp::min(slen, 1), Some(slen))
    }

    // Splits the next segment off the front of `self.string` and returns it.
    //
    // The remainder is scanned character by character, driving the `UWordBoundsState`
    // machine until a boundary is found. Returns `None` once the remainder is empty.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // Whether the character at `idx` belongs to the segment being returned.
        let mut take_curr = true;
        // Whether `cat` should be cached in `self.cat` for the next call.
        let mut take_cat = true;
        // Byte offset of the character being examined; ends up as the split point.
        let mut idx = 0;
        // Position and category to rewind to when a "require" state is not satisfied.
        let mut saveidx = 0;
        let mut state = Start;
        let mut cat = wd::WC_Any;
        let mut savecat = wd::WC_Any;

        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
        let mut skipped_format_extend = false;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;
            // Whether or not the previous category was ZWJ
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
            let prev_zwj = cat == wd::WC_ZWJ;
            // if there's a category cached, grab it
            cat = match self.cat {
                None => wd::word_category(ch).2,
                _ => self.cat.take().unwrap(),
            };
            take_cat = true;

            // handle rule WB4
            // just skip all format, extend, and zwj chars
            // note that Start is a special case: if there's a bunch of Format | Extend
            // characters at the beginning of a block of text, dump them out as one unit.
            //
            // (This is not obvious from the wording of UAX#29, but if you look at the
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
            // then the "correct" interpretation of WB4 becomes apparent.)
            if state != Start {
                match cat {
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                        skipped_format_extend = true;
                        continue;
                    }
                    _ => {}
                }
            }

            // rule WB3c
            // WB4 makes all ZWJs collapse into the previous state
            // but you can still be in a Zwj state if you started with Zwj
            //
            // This means that an EP + Zwj will collapse into EP, which is wrong,
            // since EP+EP is not a boundary but EP+ZWJ+EP is
            //
            // Thus, we separately keep track of whether or not the last character
            // was a ZWJ. This is an additional bit of state tracked outside of the
            // state enum; the state enum represents the last non-zwj state encountered.
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
            // however we are in the previous state for the purposes of all other rules.
            if prev_zwj {
                if is_emoji(ch) {
                    state = Emoji;
                    continue;
                }
            }
            // Don't use `continue` in this match without updating `cat`
            state = match state {
                Start if cat == wd::WC_CR => {
                    // A CR immediately followed by LF is kept together; LF is one byte.
                    idx += match self.get_next_cat(idx) {
                        Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
                        _ => 0,
                    };
                    break; // rule WB3a
                }
                Start => match cat {
                    wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                    wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
                    wd::WC_Katakana => Katakana, // rule WB13, WB13a
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
                    wd::WC_ZWJ => Zwj, // rule WB3c
                    wd::WC_WSegSpace => WSegSpace, // rule WB3d
                    _ => {
                        // Any other character normally forms a segment by itself, unless
                        // Format/Extend/ZWJ characters follow and attach to it.
                        if let Some(ncat) = self.get_next_cat(idx) {
                            // rule WB4
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
                            {
                                state = FormatExtend(AcceptNone);
                                // The next character's category is already known; cache it
                                // so the next loop iteration does not look it up again.
                                self.cat = Some(ncat);
                                continue;
                            }
                        }
                        break; // rule WB999
                    }
                },
                WSegSpace => match cat {
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Zwj => {
                    // We already handle WB3c above.
                    take_curr = false;
                    break;
                }
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter, // rule WB5
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5
                    wd::WC_Numeric => Numeric, // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Double_Quote if state == HLetter => {
                        // Remember where to cut if no Hebrew letter follows the quote.
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireHLetter) // rule WB7b
                    }
                    wd::WC_Single_Quote if state == HLetter => {
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        // Remember where to cut if no letter follows the mid character.
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireLetter) // rule WB6
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric, // rule WB8
                    wd::WC_ALetter => Letter, // rule WB10
                    wd::WC_Hebrew_Letter => HLetter, // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        // Remember where to cut if no digit follows the mid character.
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireNumeric) // rule WB12
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana, // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter, // rule WB13b
                    wd::WC_Hebrew_Letter => HLetter, // rule WB13b
                    wd::WC_Numeric => Numeric, // rule WB13b
                    wd::WC_Katakana => Katakana, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(RegionalState::Full) => {
                    // if it reaches here we've gone too far,
                    // a full flag can only compose with ZWJ/Extend/Format
                    // following it.
                    take_curr = false;
                    break;
                }
                Regional(RegionalState::Half) => match cat {
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(_) => {
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
                }
                Emoji => {
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                    take_curr = false;
                    break;
                }
                FormatExtend(t) => match t {
                    // handle FormatExtends depending on what type
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    AcceptNone | AcceptQLetter => {
                        take_curr = false; // emit all the Format|Extend characters
                        take_cat = false;
                        break;
                    }
                    _ => break, // rewind (in if statement below)
                },
            }
        }

        if let FormatExtend(t) = state {
            // we were looking for something and didn't find it; we have to back up
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                idx = saveidx;
                cat = savecat;
                take_curr = false;
            }
        }

        // Decide where to cut and what to cache for the next call:
        // - if the current character belongs to this segment, cut after it and cache nothing;
        // - otherwise cut before it and, when allowed, cache its category.
        self.cat = if take_curr {
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
            None
        } else if take_cat {
            Some(cat)
        } else {
            None
        };

        // Hand out the prefix and keep the rest for subsequent calls.
        let retstr = &self.string[..idx];
        self.string = &self.string[idx..];
        Some(retstr)
    }
}
427
// Backward iteration over word-boundary segments.
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    // Splits the last segment off the back of `self.string` and returns it.
    //
    // The remainder is scanned from its end towards its start, driving the
    // `UWordBoundsState` machine until a boundary is found. Returns `None` once the
    // remainder is empty.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // Whether the character at `idx` belongs to the segment being returned.
        let mut take_curr = true;
        // Whether `cat` should be cached in `self.catb` for the next call.
        let mut take_cat = true;
        // Start at the byte offset of the last character of the remainder.
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        // Offset of the character visited just before the current one, i.e. the one
        // immediately to its right.
        let mut previdx = idx;
        // Split point to fall back to when backtracking is needed.
        let mut saveidx = idx;
        let mut state = Start;
        // State to restore once a run of Format/Extend characters has been passed.
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        // Set once a run of Format/Extend characters has been passed; blocks WB3d joining.
        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch).2,
                _ => self.catb.take().unwrap(),
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            // Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has more priority so we should not
                // fold in that case
                if match state {
                    FormatExtend(_) | Start => false,
                    _ => true,
                } {
                    // Entering a run of Format/Extend characters: remember where we were
                    // so the state can be restored (or the cut made) afterwards.
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    _ if is_emoji(ch) => Zwj,
                    wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana, // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                // An LF immediately preceded by CR is kept together; CR is one byte.
                                idx -= match self.get_prev_cat(idx) {
                                    Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
                                    _ => 0,
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break; // rule WB3a
                    }
                    _ => break, // rule WB999
                },
                Zwj => match cat {
                    // rule WB3c
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat {
                    // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter, // rule WB5
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5
                    wd::WC_Numeric => Numeric, // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        // Remember where to cut if no Hebrew letter precedes the quote.
                        saveidx = previdx;
                        FormatExtend(RequireHLetter) // rule WB7c
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        // Remember where to cut if no letter precedes the mid character.
                        saveidx = previdx;
                        FormatExtend(RequireLetter) // rule WB7
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric, // rule WB8
                    wd::WC_ALetter => Letter, // rule WB9
                    wd::WC_Hebrew_Letter => HLetter, // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        // Remember where to cut if no digit precedes the mid character.
                        saveidx = previdx;
                        FormatExtend(RequireNumeric) // rule WB11
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana, // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter, // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter, // rule WB13a
                    wd::WC_Numeric => Numeric, // rule WB13a
                    wd::WC_Katakana => Katakana, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Pairing is decided from the front of a run, so count the
                            // indicators before `previdx` (ignoring ZWJ/Extend/Format):
                            // an even count means the indicator at `previdx` starts a pair.
                            let count = self.string[..previdx]
                                .chars()
                                .rev()
                                .map(|c| wd::word_category(c).2)
                                .filter(|&c| {
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
                                })
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
                                .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) {
                        // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                }
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break, // backtracking will happen below
                },
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter
                || t == RequireHLetter
                || t == RequireNumeric
                || t == AcceptNone
                || t == AcceptQLetter
            {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        // Decide where to cut and what to cache for the next call:
        // - if the current character belongs to this segment, cut at `idx` and cache nothing;
        // - otherwise cut at `previdx` and, when allowed, cache the current category.
        self.catb = if take_curr {
            None
        } else {
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        // Hand out the suffix and keep the rest for subsequent calls.
        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}
670
671 impl<'a> UWordBounds<'a> {
672 #[inline]
673 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
674 ///
675 /// ```rust
676 /// # use unicode_segmentation::UnicodeSegmentation;
677 /// let mut iter = "Hello world".split_word_bounds();
678 /// assert_eq!(iter.as_str(), "Hello world");
679 /// iter.next();
680 /// assert_eq!(iter.as_str(), " world");
681 /// iter.next();
682 /// assert_eq!(iter.as_str(), "world");
683 /// ```
as_str(&self) -> &'a str684 pub fn as_str(&self) -> &'a str {
685 self.string
686 }
687
688 #[inline]
get_next_cat(&self, idx: usize) -> Option<WordCat>689 fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
690 use crate::tables::word as wd;
691 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
692 if nidx < self.string.len() {
693 let nch = self.string[nidx..].chars().next().unwrap();
694 Some(wd::word_category(nch).2)
695 } else {
696 None
697 }
698 }
699
700 #[inline]
get_prev_cat(&self, idx: usize) -> Option<WordCat>701 fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
702 use crate::tables::word as wd;
703 if idx > 0 {
704 let nch = self.string[..idx].chars().next_back().unwrap();
705 Some(wd::word_category(nch).2)
706 } else {
707 None
708 }
709 }
710 }
711
712 #[inline]
new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b>713 pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
714 UWordBounds {
715 string: s,
716 cat: None,
717 catb: None,
718 }
719 }
720
721 #[inline]
new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b>722 pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
723 UWordBoundIndices {
724 start_offset: s.as_ptr() as usize,
725 iter: new_word_bounds(s),
726 }
727 }
728
729 #[inline]
has_alphanumeric(s: &&str) -> bool730 fn has_alphanumeric(s: &&str) -> bool {
731 use crate::tables::util::is_alphanumeric;
732
733 s.chars().any(|c| is_alphanumeric(c))
734 }
735
736 #[inline]
new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b>737 pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
738 use super::UnicodeSegmentation;
739
740 UnicodeWords {
741 inner: s.split_word_bounds().filter(has_alphanumeric),
742 }
743 }
744
745 #[inline]
new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b>746 pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
747 use super::UnicodeSegmentation;
748
749 UnicodeWordIndices {
750 inner: s
751 .split_word_bound_indices()
752 .filter(|(_, c)| has_alphanumeric(c)),
753 }
754 }
755