• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use core::cmp;
12 
13 use crate::tables::grapheme::GraphemeCat;
14 
15 /// External iterator for grapheme clusters and byte offsets.
16 ///
17 /// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18 /// trait. See its documentation for more.
19 ///
20 /// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22 #[derive(Debug, Clone)]
23 pub struct GraphemeIndices<'a> {
24     start_offset: usize,
25     iter: Graphemes<'a>,
26 }
27 
28 impl<'a> GraphemeIndices<'a> {
29     #[inline]
30     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31     ///
32     /// ```rust
33     /// # use unicode_segmentation::UnicodeSegmentation;
34     /// let mut iter = "abc".grapheme_indices(true);
35     /// assert_eq!(iter.as_str(), "abc");
36     /// iter.next();
37     /// assert_eq!(iter.as_str(), "bc");
38     /// iter.next();
39     /// iter.next();
40     /// assert_eq!(iter.as_str(), "");
41     /// ```
as_str(&self) -> &'a str42     pub fn as_str(&self) -> &'a str {
43         self.iter.as_str()
44     }
45 }
46 
47 impl<'a> Iterator for GraphemeIndices<'a> {
48     type Item = (usize, &'a str);
49 
50     #[inline]
next(&mut self) -> Option<(usize, &'a str)>51     fn next(&mut self) -> Option<(usize, &'a str)> {
52         self.iter
53             .next()
54             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
55     }
56 
57     #[inline]
size_hint(&self) -> (usize, Option<usize>)58     fn size_hint(&self) -> (usize, Option<usize>) {
59         self.iter.size_hint()
60     }
61 }
62 
63 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
64     #[inline]
next_back(&mut self) -> Option<(usize, &'a str)>65     fn next_back(&mut self) -> Option<(usize, &'a str)> {
66         self.iter
67             .next_back()
68             .map(|s| (s.as_ptr() as usize - self.start_offset, s))
69     }
70 }
71 
72 /// External iterator for a string's
73 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
74 ///
75 /// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
76 /// documentation for more.
77 ///
78 /// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
79 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
80 #[derive(Clone, Debug)]
81 pub struct Graphemes<'a> {
82     string: &'a str,
83     cursor: GraphemeCursor,
84     cursor_back: GraphemeCursor,
85 }
86 
87 impl<'a> Graphemes<'a> {
88     #[inline]
89     /// View the underlying data (the part yet to be iterated) as a slice of the original string.
90     ///
91     /// ```rust
92     /// # use unicode_segmentation::UnicodeSegmentation;
93     /// let mut iter = "abc".graphemes(true);
94     /// assert_eq!(iter.as_str(), "abc");
95     /// iter.next();
96     /// assert_eq!(iter.as_str(), "bc");
97     /// iter.next();
98     /// iter.next();
99     /// assert_eq!(iter.as_str(), "");
100     /// ```
as_str(&self) -> &'a str101     pub fn as_str(&self) -> &'a str {
102         &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
103     }
104 }
105 
106 impl<'a> Iterator for Graphemes<'a> {
107     type Item = &'a str;
108 
109     #[inline]
size_hint(&self) -> (usize, Option<usize>)110     fn size_hint(&self) -> (usize, Option<usize>) {
111         let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
112         (cmp::min(slen, 1), Some(slen))
113     }
114 
115     #[inline]
next(&mut self) -> Option<&'a str>116     fn next(&mut self) -> Option<&'a str> {
117         let start = self.cursor.cur_cursor();
118         if start == self.cursor_back.cur_cursor() {
119             return None;
120         }
121         let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
122         Some(&self.string[start..next])
123     }
124 }
125 
126 impl<'a> DoubleEndedIterator for Graphemes<'a> {
127     #[inline]
next_back(&mut self) -> Option<&'a str>128     fn next_back(&mut self) -> Option<&'a str> {
129         let end = self.cursor_back.cur_cursor();
130         if end == self.cursor.cur_cursor() {
131             return None;
132         }
133         let prev = self
134             .cursor_back
135             .prev_boundary(self.string, 0)
136             .unwrap()
137             .unwrap();
138         Some(&self.string[prev..end])
139     }
140 }
141 
142 #[inline]
new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_>143 pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> {
144     let len = s.len();
145     Graphemes {
146         string: s,
147         cursor: GraphemeCursor::new(0, len, is_extended),
148         cursor_back: GraphemeCursor::new(len, len, is_extended),
149     }
150 }
151 
152 #[inline]
new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_>153 pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> {
154     GraphemeIndices {
155         start_offset: s.as_ptr() as usize,
156         iter: new_graphemes(s, is_extended),
157     }
158 }
159 
// TODO: maybe unify with `PairResult`?

/// What is currently known about the potential boundary at the cursor.
#[derive(Clone, Debug, PartialEq, Eq)]
enum GraphemeState {
    /// No information is known.
    Unknown,
    /// It is known to not be a boundary.
    NotBreak,
    /// It is known to be a boundary.
    Break,
    /// The codepoint after the cursor has `Indic_Conjunct_Break=Consonant`.
    /// In extended mode there is no boundary if it is preceded by another
    /// `InCB=Consonant` followed by a sequence consisting of one or more
    /// `InCB=Linker` and zero or more `InCB=Extend` (in any order);
    /// otherwise there is a boundary. (GB9c)
    InCbConsonant,
    /// The codepoint after the cursor is a Regional Indicator Symbol, so it is a
    /// boundary iff it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    /// The codepoint after the cursor is `Extended_Pictographic`, so whether it
    /// is a boundary depends on the pre-context according to GB11.
    Emoji,
}
182 
183 /// Cursor-based segmenter for grapheme clusters.
184 ///
185 /// This allows working with ropes and other datastructures where the string is not contiguous or
186 /// fully known at initialization time.
187 #[derive(Clone, Debug)]
188 pub struct GraphemeCursor {
189     /// Current cursor position.
190     offset: usize,
191     /// Total length of the string.
192     len: usize,
193     /// A config flag indicating whether this cursor computes legacy or extended
194     /// grapheme cluster boundaries (enables GB9a and GB9b if set).
195     is_extended: bool,
196     /// Information about the potential boundary at `offset`
197     state: GraphemeState,
198     /// Category of codepoint immediately preceding cursor, if known.
199     cat_before: Option<GraphemeCat>,
200     /// Category of codepoint immediately after cursor, if known.
201     cat_after: Option<GraphemeCat>,
202     /// If set, at least one more codepoint immediately preceding this offset
203     /// is needed to resolve whether there's a boundary at `offset`.
204     pre_context_offset: Option<usize>,
205     /// The number of `InCB=Linker` codepoints preceding `offset`
206     /// (potentially intermingled with `InCB=Extend`).
207     incb_linker_count: Option<usize>,
208     /// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
209     /// is set, then counts the number of RIS between that and `offset`, otherwise
210     /// is an accurate count relative to the string.
211     ris_count: Option<usize>,
212     /// Set if a call to `prev_boundary` or `next_boundary` was suspended due
213     /// to needing more input.
214     resuming: bool,
215     /// Cached grapheme category and associated scalar value range.
216     grapheme_cat_cache: (u32, u32, GraphemeCat),
217 }
218 
/// Error returned when the supplied chunk did not contain enough text to answer
/// the query; the caller has to supply more content as described per variant.
#[derive(Debug, PartialEq, Eq)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. Call `provide_context` with a chunk that ends
    /// at the given offset, then retry the query. Only returned when the
    /// `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// `prev_boundary` is moving past the beginning of the current chunk, so the
    /// chunk before it is requested. Only returned when the `chunk_start`
    /// parameter is nonzero.
    PrevChunk,

    /// `next_boundary` is moving past the end of the current chunk, so the chunk
    /// following the one given is requested. Only returned when the chunk ends
    /// before the `len` given when the cursor was created.
    NextChunk,

    /// The chunk given does not contain the cursor position.
    InvalidOffset,
}
242 
/// Outcome of looking up the pair of categories on either side of a position.
#[derive(PartialEq, Eq)]
enum PairResult {
    /// Definitely not a break.
    NotBreak,
    /// Definitely a break.
    Break,
    /// A break iff not in extended mode.
    Extended,
    /// A break unless in extended mode and preceded by a sequence of zero or
    /// more `InCB=Extend` and one or more `InCB=Linker` (in any order) that is
    /// itself preceded by another `InCB=Consonant`.
    InCbConsonant,
    /// A break iff preceded by an even number of RIS.
    Regional,
    /// The pair is ZWJ followed by `Extended_Pictographic`: not a break when the
    /// ZWJ is preceded by `Extended_Pictographic Extend*`, a break otherwise.
    Emoji,
}
262 
263 #[inline]
check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult264 fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
265     use self::PairResult::*;
266     use crate::tables::grapheme::GraphemeCat::*;
267     match (before, after) {
268         (GC_CR, GC_LF) => NotBreak,                                 // GB3
269         (GC_Control | GC_CR | GC_LF, _) => Break,                   // GB4
270         (_, GC_Control | GC_CR | GC_LF) => Break,                   // GB5
271         (GC_L, GC_L | GC_V | GC_LV | GC_LVT) => NotBreak,           // GB6
272         (GC_LV | GC_V, GC_V | GC_T) => NotBreak,                    // GB7
273         (GC_LVT | GC_T, GC_T) => NotBreak,                          // GB8
274         (_, GC_Extend | GC_ZWJ) => NotBreak,                        // GB9
275         (_, GC_SpacingMark) => Extended,                            // GB9a
276         (GC_Prepend, _) => Extended,                                // GB9b
277         (_, GC_InCB_Consonant) => InCbConsonant,                    // GB9c
278         (GC_ZWJ, GC_Extended_Pictographic) => Emoji,                // GB11
279         (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
280         (_, _) => Break,                                            // GB999
281     }
282 }
283 
284 impl GraphemeCursor {
285     /// Create a new cursor. The string and initial offset are given at creation
286     /// time, but the contents of the string are not. The `is_extended` parameter
287     /// controls whether extended grapheme clusters are selected.
288     ///
289     /// The `offset` parameter must be on a codepoint boundary.
290     ///
291     /// ```rust
292     /// # use unicode_segmentation::GraphemeCursor;
293     /// let s = "हिन्दी";
294     /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
295     /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
296     /// let mut extended = GraphemeCursor::new(0, s.len(), true);
297     /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
298     /// ```
new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor299     pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
300         let state = if offset == 0 || offset == len {
301             GraphemeState::Break
302         } else {
303             GraphemeState::Unknown
304         };
305         GraphemeCursor {
306             offset,
307             len,
308             state,
309             is_extended,
310             cat_before: None,
311             cat_after: None,
312             pre_context_offset: None,
313             incb_linker_count: None,
314             ris_count: None,
315             resuming: false,
316             grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
317         }
318     }
319 
grapheme_category(&mut self, ch: char) -> GraphemeCat320     fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
321         use crate::tables::grapheme as gr;
322         use crate::tables::grapheme::GraphemeCat::*;
323 
324         if ch <= '\u{7e}' {
325             // Special-case optimization for ascii, except U+007F.  This
326             // improves performance even for many primarily non-ascii texts,
327             // due to use of punctuation and white space characters from the
328             // ascii range.
329             if ch >= '\u{20}' {
330                 GC_Any
331             } else if ch == '\n' {
332                 GC_LF
333             } else if ch == '\r' {
334                 GC_CR
335             } else {
336                 GC_Control
337             }
338         } else {
339             // If this char isn't within the cached range, update the cache to the
340             // range that includes it.
341             if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
342                 self.grapheme_cat_cache = gr::grapheme_category(ch);
343             }
344             self.grapheme_cat_cache.2
345         }
346     }
347 
348     // Not sure I'm gonna keep this, the advantage over new() seems thin.
349 
350     /// Set the cursor to a new location in the same string.
351     ///
352     /// ```rust
353     /// # use unicode_segmentation::GraphemeCursor;
354     /// let s = "abcd";
355     /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
356     /// assert_eq!(cursor.cur_cursor(), 0);
357     /// cursor.set_cursor(2);
358     /// assert_eq!(cursor.cur_cursor(), 2);
359     /// ```
set_cursor(&mut self, offset: usize)360     pub fn set_cursor(&mut self, offset: usize) {
361         if offset != self.offset {
362             self.offset = offset;
363             self.state = if offset == 0 || offset == self.len {
364                 GraphemeState::Break
365             } else {
366                 GraphemeState::Unknown
367             };
368             // reset state derived from text around cursor
369             self.cat_before = None;
370             self.cat_after = None;
371             self.incb_linker_count = None;
372             self.ris_count = None;
373         }
374     }
375 
376     #[inline]
377     /// The current offset of the cursor. Equal to the last value provided to
378     /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
379     /// `prev_boundary()`.
380     ///
381     /// ```rust
382     /// # use unicode_segmentation::GraphemeCursor;
383     /// // Two flags (��������), each flag is two RIS codepoints, each RIS is 4 bytes.
384     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
385     /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
386     /// assert_eq!(cursor.cur_cursor(), 4);
387     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
388     /// assert_eq!(cursor.cur_cursor(), 8);
389     /// ```
cur_cursor(&self) -> usize390     pub fn cur_cursor(&self) -> usize {
391         self.offset
392     }
393 
394     /// Provide additional pre-context when it is needed to decide a boundary.
395     /// The end of the chunk must coincide with the value given in the
396     /// `GraphemeIncomplete::PreContext` request.
397     ///
398     /// ```rust
399     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
400     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
401     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
402     /// // Not enough pre-context to decide if there's a boundary between the two flags.
403     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
404     /// // Provide one more Regional Indicator Symbol of pre-context
405     /// cursor.provide_context(&flags[4..8], 4);
406     /// // Still not enough context to decide.
407     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
408     /// // Provide additional requested context.
409     /// cursor.provide_context(&flags[0..4], 0);
410     /// // That's enough to decide (it always is when context goes to the start of the string)
411     /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
412     /// ```
provide_context(&mut self, chunk: &str, chunk_start: usize)413     pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
414         use crate::tables::grapheme as gr;
415         assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap());
416         self.pre_context_offset = None;
417         if self.is_extended && chunk_start + chunk.len() == self.offset {
418             let ch = chunk.chars().next_back().unwrap();
419             if self.grapheme_category(ch) == gr::GC_Prepend {
420                 self.decide(false); // GB9b
421                 return;
422             }
423         }
424         match self.state {
425             GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
426             GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
427             GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
428             _ => {
429                 if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
430                     let ch = chunk.chars().next_back().unwrap();
431                     self.cat_before = Some(self.grapheme_category(ch));
432                 }
433             }
434         }
435     }
436 
437     #[inline]
decide(&mut self, is_break: bool)438     fn decide(&mut self, is_break: bool) {
439         self.state = if is_break {
440             GraphemeState::Break
441         } else {
442             GraphemeState::NotBreak
443         };
444     }
445 
446     #[inline]
decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete>447     fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
448         self.decide(is_break);
449         Ok(is_break)
450     }
451 
452     #[inline]
is_boundary_result(&self) -> Result<bool, GraphemeIncomplete>453     fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
454         if self.state == GraphemeState::Break {
455             Ok(true)
456         } else if self.state == GraphemeState::NotBreak {
457             Ok(false)
458         } else if let Some(pre_context_offset) = self.pre_context_offset {
459             Err(GraphemeIncomplete::PreContext(pre_context_offset))
460         } else {
461             unreachable!("inconsistent state");
462         }
463     }
464 
465     /// For handling rule GB9c:
466     ///
467     /// There's an `InCB=Consonant` after this, and we need to look back
468     /// to verify whether there should be a break.
469     ///
470     /// Seek backward to find an `InCB=Linker` preceded by an `InCB=Consonsnt`
471     /// (potentially separated by some number of `InCB=Linker` or `InCB=Extend`).
472     /// If we find the consonant in question, then there's no break; if we find a consonant
473     /// with no linker, or a non-linker non-extend non-consonant, or the start of text, there's a break;
474     /// otherwise we need more context
475     #[inline]
handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize)476     fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) {
477         use crate::tables::{self, grapheme as gr};
478 
479         // GB9c only applies to extended grapheme clusters
480         if !self.is_extended {
481             self.decide(true);
482             return;
483         }
484 
485         let mut incb_linker_count = self.incb_linker_count.unwrap_or(0);
486 
487         for ch in chunk.chars().rev() {
488             if tables::is_incb_linker(ch) {
489                 // We found an InCB linker
490                 incb_linker_count += 1;
491                 self.incb_linker_count = Some(incb_linker_count);
492             } else if tables::derived_property::InCB_Extend(ch) {
493                 // We ignore InCB extends, continue
494             } else {
495                 // Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant
496                 let result = !(self.incb_linker_count.unwrap_or(0) > 0
497                     && self.grapheme_category(ch) == gr::GC_InCB_Consonant);
498                 self.decide(result);
499                 return;
500             }
501         }
502 
503         if chunk_start == 0 {
504             // Start of text and we still haven't found a consonant, so break
505             self.decide(true);
506         } else {
507             // We need more context
508             self.pre_context_offset = Some(chunk_start);
509             self.state = GraphemeState::InCbConsonant;
510         }
511     }
512 
513     #[inline]
handle_regional(&mut self, chunk: &str, chunk_start: usize)514     fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
515         use crate::tables::grapheme as gr;
516         let mut ris_count = self.ris_count.unwrap_or(0);
517         for ch in chunk.chars().rev() {
518             if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
519                 self.ris_count = Some(ris_count);
520                 self.decide((ris_count % 2) == 0);
521                 return;
522             }
523             ris_count += 1;
524         }
525         self.ris_count = Some(ris_count);
526         if chunk_start == 0 {
527             self.decide((ris_count % 2) == 0);
528         } else {
529             self.pre_context_offset = Some(chunk_start);
530             self.state = GraphemeState::Regional;
531         }
532     }
533 
534     #[inline]
handle_emoji(&mut self, chunk: &str, chunk_start: usize)535     fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
536         use crate::tables::grapheme as gr;
537         let mut iter = chunk.chars().rev();
538         if let Some(ch) = iter.next() {
539             if self.grapheme_category(ch) != gr::GC_ZWJ {
540                 self.decide(true);
541                 return;
542             }
543         }
544         for ch in iter {
545             match self.grapheme_category(ch) {
546                 gr::GC_Extend => (),
547                 gr::GC_Extended_Pictographic => {
548                     self.decide(false);
549                     return;
550                 }
551                 _ => {
552                     self.decide(true);
553                     return;
554                 }
555             }
556         }
557         if chunk_start == 0 {
558             self.decide(true);
559         } else {
560             self.pre_context_offset = Some(chunk_start);
561             self.state = GraphemeState::Emoji;
562         }
563     }
564 
565     #[inline]
566     /// Determine whether the current cursor location is a grapheme cluster boundary.
567     /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
568     /// the length of `chunk` is not equal to `len` on creation, then this method
569     /// may return `GraphemeIncomplete::PreContext`. The caller should then
570     /// call `provide_context` with the requested chunk, then retry calling this
571     /// method.
572     ///
573     /// For partial chunks, if the cursor is not at the beginning or end of the
574     /// string, the chunk should contain at least the codepoint following the cursor.
575     /// If the string is nonempty, the chunk must be nonempty.
576     ///
577     /// All calls should have consistent chunk contents (ie, if a chunk provides
578     /// content for a given slice, all further chunks covering that slice must have
579     /// the same content for it).
580     ///
581     /// ```rust
582     /// # use unicode_segmentation::GraphemeCursor;
583     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
584     /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
585     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
586     /// cursor.set_cursor(12);
587     /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
588     /// ```
is_boundary( &mut self, chunk: &str, chunk_start: usize, ) -> Result<bool, GraphemeIncomplete>589     pub fn is_boundary(
590         &mut self,
591         chunk: &str,
592         chunk_start: usize,
593     ) -> Result<bool, GraphemeIncomplete> {
594         use crate::tables::grapheme as gr;
595         if self.state == GraphemeState::Break {
596             return Ok(true);
597         }
598         if self.state == GraphemeState::NotBreak {
599             return Ok(false);
600         }
601         if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len()))
602             && (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none())
603         {
604             return Err(GraphemeIncomplete::InvalidOffset);
605         }
606         if let Some(pre_context_offset) = self.pre_context_offset {
607             return Err(GraphemeIncomplete::PreContext(pre_context_offset));
608         }
609         let offset_in_chunk = self.offset.saturating_sub(chunk_start);
610         if self.cat_after.is_none() {
611             let ch = chunk[offset_in_chunk..].chars().next().unwrap();
612             self.cat_after = Some(self.grapheme_category(ch));
613         }
614         if self.offset == chunk_start {
615             let mut need_pre_context = true;
616             match self.cat_after.unwrap() {
617                 gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
618                 gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
619                 gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
620                 _ => need_pre_context = self.cat_before.is_none(),
621             }
622             if need_pre_context {
623                 self.pre_context_offset = Some(chunk_start);
624                 return Err(GraphemeIncomplete::PreContext(chunk_start));
625             }
626         }
627         if self.cat_before.is_none() {
628             let ch = chunk[..offset_in_chunk].chars().next_back().unwrap();
629             self.cat_before = Some(self.grapheme_category(ch));
630         }
631         match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
632             PairResult::NotBreak => self.decision(false),
633             PairResult::Break => self.decision(true),
634             PairResult::Extended => {
635                 let is_extended = self.is_extended;
636                 self.decision(!is_extended)
637             }
638             PairResult::InCbConsonant => {
639                 self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start);
640                 self.is_boundary_result()
641             }
642             PairResult::Regional => {
643                 if let Some(ris_count) = self.ris_count {
644                     return self.decision((ris_count % 2) == 0);
645                 }
646                 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
647                 self.is_boundary_result()
648             }
649             PairResult::Emoji => {
650                 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
651                 self.is_boundary_result()
652             }
653         }
654     }
655 
656     #[inline]
657     /// Find the next boundary after the current cursor position. Only a part of
658     /// the string need be supplied. If the chunk is incomplete, then this
659     /// method might return `GraphemeIncomplete::PreContext` or
660     /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
661     /// call `provide_context` with the requested chunk, then retry. In the
662     /// latter case, the caller should provide the chunk following the one
663     /// given, then retry.
664     ///
665     /// See `is_boundary` for expectations on the provided chunk.
666     ///
667     /// ```rust
668     /// # use unicode_segmentation::GraphemeCursor;
669     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
670     /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
671     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
672     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
673     /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
674     /// ```
675     ///
676     /// And an example that uses partial strings:
677     ///
678     /// ```rust
679     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
680     /// let s = "abcd";
681     /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
682     /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
683     /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
684     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
685     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
686     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
687     /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
688     /// ```
next_boundary( &mut self, chunk: &str, chunk_start: usize, ) -> Result<Option<usize>, GraphemeIncomplete>689     pub fn next_boundary(
690         &mut self,
691         chunk: &str,
692         chunk_start: usize,
693     ) -> Result<Option<usize>, GraphemeIncomplete> {
694         if self.offset == self.len {
695             return Ok(None);
696         }
697         let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars();
698         let mut ch = match iter.next() {
699             Some(ch) => ch,
700             None => return Err(GraphemeIncomplete::NextChunk),
701         };
702         loop {
703             if self.resuming {
704                 if self.cat_after.is_none() {
705                     self.cat_after = Some(self.grapheme_category(ch));
706                 }
707             } else {
708                 self.offset = self.offset.saturating_add(ch.len_utf8());
709                 self.state = GraphemeState::Unknown;
710                 self.cat_before = self.cat_after.take();
711                 if self.cat_before.is_none() {
712                     self.cat_before = Some(self.grapheme_category(ch));
713                 }
714                 if crate::tables::is_incb_linker(ch) {
715                     self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1));
716                 } else if !crate::tables::derived_property::InCB_Extend(ch) {
717                     self.incb_linker_count = Some(0);
718                 }
719                 if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
720                     self.ris_count = self.ris_count.map(|c| c + 1);
721                 } else {
722                     self.ris_count = Some(0);
723                 }
724                 if let Some(next_ch) = iter.next() {
725                     ch = next_ch;
726                     self.cat_after = Some(self.grapheme_category(ch));
727                 } else if self.offset == self.len {
728                     self.decide(true);
729                 } else {
730                     self.resuming = true;
731                     return Err(GraphemeIncomplete::NextChunk);
732                 }
733             }
734             self.resuming = true;
735             if self.is_boundary(chunk, chunk_start)? {
736                 self.resuming = false;
737                 return Ok(Some(self.offset));
738             }
739             self.resuming = false;
740         }
741     }
742 
743     /// Find the previous boundary after the current cursor position. Only a part
744     /// of the string need be supplied. If the chunk is incomplete, then this
745     /// method might return `GraphemeIncomplete::PreContext` or
746     /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
747     /// call `provide_context` with the requested chunk, then retry. In the
748     /// latter case, the caller should provide the chunk preceding the one
749     /// given, then retry.
750     ///
751     /// See `is_boundary` for expectations on the provided chunk.
752     ///
753     /// ```rust
754     /// # use unicode_segmentation::GraphemeCursor;
755     /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
756     /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
757     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
758     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
759     /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
760     /// ```
761     ///
762     /// And an example that uses partial strings (note the exact return is not
763     /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
764     ///
765     /// ```rust
766     /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
767     /// let s = "abcd";
768     /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
769     /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
770     /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
771     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
772     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
773     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
774     /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
775     /// ```
prev_boundary( &mut self, chunk: &str, chunk_start: usize, ) -> Result<Option<usize>, GraphemeIncomplete>776     pub fn prev_boundary(
777         &mut self,
778         chunk: &str,
779         chunk_start: usize,
780     ) -> Result<Option<usize>, GraphemeIncomplete> {
781         if self.offset == 0 {
782             return Ok(None);
783         }
784         if self.offset == chunk_start {
785             return Err(GraphemeIncomplete::PrevChunk);
786         }
787         let mut iter = chunk[..self.offset.saturating_sub(chunk_start)]
788             .chars()
789             .rev();
790         let mut ch = iter.next().unwrap();
791         loop {
792             if self.offset == chunk_start {
793                 self.resuming = true;
794                 return Err(GraphemeIncomplete::PrevChunk);
795             }
796             if self.resuming {
797                 self.cat_before = Some(self.grapheme_category(ch));
798             } else {
799                 self.offset -= ch.len_utf8();
800                 self.cat_after = self.cat_before.take();
801                 self.state = GraphemeState::Unknown;
802                 if let Some(incb_linker_count) = self.incb_linker_count {
803                     self.ris_count = if incb_linker_count > 0 && crate::tables::is_incb_linker(ch) {
804                         Some(incb_linker_count - 1)
805                     } else if crate::tables::derived_property::InCB_Extend(ch) {
806                         Some(incb_linker_count)
807                     } else {
808                         None
809                     };
810                 }
811                 if let Some(ris_count) = self.ris_count {
812                     self.ris_count = if ris_count > 0 {
813                         Some(ris_count - 1)
814                     } else {
815                         None
816                     };
817                 }
818                 if let Some(prev_ch) = iter.next() {
819                     ch = prev_ch;
820                     self.cat_before = Some(self.grapheme_category(ch));
821                 } else if self.offset == 0 {
822                     self.decide(true);
823                 } else {
824                     self.resuming = true;
825                     self.cat_after = Some(self.grapheme_category(ch));
826                     return Err(GraphemeIncomplete::PrevChunk);
827                 }
828             }
829             self.resuming = true;
830             if self.is_boundary(chunk, chunk_start)? {
831                 self.resuming = false;
832                 return Ok(Some(self.offset));
833             }
834             self.resuming = false;
835         }
836     }
837 }
838 
// A cursor at byte 8 that is only shown the text from byte 4 onwards must
// first ask for pre-context ending at byte 4; once that is supplied, byte 8
// is reported as a boundary.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let (head, tail) = flags.split_at(4);
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);

    let first_try = cursor.is_boundary(tail, 4);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(head, 0);
    let second_try = cursor.is_boundary(tail, 4);
    assert_eq!(second_try, Ok(true));
}
850 
// With the cursor exactly at the start of the supplied chunk, the check has
// to request pre-context; after seeing the preceding "\r", the position
// before "\n" is reported as not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let (head, tail) = crlf.split_at(1);
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);

    let first_try = cursor.is_boundary(tail, 1);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(head, 0);
    let second_try = cursor.is_boundary(tail, 1);
    assert_eq!(second_try, Ok(false));
}
862 
// Starting one character into the second chunk, the backwards search runs
// out of text and asks for the previous chunk; retrying with it yields the
// boundary at byte 2.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let (head, tail) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let first_try = cursor.prev_boundary(tail, 2);
    assert_eq!(first_try, Err(GraphemeIncomplete::PrevChunk));

    let second_try = cursor.prev_boundary(head, 0);
    assert_eq!(second_try, Ok(Some(2)));
}
873 
// Starting exactly at the chunk's first byte, the search immediately asks
// for the previous chunk; retrying with it yields the boundary at byte 1.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let (head, tail) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(2, text.len(), true);

    let first_try = cursor.prev_boundary(tail, 2);
    assert_eq!(first_try, Err(GraphemeIncomplete::PrevChunk));

    let second_try = cursor.prev_boundary(head, 0);
    assert_eq!(second_try, Ok(Some(1)));
}
884