• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 //! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
12 //! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
13 //!
14 //! ```rust
15 //! extern crate unicode_segmentation;
16 //!
17 //! use unicode_segmentation::UnicodeSegmentation;
18 //!
19 //! fn main() {
20 //!     let s = "a̐éö̲\r\n";
21 //!     let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
22 //!     let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
23 //!     assert_eq!(g, b);
24 //!
25 //!     let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
26 //!     let w = s.unicode_words().collect::<Vec<&str>>();
27 //!     let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
28 //!     assert_eq!(w, b);
29 //!
30 //!     let s = "The quick (\"brown\")  fox";
31 //!     let w = s.split_word_bounds().collect::<Vec<&str>>();
32 //!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
33 //!     assert_eq!(w, b);
34 //! }
35 //! ```
36 //!
37 //! # no_std
38 //!
39 //! unicode-segmentation does not depend on libstd, so it can be used in crates
40 //! with the `#![no_std]` attribute.
41 //!
42 //! # crates.io
43 //!
44 //! You can use this package in your project by adding the following
45 //! to your `Cargo.toml`:
46 //!
47 //! ```toml
48 //! [dependencies]
49 //! unicode-segmentation = "1.7.1"
50 //! ```
51 
52 #![deny(missing_docs, unsafe_code)]
53 #![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
54        html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
55 
56 #![no_std]
57 
58 // ANDROID: Always import std to enable building as a dylib
59 #[macro_use]
60 extern crate std;
61 
62 #[cfg(test)]
63 #[macro_use]
64 extern crate quickcheck;
65 
66 pub use grapheme::{Graphemes, GraphemeIndices};
67 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
68 pub use tables::UNICODE_VERSION;
69 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
70 pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
71 
72 mod grapheme;
73 mod tables;
74 mod word;
75 mod sentence;
76 
77 #[cfg(test)]
78 mod test;
79 #[cfg(test)]
80 mod testdata;
81 
82 /// Methods for segmenting strings according to
83 /// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
84 pub trait UnicodeSegmentation {
85     /// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
86     ///
87     /// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
88     ///
89     /// If `is_extended` is true, the iterator is over the
90     /// *extended grapheme clusters*;
91     /// otherwise, the iterator is over the *legacy grapheme clusters*.
92     /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
93     /// recommends extended grapheme cluster boundaries for general processing.
94     ///
95     /// # Examples
96     ///
97     /// ```
98     /// # use self::unicode_segmentation::UnicodeSegmentation;
99     /// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
100     ///           .collect::<Vec<&str>>();
101     /// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
102     ///
103     /// assert_eq!(&gr1[..], b);
104     ///
105     /// let gr2 = UnicodeSegmentation::graphemes("a\r\nb��������", true).collect::<Vec<&str>>();
106     /// let b: &[_] = &["a", "\r\n", "b", "����", "����"];
107     ///
108     /// assert_eq!(&gr2[..], b);
109     /// ```
graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>110     fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
111 
112     /// Returns an iterator over the grapheme clusters of `self` and their
113     /// byte offsets. See `graphemes()` for more information.
114     ///
115     /// # Examples
116     ///
117     /// ```
118     /// # use self::unicode_segmentation::UnicodeSegmentation;
119     /// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
120     ///               .collect::<Vec<(usize, &str)>>();
121     /// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
122     ///
123     /// assert_eq!(&gr_inds[..], b);
124     /// ```
grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>125     fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
126 
127     /// Returns an iterator over the words of `self`, separated on
128     /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
129     ///
130     /// Here, "words" are just those substrings which, after splitting on
131     /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
132     /// substring must contain at least one character with the
133     /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
134     /// property, or with
135     /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
136     ///
137     /// # Example
138     ///
139     /// ```
140     /// # use self::unicode_segmentation::UnicodeSegmentation;
141     /// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
142     /// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
143     /// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
144     ///
145     /// assert_eq!(&uw1[..], b);
146     /// ```
unicode_words<'a>(&'a self) -> UnicodeWords<'a>147     fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
148 
149     /// Returns an iterator over substrings of `self` separated on
150     /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
151     ///
152     /// The concatenation of the substrings returned by this function is just the original string.
153     ///
154     /// # Example
155     ///
156     /// ```
157     /// # use self::unicode_segmentation::UnicodeSegmentation;
158     /// let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>();
159     /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
160     ///
161     /// assert_eq!(&swu1[..], b);
162     /// ```
split_word_bounds<'a>(&'a self) -> UWordBounds<'a>163     fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
164 
165     /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
166     /// and their offsets. See `split_word_bounds()` for more information.
167     ///
168     /// # Example
169     ///
170     /// ```
171     /// # use self::unicode_segmentation::UnicodeSegmentation;
172     /// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
173     /// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
174     ///                 (14, "°"), (16, "F"), (17, "!")];
175     ///
176     /// assert_eq!(&swi1[..], b);
177     /// ```
split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>178     fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
179 
180     /// Returns an iterator over substrings of `self` separated on
181     /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
182     ///
183     /// Here, "sentences" are just those substrings which, after splitting on
184     /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
185     /// substring must contain at least one character with the
186     /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
187     /// property, or with
188     /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
189     ///
190     /// # Example
191     ///
192     /// ```
193     /// # use self::unicode_segmentation::UnicodeSegmentation;
194     /// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
195     /// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
196     /// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
197     ///
198     /// assert_eq!(&us1[..], b);
199     /// ```
unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>200     fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
201 
202     /// Returns an iterator over substrings of `self` separated on
203     /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
204     ///
205     /// The concatenation of the substrings returned by this function is just the original string.
206     ///
207     /// # Example
208     ///
209     /// ```
210     /// # use self::unicode_segmentation::UnicodeSegmentation;
211     /// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
212     /// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
213     /// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
214     ///
215     /// assert_eq!(&ssb1[..], b);
216     /// ```
split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>217     fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
218 
219     /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
220     /// and their offsets. See `split_sentence_bounds()` for more information.
221     ///
222     /// # Example
223     ///
224     /// ```
225     /// # use self::unicode_segmentation::UnicodeSegmentation;
226     /// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
227     /// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
228     /// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
229     ///                 (22, "The dog was too lazy.")];
230     ///
231     /// assert_eq!(&ssi1[..], b);
232     /// ```
split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>233     fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
234 }
235 
236 impl UnicodeSegmentation for str {
237     #[inline]
graphemes(&self, is_extended: bool) -> Graphemes238     fn graphemes(&self, is_extended: bool) -> Graphemes {
239         grapheme::new_graphemes(self, is_extended)
240     }
241 
242     #[inline]
grapheme_indices(&self, is_extended: bool) -> GraphemeIndices243     fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
244         grapheme::new_grapheme_indices(self, is_extended)
245     }
246 
247     #[inline]
unicode_words(&self) -> UnicodeWords248     fn unicode_words(&self) -> UnicodeWords {
249         word::new_unicode_words(self)
250     }
251 
252     #[inline]
split_word_bounds(&self) -> UWordBounds253     fn split_word_bounds(&self) -> UWordBounds {
254         word::new_word_bounds(self)
255     }
256 
257     #[inline]
split_word_bound_indices(&self) -> UWordBoundIndices258     fn split_word_bound_indices(&self) -> UWordBoundIndices {
259         word::new_word_bound_indices(self)
260     }
261 
262     #[inline]
unicode_sentences(&self) -> UnicodeSentences263     fn unicode_sentences(&self) -> UnicodeSentences {
264         sentence::new_unicode_sentences(self)
265     }
266 
267     #[inline]
split_sentence_bounds(&self) -> USentenceBounds268     fn split_sentence_bounds(&self) -> USentenceBounds {
269         sentence::new_sentence_bounds(self)
270     }
271 
272     #[inline]
split_sentence_bound_indices(&self) -> USentenceBoundIndices273     fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
274         sentence::new_sentence_bound_indices(self)
275     }
276 }
277