1 //! Functionality for finding words.
2 //!
3 //! In order to wrap text, we need to know where the legal break
4 //! points are, i.e., where the words of the text are. This means that
5 //! we need to define what a "word" is.
6 //!
7 //! A simple approach is to simply split the text on whitespace, but
8 //! this does not work for East-Asian languages such as Chinese or
9 //! Japanese where there are no spaces between words. Breaking a long
10 //! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
12 //!
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and
//! its variants for more information.
16
17 #[cfg(feature = "unicode-linebreak")]
18 use crate::core::skip_ansi_escape_sequence;
19 use crate::core::Word;
20
/// Describes where words occur in a line of text.
///
/// The simplest approach is to say that words are separated by one
/// or more ASCII spaces (`' '`). This works for Western languages
/// without emojis. A more complex approach is to use the Unicode line
/// breaking algorithm, which finds break points in non-ASCII text.
///
/// The line breaks occur between words, please see
/// [`WordSplitter`](crate::WordSplitter) for options of how to handle
/// hyphenation of individual words.
///
/// # Examples
///
/// ```
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::AsciiSpace;
///
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
/// ```
#[derive(Clone, Copy)]
pub enum WordSeparator {
    /// Find words by splitting on runs of `' '` characters.
    ///
    /// Each word keeps the run of spaces that follows it, so the
    /// words can be concatenated to get the original line back.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::AsciiSpace;
    ///
    /// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
    /// assert_eq!(words, vec![Word::from("Hello "),
    ///                        Word::from("World!")]);
    /// ```
    AsciiSpace,

    /// Split `line` into words using Unicode break properties.
    ///
    /// This word separator uses the Unicode line breaking algorithm
    /// described in [Unicode Standard Annex
    /// #14](https://www.unicode.org/reports/tr14/) to find legal places
    /// to break lines. There is a small difference in that the U+002D
    /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
    /// to allow a line break at a hyphen, use
    /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
    /// Soft hyphens are not currently supported.
    ///
    /// # Examples
    ///
    /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
    /// breaking algorithm will find line break opportunities between
    /// some characters with no intervening whitespace:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// // U+1F602 and U+1F60D are two emojis, written as escapes so
    /// // that the example survives tools which drop non-BMP characters.
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: \u{1f602}\u{1f60d}").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("\u{1f602}"),
    ///                 Word::from("\u{1f60d}")]);
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
    ///            vec![Word::from("CJK: "),
    ///                 Word::from("你"),
    ///                 Word::from("好")]);
    /// }
    /// ```
    ///
    /// A U+2060 (Word Joiner) character can be inserted if you want to
    /// manually override the defaults and keep the characters together:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: \u{1f602}\u{2060}\u{1f60d}").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("\u{1f602}\u{2060}\u{1f60d}")]);
    /// }
    /// ```
    ///
    /// The Unicode line breaking algorithm will also automatically
    /// suppress breaks around certain punctuation characters:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
    ///            vec![Word::from("[ foo ] "),
    ///                 Word::from("bar !")]);
    /// }
    /// ```
    #[cfg(feature = "unicode-linebreak")]
    UnicodeBreakProperties,

    /// Find words using a custom word separator.
    ///
    /// The wrapped function receives the line and returns an iterator
    /// over the words it finds in that line.
    Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
}
124
125 impl std::fmt::Debug for WordSeparator {
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result126 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
127 match self {
128 WordSeparator::AsciiSpace => f.write_str("AsciiSpace"),
129 #[cfg(feature = "unicode-linebreak")]
130 WordSeparator::UnicodeBreakProperties => f.write_str("UnicodeBreakProperties"),
131 WordSeparator::Custom(_) => f.write_str("Custom(...)"),
132 }
133 }
134 }
135
136 impl WordSeparator {
137 // This function should really return impl Iterator<Item = Word>, but
138 // this isn't possible until Rust supports higher-kinded types:
139 // https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md
140 /// Find all words in `line`.
find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>141 pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
142 match self {
143 WordSeparator::AsciiSpace => find_words_ascii_space(line),
144 #[cfg(feature = "unicode-linebreak")]
145 WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
146 WordSeparator::Custom(func) => func(line),
147 }
148 }
149 }
150
find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a>151 fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
152 let mut start = 0;
153 let mut in_whitespace = false;
154 let mut char_indices = line.char_indices();
155
156 Box::new(std::iter::from_fn(move || {
157 // for (idx, ch) in char_indices does not work, gives this
158 // error:
159 //
160 // > cannot move out of `char_indices`, a captured variable in
161 // > an `FnMut` closure
162 #[allow(clippy::while_let_on_iterator)]
163 while let Some((idx, ch)) = char_indices.next() {
164 if in_whitespace && ch != ' ' {
165 let word = Word::from(&line[start..idx]);
166 start = idx;
167 in_whitespace = ch == ' ';
168 return Some(word);
169 }
170
171 in_whitespace = ch == ' ';
172 }
173
174 if start < line.len() {
175 let word = Word::from(&line[start..]);
176 start = line.len();
177 return Some(word);
178 }
179
180 None
181 }))
182 }
183
// Return a copy of `text` with all ANSI escape sequences removed.
//
// Every character is offered to `skip_ansi_escape_sequence` together
// with the remaining characters; a character is kept only when that
// helper returns `false`. The helper is given mutable access to the
// iterator (presumably so it can consume the rest of an escape
// sequence -- its definition lives in `crate::core`).
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    // The stripped text can never be longer than the input.
    let mut stripped = String::with_capacity(text.len());
    let mut remaining = text.chars();

    while let Some(ch) = remaining.next() {
        let was_escape = skip_ansi_escape_sequence(ch, &mut remaining);
        if !was_escape {
            stripped.push(ch);
        }
    }

    stripped
}
199
/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';
206
/// Find words in `line` using the Unicode line breaking algorithm.
///
/// ANSI escape sequences are ignored when looking for break
/// opportunities, but they are preserved in the returned words: the
/// words are slices of the original `line`.
///
/// Breaks directly after a U+002D (Hyphen-Minus) or U+00AD (Soft
/// Hyphen) are suppressed.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || {
        let (orig_idx, ch) = char_indices.next()?;
        let stripped_idx = last_stripped_idx;
        // Only characters which are kept in the stripped string
        // advance the stripped index.
        if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
            last_stripped_idx += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    // The opportunities are collected into a `Vec` because the filter
    // borrows the local `stripped` string, which cannot be moved into
    // the returned iterator while borrowed.
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|(idx, _)| {
            // Drop the break at the end of the text, we will add the
            // last word below using &line[start..]; This ensures that
            // we correctly include a trailing ANSI escape sequence.
            //
            // This must be decided by index and before the checks
            // below: if the text ends with ‘-’ or a soft hyphen, the
            // end-of-text break is suppressed by those checks, and
            // blindly popping the last remaining opportunity would
            // then remove a real break in the middle of the text.
            if *idx >= stripped.len() {
                return false;
            }

            #[allow(clippy::match_like_matches_macro)]
            match &stripped[..*idx].chars().next_back() {
                // We suppress breaks at ‘-’ since we want to control
                // this via the WordSplitter.
                Some('-') => false,
                // Soft hyphens are currently not supported since we
                // require all `Word` fragments to be continuous in
                // the input string.
                Some(SHY) => false,
                // Other breaks should be fine!
                _ => true,
            }
        })
        .collect::<Vec<_>>()
        .into_iter();

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        for (idx, _) in opportunities.by_ref() {
            // `idx_map` is consumed as we go: it resumes from the
            // previous break, so the whole line is only scanned once.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // No more breaks: the rest of the line (including any
        // trailing ANSI escape sequence) is the final word.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    }))
}
273
// Unit tests for the word separators. Most cases are generated with
// `test_find_words!`, which runs the same input line through both
// `AsciiSpace` and (when the feature is enabled)
// `UnicodeBreakProperties`.
#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Convert a list of string slices to the `Word`s they are
    // expected to become.
    fn to_words<'a>(words: Vec<&'a str>) -> Vec<Word<'a>> {
        words.into_iter().map(|w: &str| Word::from(&w)).collect()
    }

    // Generate a pair of tests: `$ascii_name` checks `AsciiSpace` and
    // `$unicode_name` checks `UnicodeBreakProperties`. Each
    // `[line, ascii_words, unicode_words]` triple gives an input line
    // and the words expected from each separator.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            // Only compiled when the Unicode separator is available.
            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    // An empty line contains no words.
    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    // Trailing whitespace stays attached to the preceding word.
    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    // A line of only whitespace is a single word.
    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        [" ", [" "], [" "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo ", ["foo "], ["foo "]]
    );

    // Leading whitespace becomes a word of its own.
    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        [" foo", [" ", "foo"], [" ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji
    );

    // Neither separator breaks directly after a hyphen.
    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    // Only the Unicode separator breaks after a newline.
    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    // Only the Unicode separator breaks after a tab.
    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    // U+00A0 (no-break space) does not separate words.
    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    // Colored words: the escape sequences produced by termion must
    // end up inside the words they surround.
    // NOTE(review): restricted to Unix, presumably because of the
    // termion dependency -- confirm.
    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        assert_iter_eq!(
            AsciiSpace.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&format!("{}{}", green_hello, blue_world)),
            vec![Word::from(&green_hello), Word::from(&blue_world)]
        );
    }

    // Escape sequences in the middle of a word must not split it.
    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        assert_iter_eq!(AsciiSpace.find_words(&text), vec![Word::from(text)]);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(
            UnicodeBreakProperties.find_words(&text),
            vec![Word::from(text)]
        );
    }
}
429