textwrap/src/word_separators.rs

//! Functionality for finding words.
//!
//! In order to wrap text, we need to know where the legal break
//! points are, i.e., where the words of the text are. This means that
//! we need to define what a "word" is.
//!
//! A simple approach is to simply split the text on whitespace, but
//! this does not work for East-Asian languages such as Chinese or
//! Japanese where there are no spaces between words. Breaking a long
//! sequence of emojis is another example where line breaks might be
//! wanted even if there is no whitespace to be found.
//!
//! The [`WordSeparator`] enum is responsible for determining where
//! the words are in a line of text. Please refer to the enum and its
//! variants for more information.

#[cfg(feature = "unicode-linebreak")]
use crate::core::skip_ansi_escape_sequence;
use crate::core::Word;

/// Describes where words occur in a line of text.
///
/// The simplest approach is to say that words are separated by one
/// or more ASCII spaces (`' '`). This works for Western languages
/// without emojis. A more complex approach is to use the Unicode line
/// breaking algorithm, which finds break points in non-ASCII text.
///
/// The line breaks occur between words. Please see
/// [`WordSplitter`](crate::WordSplitter) for options on how to handle
/// hyphenation of individual words.
///
/// # Examples
///
/// ```
/// use textwrap::core::Word;
/// use textwrap::WordSeparator::AsciiSpace;
///
/// let words = AsciiSpace.find_words("Hello World!").collect::<Vec<_>>();
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
/// ```
#[derive(Clone, Copy)]
pub enum WordSeparator {
    /// Find words by splitting on runs of `' '` characters.
    ///
    /// The run of spaces following a word is kept as part of that
    /// word, as the example shows.
    ///
    /// # Examples
    ///
    /// ```
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::AsciiSpace;
    ///
    /// let words = AsciiSpace.find_words("Hello   World!").collect::<Vec<_>>();
    /// assert_eq!(words, vec![Word::from("Hello   "),
    ///                        Word::from("World!")]);
    /// ```
    AsciiSpace,

    /// Split `line` into words using Unicode break properties.
    ///
    /// This word separator uses the Unicode line breaking algorithm
    /// described in [Unicode Standard Annex
    /// #14](https://www.unicode.org/reports/tr14/) to find legal places
    /// to break lines. There is a small difference in that the U+002D
    /// (Hyphen-Minus) and U+00AD (Soft Hyphen) don’t create a line break:
    /// to allow a line break at a hyphen, use
    /// [`WordSplitter::HyphenSplitter`](crate::WordSplitter::HyphenSplitter).
    /// Soft hyphens are not currently supported.
    ///
    /// # Examples
    ///
    /// Unlike [`WordSeparator::AsciiSpace`], the Unicode line
    /// breaking algorithm will find line break opportunities between
    /// some characters with no intervening whitespace:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂😍").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("😂"),
    ///                 Word::from("😍")]);
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("CJK: 你好").collect::<Vec<_>>(),
    ///            vec![Word::from("CJK: "),
    ///                 Word::from("你"),
    ///                 Word::from("好")]);
    /// }
    /// ```
    ///
    /// A U+2060 (Word Joiner) character can be inserted if you want to
    /// manually override the defaults and keep the characters together:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("Emojis: 😂\u{2060}😍").collect::<Vec<_>>(),
    ///            vec![Word::from("Emojis: "),
    ///                 Word::from("😂\u{2060}😍")]);
    /// }
    /// ```
    ///
    /// The Unicode line breaking algorithm will also automatically
    /// suppress line breaks around certain punctuation characters:
    ///
    /// ```
    /// #[cfg(feature = "unicode-linebreak")] {
    /// use textwrap::core::Word;
    /// use textwrap::WordSeparator::UnicodeBreakProperties;
    ///
    /// assert_eq!(UnicodeBreakProperties.find_words("[ foo ] bar !").collect::<Vec<_>>(),
    ///            vec![Word::from("[ foo ] "),
    ///                 Word::from("bar !")]);
    /// }
    /// ```
    #[cfg(feature = "unicode-linebreak")]
    UnicodeBreakProperties,

    /// Find words using a custom word separator.
    ///
    /// The function receives the line of text and returns a boxed
    /// iterator over the words it found in that line.
    Custom(fn(line: &str) -> Box<dyn Iterator<Item = Word<'_>> + '_>),
}

impl std::fmt::Debug for WordSeparator {
    /// Write the name of the variant.
    ///
    /// The function pointer held by `Custom` is not printed; a fixed
    /// `Custom(...)` placeholder is written in its place.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            WordSeparator::AsciiSpace => "AsciiSpace",
            #[cfg(feature = "unicode-linebreak")]
            WordSeparator::UnicodeBreakProperties => "UnicodeBreakProperties",
            WordSeparator::Custom(_) => "Custom(...)",
        };
        f.write_str(label)
    }
}

impl WordSeparator {
    /// Find all words in `line`.
    ///
    /// The work is delegated to the routine selected by `self`: one
    /// of the built-in word finders, or the function stored in
    /// [`WordSeparator::Custom`]. Each of them already produces a
    /// `Box<dyn Iterator>`, which is handed back unchanged.
    pub fn find_words<'a>(&self, line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
        // `WordSeparator` is `Copy`, so it can be matched by value.
        match *self {
            WordSeparator::Custom(separator) => separator(line),
            WordSeparator::AsciiSpace => find_words_ascii_space(line),
            #[cfg(feature = "unicode-linebreak")]
            WordSeparator::UnicodeBreakProperties => find_words_unicode_break_properties(line),
        }
    }
}

/// Split `line` into words separated by runs of ASCII spaces.
///
/// A new word starts at every non-space character that directly
/// follows a space. Each word is built from a contiguous slice of
/// `line` which includes the spaces trailing it, so the slices cover
/// the whole line. Only `' '` counts as a separator.
fn find_words_ascii_space<'a>(line: &'a str) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    let mut word_start = 0;
    let mut prev_was_space = false;
    let mut chars = line.char_indices();

    Box::new(std::iter::from_fn(move || {
        // Iterate through a mutable borrow: the closure owns `chars`
        // and must resume scanning where the previous call stopped.
        for (pos, ch) in chars.by_ref() {
            let is_space = ch == ' ';
            if prev_was_space && !is_space {
                // A space-to-non-space transition closes the current
                // word and opens the next one at `pos`.
                let word = Word::from(&line[word_start..pos]);
                word_start = pos;
                prev_was_space = false;
                return Some(word);
            }
            prev_was_space = is_space;
        }

        // The characters are exhausted: nothing left to emit once the
        // remainder of the line has been handed out.
        if word_start >= line.len() {
            return None;
        }

        let word = Word::from(&line[word_start..]);
        word_start = line.len();
        Some(word)
    }))
}

/// Return a copy of `text` without the characters for which
/// `skip_ansi_escape_sequence` reports an ANSI escape sequence.
#[cfg(feature = "unicode-linebreak")]
fn strip_ansi_escape_sequences(text: &str) -> String {
    let mut stripped = String::with_capacity(text.len());
    let mut remaining = text.chars();

    // A plain `for` loop cannot be used here since the helper needs
    // mutable access to the very iterator being walked.
    while let Some(ch) = remaining.next() {
        if !skip_ansi_escape_sequence(ch, &mut remaining) {
            stripped.push(ch);
        }
    }

    stripped
}

/// Soft hyphen (U+00AD), also known as a “shy hyphen”. Should show up
/// as ‘-’ if a line is broken at this point, and otherwise be
/// invisible. Textwrap does not currently support breaking words at
/// soft hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';

/// Find words in `line` using the Unicode line breaking algorithm.
///
/// ANSI escape sequences in `line` are ignored when looking for
/// break opportunities, but they are kept in the returned words:
/// every word is built from a contiguous slice of `line`.
///
/// Break opportunities directly after a `'-'` or a soft hyphen are
/// suppressed.
#[cfg(feature = "unicode-linebreak")]
fn find_words_unicode_break_properties<'a>(
    line: &'a str,
) -> Box<dyn Iterator<Item = Word<'a>> + 'a> {
    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || {
        let (orig_idx, ch) = char_indices.next()?;
        let stripped_idx = last_stripped_idx;
        // Only characters which are not part of an escape sequence
        // advance the position in the stripped string. (Presumably
        // the helper also consumes the rest of the escape sequence
        // from the iterator -- see `skip_ansi_escape_sequence`.)
        if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
            last_stripped_idx += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|&(idx, _)| {
            // The final break opportunity sits at the end of the
            // text. We drop it here and add the last word below using
            // &line[start..]; this ensures that we correctly include
            // a trailing ANSI escape sequence.
            //
            // This must be decided by index, not by popping the last
            // element after filtering: when the text ends in a
            // hyphen, the end-of-text break is already removed by the
            // checks below, and popping would then discard a genuine
            // break opportunity instead.
            if idx >= stripped.len() {
                return false;
            }

            // Kept as a `match` so that every suppressed character
            // carries its own explanation.
            #[allow(clippy::match_like_matches_macro)]
            match stripped[..idx].chars().next_back() {
                // We suppress breaks at ‘-’ since we want to control
                // this via the WordSplitter.
                Some('-') => false,
                // Soft hyphens are currently not supported since we
                // require all `Word` fragments to be continuous in
                // the input string.
                Some(SHY) => false,
                // Other breaks should be fine!
                _ => true,
            }
        })
        // Collecting ends the borrow of `stripped`, which does not
        // live as long as the iterator we return.
        .collect::<Vec<_>>()
        .into_iter();

    let mut start = 0;
    Box::new(std::iter::from_fn(move || {
        for (idx, _) in opportunities.by_ref() {
            // Translate the break position in the stripped string
            // back to a position in the original string. `idx_map` is
            // only ever advanced, so the search resumes where the
            // previous one stopped.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // Emit whatever is left of the line as the final word.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    }))
}

#[cfg(test)]
mod tests {
    use super::WordSeparator::*;
    use super::*;

    // Like assert_eq!, but the left expression is an iterator: it is
    // collected into a `Vec` before being compared with the right
    // expression.
    macro_rules! assert_iter_eq {
        ($left:expr, $right:expr) => {
            assert_eq!($left.collect::<Vec<_>>(), $right);
        };
    }

    // Turn a list of string slices into the corresponding list of
    // `Word`s, preserving the order.
    fn to_words<'a>(words: Vec<&'a str>) -> Vec<Word<'a>> {
        words.iter().map(|&text| Word::from(text)).collect()
    }

    // Generate a pair of tests from a list of
    // `[line, ascii_words, unicode_words]` cases.
    //
    // The test named `$ascii_name` checks every `line` against
    // `ascii_words` using `WordSeparator::AsciiSpace`. The test named
    // `$unicode_name` checks the same lines against `unicode_words`
    // using `WordSeparator::UnicodeBreakProperties`; it is only
    // compiled when the `unicode-linebreak` feature is enabled.
    macro_rules! test_find_words {
        ($ascii_name:ident,
         $unicode_name:ident,
         $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
            #[test]
            fn $ascii_name() {
                $(
                    let expected_words = to_words($ascii_words.to_vec());
                    let actual_words = WordSeparator::AsciiSpace
                        .find_words($line)
                        .collect::<Vec<_>>();
                    // The line is included in the failure message to
                    // identify the failing case.
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }

            #[test]
            #[cfg(feature = "unicode-linebreak")]
            fn $unicode_name() {
                $(
                    let expected_words = to_words($unicode_words.to_vec());
                    let actual_words = WordSeparator::UnicodeBreakProperties
                        .find_words($line)
                        .collect::<Vec<_>>();
                    assert_eq!(actual_words, expected_words, "Line: {:?}", $line);
                )+
            }
        };
    }

    // An empty line contains no words.
    test_find_words!(ascii_space_empty, unicode_empty, ["", [], []]);

    test_find_words!(
        ascii_single_word,
        unicode_single_word,
        ["foo", ["foo"], ["foo"]]
    );

    // The space separating two words stays attached to the first word.
    test_find_words!(
        ascii_two_words,
        unicode_two_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]]
    );

    test_find_words!(
        ascii_multiple_words,
        unicode_multiple_words,
        ["foo bar", ["foo ", "bar"], ["foo ", "bar"]],
        ["x y z", ["x ", "y ", "z"], ["x ", "y ", "z"]]
    );

    // A line of nothing but spaces is a single word.
    test_find_words!(
        ascii_only_whitespace,
        unicode_only_whitespace,
        [" ", [" "], [" "]],
        ["    ", ["    "], ["    "]]
    );

    test_find_words!(
        ascii_inter_word_whitespace,
        unicode_inter_word_whitespace,
        ["foo   bar", ["foo   ", "bar"], ["foo   ", "bar"]]
    );

    test_find_words!(
        ascii_trailing_whitespace,
        unicode_trailing_whitespace,
        ["foo   ", ["foo   "], ["foo   "]]
    );

    // Leading spaces form a word of their own.
    test_find_words!(
        ascii_leading_whitespace,
        unicode_leading_whitespace,
        ["   foo", ["   ", "foo"], ["   ", "foo"]]
    );

    test_find_words!(
        ascii_multi_column_char,
        unicode_multi_column_char,
        ["\u{1f920}", ["\u{1f920}"], ["\u{1f920}"]] // cowboy emoji 🤠
    );

    // Neither separator breaks directly after a hyphen.
    test_find_words!(
        ascii_hyphens,
        unicode_hyphens,
        ["foo-bar", ["foo-bar"], ["foo-bar"]],
        ["foo- bar", ["foo- ", "bar"], ["foo- ", "bar"]],
        ["foo - bar", ["foo ", "- ", "bar"], ["foo ", "- ", "bar"]],
        ["foo -bar", ["foo ", "-bar"], ["foo ", "-bar"]]
    );

    // Newlines and tabs only separate words for the Unicode separator.
    test_find_words!(
        ascii_newline,
        unicode_newline,
        ["foo\nbar", ["foo\nbar"], ["foo\n", "bar"]]
    );

    test_find_words!(
        ascii_tab,
        unicode_tab,
        ["foo\tbar", ["foo\tbar"], ["foo\t", "bar"]]
    );

    // A non-breaking space (U+00A0) never separates words.
    test_find_words!(
        ascii_non_breaking_space,
        unicode_non_breaking_space,
        ["foo\u{00A0}bar", ["foo\u{00A0}bar"], ["foo\u{00A0}bar"]]
    );

    // Color escape codes surrounding words must end up inside the
    // words they surround, for both word separators.
    #[test]
    #[cfg(unix)]
    fn find_words_colored_text() {
        use termion::color::{Blue, Fg, Green, Reset};

        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
        let line = format!("{}{}", green_hello, blue_world);
        let expected = vec![Word::from(&green_hello), Word::from(&blue_world)];

        assert_iter_eq!(AsciiSpace.find_words(&line), expected);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(UnicodeBreakProperties.find_words(&line), expected);
    }

    // Escape sequences in the middle of a word must not split it.
    #[test]
    fn find_words_color_inside_word() {
        let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
        let expected = vec![Word::from(text)];

        assert_iter_eq!(AsciiSpace.find_words(text), expected);

        #[cfg(feature = "unicode-linebreak")]
        assert_iter_eq!(UnicodeBreakProperties.find_words(text), expected);
    }
}