/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tokenizer.h"

#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {
namespace {

using testing::ElementsAreArray;

// Test-only subclass that re-exports the protected FindTokenizationRange so
// the range lookup can be exercised directly.
class TestingTokenizer : public Tokenizer {
 public:
  TestingTokenizer(
      const TokenizationType type, const UniLib* unilib,
      const std::vector<const TokenizationCodepointRange*>& codepoint_ranges,
      const std::vector<const CodepointRange*>&
          internal_tokenizer_codepoint_ranges,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens,
      const bool preserve_floating_numbers)
      : Tokenizer(type, unilib, codepoint_ranges,
                  internal_tokenizer_codepoint_ranges, split_on_script_change,
                  icu_preserve_whitespace_tokens, preserve_floating_numbers) {}

  using Tokenizer::FindTokenizationRange;
};

// Serializes object-API (...T) codepoint-range configs into flatbuffers,
// keeps the buffers alive for the tokenizer's lifetime, and constructs a
// TestingTokenizer over the resulting flatbuffer pointers.
class TestingTokenizerProxy {
 public:
  TestingTokenizerProxy(
      TokenizationType type,
      const std::vector<TokenizationCodepointRangeT>& codepoint_range_configs,
      const std::vector<CodepointRangeT>& internal_codepoint_range_configs,
      const bool split_on_script_change,
      const bool icu_preserve_whitespace_tokens,
      const bool preserve_floating_numbers)
      : INIT_UNILIB_FOR_TESTING(unilib_) {
    const int num_configs = codepoint_range_configs.size();
    std::vector<const TokenizationCodepointRange*> configs_fb;
    configs_fb.reserve(num_configs);
    const int num_internal_configs = internal_codepoint_range_configs.size();
    std::vector<const CodepointRange*> internal_configs_fb;
    internal_configs_fb.reserve(num_internal_configs);
    // Reserve up front: the GetRoot pointers below point into the stored
    // buffers, so buffers_ must never reallocate while we build the configs.
    buffers_.reserve(num_configs + num_internal_configs);
    for (int i = 0; i < num_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(CreateTokenizationCodepointRange(
          builder, &codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      configs_fb.push_back(flatbuffers::GetRoot<TokenizationCodepointRange>(
          buffers_.back().data()));
    }
    for (int i = 0; i < num_internal_configs; i++) {
      flatbuffers::FlatBufferBuilder builder;
      builder.Finish(
          CreateCodepointRange(builder, &internal_codepoint_range_configs[i]));
      buffers_.push_back(builder.Release());
      internal_configs_fb.push_back(
          flatbuffers::GetRoot<CodepointRange>(buffers_.back().data()));
    }
    tokenizer_ = std::unique_ptr<TestingTokenizer>(new TestingTokenizer(
        type, &unilib_, configs_fb, internal_configs_fb, split_on_script_change,
        icu_preserve_whitespace_tokens, preserve_floating_numbers));
  }

  // Returns the role configured for codepoint `c`, or DEFAULT_ROLE when no
  // configured range contains it.
  TokenizationCodepointRange_::Role TestFindTokenizationRole(int c) const {
    const TokenizationCodepointRangeT* range =
        tokenizer_->FindTokenizationRange(c);
    if (range != nullptr) {
      return range->role;
    } else {
      return TokenizationCodepointRange_::Role_DEFAULT_ROLE;
    }
  }

  std::vector<Token> Tokenize(const std::string& utf8_text) const {
    return tokenizer_->Tokenize(utf8_text);
  }

 private:
  UniLib unilib_;
  // Owns the serialized configs; configs_fb/internal_configs_fb point here.
  std::vector<flatbuffers::DetachedBuffer> buffers_;
  std::unique_ptr<TestingTokenizer> tokenizer_;
};

TEST(TokenizerTest, FindTokenizationRange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 10;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  configs.emplace_back();
  config = &configs.back();
  config->start = 1234;
  config->end = 12345;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  // Test hits to the first group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a hit to the second group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test hits to the third group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange_::Role_TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);

  // Test a hit outside.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange_::Role_DEFAULT_ROLE);
}

TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  // Space character.
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");

  EXPECT_THAT(tokens, ElementsAreArray(
                          {Token("Hello", 0, 5), Token("world!", 6, 12)}));
}

TEST(TokenizerTest, TokenizeOnSpaceAndScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Latin.
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  config->script_id = 1;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  config->script_id = 1;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/true,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  EXPECT_THAT(tokenizer.Tokenize("앨라배마 주 전화(123) 456-789웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("주", 5, 6),
                                  Token("전화", 7, 10), Token("(123)", 10, 15),
                                  Token("456-789", 16, 23),
                                  Token("웹사이트", 23, 28)}));
}

TEST(TokenizerTest, TokenizeComplex) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  // Latin - cyrilic.
  //   0000..007F; Basic Latin
  //   0080..00FF; Latin-1 Supplement
  //   0100..017F; Latin Extended-A
  //   0180..024F; Latin Extended-B
  //   0250..02AF; IPA Extensions
  //   02B0..02FF; Spacing Modifier Letters
  //   0300..036F; Combining Diacritical Marks
  //   0370..03FF; Greek and Coptic
  //   0400..04FF; Cyrillic
  //   0500..052F; Cyrillic Supplement
  //   0530..058F; Armenian
  //   0590..05FF; Hebrew
  //   0600..06FF; Arabic
  //   0700..074F; Syriac
  //   0750..077F; Arabic Supplement
  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 32;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 33;
  config->end = 0x77F + 1;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  // CJK
  //   2E80..2EFF; CJK Radicals Supplement
  //   3000..303F; CJK Symbols and Punctuation
  //   3040..309F; Hiragana
  //   30A0..30FF; Katakana
  //   3100..312F; Bopomofo
  //   3130..318F; Hangul Compatibility Jamo
  //   3190..319F; Kanbun
  //   31A0..31BF; Bopomofo Extended
  //   31C0..31EF; CJK Strokes
  //   31F0..31FF; Katakana Phonetic Extensions
  //   3200..32FF; Enclosed CJK Letters and Months
  //   3300..33FF; CJK Compatibility
  //   3400..4DBF; CJK Unified Ideographs Extension A
  //   4DC0..4DFF; Yijing Hexagram Symbols
  //   4E00..9FFF; CJK Unified Ideographs
  //   A000..A48F; Yi Syllables
  //   A490..A4CF; Yi Radicals
  //   A4D0..A4FF; Lisu
  //   A500..A63F; Vai
  //   F900..FAFF; CJK Compatibility Ideographs
  //   FE30..FE4F; CJK Compatibility Forms
  //   20000..2A6DF; CJK Unified Ideographs Extension B
  //   2A700..2B73F; CJK Unified Ideographs Extension C
  //   2B740..2B81F; CJK Unified Ideographs Extension D
  //   2B820..2CEAF; CJK Unified Ideographs Extension E
  //   2CEB0..2EBEF; CJK Unified Ideographs Extension F
  //   2F800..2FA1F; CJK Compatibility Ideographs Supplement
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2E80;
  config->end = 0x2EFF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x3000;
  config->end = 0xA63F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0xF900;
  config->end = 0xFAFF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0xFE30;
  config->end = 0xFE4F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x20000;
  config->end = 0x2A6DF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2A700;
  config->end = 0x2B73F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2B740;
  config->end = 0x2B81F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2B820;
  config->end = 0x2CEAF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2CEB0;
  config->end = 0x2EBEF + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x2F800;
  config->end = 0x2FA1F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  // Thai.
  //   0E00..0E7F; Thai
  configs.emplace_back();
  config = &configs.back();
  config->start = 0x0E00;
  config->end = 0x0E7F + 1;
  config->role = TokenizationCodepointRange_::Role_TOKEN_SEPARATOR;

  TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER, configs,
                                  {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens;

  tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(tokens.size(), 30);

  tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  EXPECT_THAT(
      tokens,
      ElementsAreArray({Token("問", 0, 1),
                        Token("少", 1, 2),
                        Token("目", 2, 3),
                        Token("hello", 4, 9),
                        Token("木", 10, 11),
                        Token("輸", 11, 12),
                        Token("ย", 12, 13),
                        Token("า", 13, 14),
                        Token("ม", 14, 15),
                        Token("き", 15, 16),
                        Token("ゃ", 16, 17)}));
  // clang-format on
}

#if defined(TC3_TEST_ICU) || defined(__APPLE__)
TEST(TokenizerTest, ICUTokenizeWithWhitespaces) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token(" ", 6, 7),
                                Token("สมเด็จ", 7, 13),
                                Token(" ", 13, 14),
                                Token("พระ", 14, 17),
                                Token(" ", 17, 18),
                                Token("ปร", 18, 20),
                                Token(" ", 20, 21),
                                Token("มิ", 21, 23)}));
  // clang-format on
}

TEST(TokenizerTest, ICUTokenizePunctuation) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens =
      tokenizer.Tokenize("The interval is: -(12, 138*)");
  // clang-format off
  ASSERT_EQ(
      tokens,
      std::vector<Token>({Token("The", 0, 3),
                          Token(" ", 3, 4),
                          Token("interval", 4, 12),
                          Token(" ", 12, 13),
                          Token("is", 13, 15),
                          Token(":", 15, 16),
                          Token(" ", 16, 17),
                          Token("-", 17, 18),
                          Token("(", 18, 19),
                          Token("12", 19, 21),
                          Token(",", 21, 22),
                          Token(" ", 22, 23),
                          Token("138", 23, 26),
                          Token("*", 26, 27),
                          Token(")", 27, 28)}));
  // clang-format on
}

TEST(TokenizerTest, ICUTokenizeWithNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/true,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("3.1 3﹒2 3.3");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("3.1", 0, 3),
                                Token(" ", 3, 4),
                                Token("3﹒2", 4, 7),
                                Token(" ", 7, 8),
                                Token("3.3", 8, 11)}));
  // clang-format on
}
#endif

#if defined(TC3_TEST_ICU)
TEST(TokenizerTest, ICUTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_ICU, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("พระบาทสมเด็จพระปรมิ");
  // clang-format off
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("พระบาท", 0, 6),
                                Token("สมเด็จ", 6, 12),
                                Token("พระ", 12, 15),
                                Token("ปร", 15, 17),
                                Token("มิ", 17, 19)}));
  // clang-format on
}

TEST(TokenizerTest, MixedTokenize) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 32;
  config->end = 33;
  config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  std::vector<CodepointRangeT> internal_configs;
  CodepointRangeT* internal_config;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 0;
  internal_config->end = 128;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 128;
  internal_config->end = 256;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 256;
  internal_config->end = 384;

  internal_configs.emplace_back();
  internal_config = &internal_configs.back();
  internal_config->start = 384;
  internal_config->end = 592;

  TestingTokenizerProxy tokenizer(TokenizationType_MIXED, configs,
                                  internal_configs,
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);

  std::vector<Token> tokens = tokenizer.Tokenize(
      "こんにちはJapanese-ląnguagę text 你好世界 http://www.google.com/");
  ASSERT_EQ(
      tokens,
      // clang-format off
      std::vector<Token>({Token("こんにちは", 0, 5),
                          Token("Japanese-ląnguagę", 5, 22),
                          Token("text", 23, 27),
                          Token("你好", 28, 30),
                          Token("世界", 30, 32),
                          Token("http://www.google.com/", 33, 55)}));
  // clang-format on
}

TEST(TokenizerTest, InternalTokenizeOnScriptChange) {
  std::vector<TokenizationCodepointRangeT> configs;
  TokenizationCodepointRangeT* config;

  configs.emplace_back();
  config = &configs.back();
  config->start = 0;
  config->end = 256;
  config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/false,
                                    /*icu_preserve_whitespace_tokens=*/false,
                                    /*preserve_floating_numbers=*/false);

    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마123웹사이트", 0, 11)}));
  }

  {
    TestingTokenizerProxy tokenizer(TokenizationType_INTERNAL_TOKENIZER,
                                    configs, {},
                                    /*split_on_script_change=*/true,
                                    /*icu_preserve_whitespace_tokens=*/false,
                                    /*preserve_floating_numbers=*/false);
    EXPECT_EQ(tokenizer.Tokenize("앨라배마123웹사이트"),
              std::vector<Token>({Token("앨라배마", 0, 4), Token("123", 4, 7),
                                  Token("웹사이트", 7, 11)}));
  }
}
#endif

TEST(TokenizerTest, LetterDigitTokenize) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("7% -3.14 68.9#? 7% $99 .18.");
  ASSERT_EQ(tokens,
            std::vector<Token>(
                {Token("7", 0, 1), Token("%", 1, 2), Token(" ", 2, 3),
                 Token("-", 3, 4), Token("3.14", 4, 8), Token(" ", 8, 9),
                 Token("68.9", 9, 13), Token("#", 13, 14), Token("?", 14, 15),
                 Token(" ", 15, 16), Token("7", 16, 17), Token("%", 17, 18),
                 Token(" ", 18, 19), Token("$", 19, 20), Token("99", 20, 22),
                 Token(" ", 22, 23), Token(".", 23, 24), Token("18", 24, 26),
                 Token(".", 26, 27)}));
}

TEST(TokenizerTest, LetterDigitTokenizeUnicode) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("2 pércént 3パーセント");
  ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
                                        Token("pércént", 2, 9),
                                        Token(" ", 9, 10), Token("3", 10, 11),
                                        Token("パーセント", 11, 16)}));
}

TEST(TokenizerTest, LetterDigitTokenizeWithDots) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/true);
  std::vector<Token> tokens = tokenizer.Tokenize("3 3﹒2 3.3%");
  ASSERT_EQ(tokens, std::vector<Token>({Token("3", 0, 1), Token(" ", 1, 2),
                                        Token("3﹒2", 2, 5), Token(" ", 5, 6),
                                        Token("3.3", 6, 9),
                                        Token("%", 9, 10)}));
}

TEST(TokenizerTest, LetterDigitTokenizeDoNotPreserveFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("15.12.2019 january's 3.2");
  ASSERT_EQ(tokens,
            std::vector<Token>(
                {Token("15", 0, 2), Token(".", 2, 3), Token("12", 3, 5),
                 Token(".", 5, 6), Token("2019", 6, 10), Token(" ", 10, 11),
                 Token("january", 11, 18), Token("'", 18, 19),
                 Token("s", 19, 20), Token(" ", 20, 21), Token("3", 21, 22),
                 Token(".", 22, 23), Token("2", 23, 24)}));
}

TEST(TokenizerTest, LetterDigitTokenizeStrangeStringFloatingNumbers) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  std::vector<Token> tokens = tokenizer.Tokenize("The+2345++the +íí+");
  ASSERT_EQ(tokens,
            std::vector<Token>({Token("The", 0, 3), Token("+", 3, 4),
                                Token("2345", 4, 8), Token("+", 8, 9),
                                Token("+", 9, 10), Token("the", 10, 13),
                                Token(" ", 13, 14), Token("+", 14, 15),
                                Token("íí", 15, 17), Token("+", 17, 18)}));
}

TEST(TokenizerTest, LetterDigitTokenizeWhitespcesInSameToken) {
  TestingTokenizerProxy tokenizer(TokenizationType_LETTER_DIGIT, {}, {},
                                  /*split_on_script_change=*/false,
                                  /*icu_preserve_whitespace_tokens=*/false,
                                  /*preserve_floating_numbers=*/false);
  // NOTE(review): the asserted offsets require the two- and three-space runs
  // below; they were collapsed to single spaces in the corrupted source.
  std::vector<Token> tokens = tokenizer.Tokenize("2 3  4   5");
  ASSERT_EQ(tokens, std::vector<Token>({Token("2", 0, 1), Token(" ", 1, 2),
                                        Token("3", 2, 3), Token("  ", 3, 5),
                                        Token("4", 5, 6), Token("   ", 6, 9),
                                        Token("5", 9, 10)}));
}

}  // namespace
}  // namespace libtextclassifier3