/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "smartselect/tokenizer.h"

#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier {
namespace {

using testing::ElementsAreArray;

// Test helper that exposes FindTokenizationRole() for direct testing.
class TestingTokenizer : public Tokenizer {
 public:
  explicit TestingTokenizer(
      const std::vector<TokenizationCodepointRange>& codepoint_range_configs)
      : Tokenizer(codepoint_range_configs) {}

  TokenizationCodepointRange::Role TestFindTokenizationRole(int c) const {
    return FindTokenizationRole(c);
  }
};

TEST(TokenizerTest, FindTokenizationRole) {
  std::vector<TokenizationCodepointRange> configs;
  TokenizationCodepointRange* config;

  configs.emplace_back();
  config = &configs.back();
  config->set_start(0);
  config->set_end(10);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);

  configs.emplace_back();
  config = &configs.back();
  config->set_start(32);
  config->set_end(33);
  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  configs.emplace_back();
  config = &configs.back();
  config->set_start(1234);
  config->set_end(12345);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);

  TestingTokenizer tokenizer(configs);

  // Test hits to the first group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(0),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(5),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(10),
            TokenizationCodepointRange::DEFAULT_ROLE);

  // Test a hit to the second group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(31),
            TokenizationCodepointRange::DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(32),
            TokenizationCodepointRange::WHITESPACE_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(33),
            TokenizationCodepointRange::DEFAULT_ROLE);

  // Test hits to the third group.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1233),
            TokenizationCodepointRange::DEFAULT_ROLE);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(1234),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12344),
            TokenizationCodepointRange::TOKEN_SEPARATOR);
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(12345),
            TokenizationCodepointRange::DEFAULT_ROLE);

  // Test a hit outside.
  EXPECT_EQ(tokenizer.TestFindTokenizationRole(99),
            TokenizationCodepointRange::DEFAULT_ROLE);
}

TEST(TokenizerTest, TokenizeOnSpace) {
  std::vector<TokenizationCodepointRange> configs;
  TokenizationCodepointRange* config;

  configs.emplace_back();
  config = &configs.back();
  // Space character.
  config->set_start(32);
  config->set_end(33);
  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  TestingTokenizer tokenizer(configs);
  std::vector<Token> tokens = tokenizer.Tokenize("Hello world!");

  EXPECT_THAT(tokens,
              ElementsAreArray({Token("Hello", 0, 5), Token("world!", 6, 12)}));
}

TEST(TokenizerTest, TokenizeComplex) {
  std::vector<TokenizationCodepointRange> configs;
  TokenizationCodepointRange* config;

  // Source: http://www.unicode.org/Public/10.0.0/ucd/Blocks-10.0.0d1.txt
  // Latin - Cyrillic.
  //   0000..007F; Basic Latin
  //   0080..00FF; Latin-1 Supplement
  //   0100..017F; Latin Extended-A
  //   0180..024F; Latin Extended-B
  //   0250..02AF; IPA Extensions
  //   02B0..02FF; Spacing Modifier Letters
  //   0300..036F; Combining Diacritical Marks
  //   0370..03FF; Greek and Coptic
  //   0400..04FF; Cyrillic
  //   0500..052F; Cyrillic Supplement
  //   0530..058F; Armenian
  //   0590..05FF; Hebrew
  //   0600..06FF; Arabic
  //   0700..074F; Syriac
  //   0750..077F; Arabic Supplement
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0);
  config->set_end(32);
  config->set_role(TokenizationCodepointRange::DEFAULT_ROLE);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(32);
  config->set_end(33);
  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(33);
  config->set_end(0x77F + 1);
  config->set_role(TokenizationCodepointRange::DEFAULT_ROLE);

  // CJK
  //   2E80..2EFF; CJK Radicals Supplement
  //   3000..303F; CJK Symbols and Punctuation
  //   3040..309F; Hiragana
  //   30A0..30FF; Katakana
  //   3100..312F; Bopomofo
  //   3130..318F; Hangul Compatibility Jamo
  //   3190..319F; Kanbun
  //   31A0..31BF; Bopomofo Extended
  //   31C0..31EF; CJK Strokes
  //   31F0..31FF; Katakana Phonetic Extensions
  //   3200..32FF; Enclosed CJK Letters and Months
  //   3300..33FF; CJK Compatibility
  //   3400..4DBF; CJK Unified Ideographs Extension A
  //   4DC0..4DFF; Yijing Hexagram Symbols
  //   4E00..9FFF; CJK Unified Ideographs
  //   A000..A48F; Yi Syllables
  //   A490..A4CF; Yi Radicals
  //   A4D0..A4FF; Lisu
  //   A500..A63F; Vai
  //   F900..FAFF; CJK Compatibility Ideographs
  //   FE30..FE4F; CJK Compatibility Forms
  //   20000..2A6DF; CJK Unified Ideographs Extension B
  //   2A700..2B73F; CJK Unified Ideographs Extension C
  //   2B740..2B81F; CJK Unified Ideographs Extension D
  //   2B820..2CEAF; CJK Unified Ideographs Extension E
  //   2CEB0..2EBEF; CJK Unified Ideographs Extension F
  //   2F800..2FA1F; CJK Compatibility Ideographs Supplement
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x2E80);
  config->set_end(0x2EFF + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x3000);
  config->set_end(0xA63F + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0xF900);
  config->set_end(0xFAFF + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0xFE30);
  config->set_end(0xFE4F + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x20000);
  config->set_end(0x2A6DF + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x2A700);
  config->set_end(0x2B73F + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x2B740);
  config->set_end(0x2B81F + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x2B820);
  config->set_end(0x2CEAF + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x2CEB0);
  config->set_end(0x2EBEF + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x2F800);
  config->set_end(0x2FA1F + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);

  // Thai.
  //   0E00..0E7F; Thai
  configs.emplace_back();
  config = &configs.back();
  config->set_start(0x0E00);
  config->set_end(0x0E7F + 1);
  config->set_role(TokenizationCodepointRange::TOKEN_SEPARATOR);

  Tokenizer tokenizer(configs);
  std::vector<Token> tokens;

  tokens = tokenizer.Tokenize(
      "問少目木輸走猶術権自京門録球変。細開括省用掲情結傍走愛明氷。");
  EXPECT_EQ(tokens.size(), 30);

  tokens = tokenizer.Tokenize("問少目 hello 木輸ยามきゃ");
  // clang-format off
  EXPECT_THAT(
      tokens,
      ElementsAreArray({Token("問", 0, 1),
                        Token("少", 1, 2),
                        Token("目", 2, 3),
                        Token("hello", 4, 9),
                        Token("木", 10, 11),
                        Token("輸", 11, 12),
                        Token("ย", 12, 13),
                        Token("า", 13, 14),
                        Token("ม", 14, 15),
                        Token("き", 15, 16),
                        Token("ゃ", 16, 17)}));
  // clang-format on
}

}  // namespace
}  // namespace libtextclassifier