1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "util/utf8/unilib.h"
18
19 #include "util/base/logging.h"
20 #include "util/utf8/unicodetext.h"
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23
24 namespace libtextclassifier2 {
25 namespace {
26
27 using ::testing::ElementsAre;
28
// Exercises the character-class predicates, lowercasing and bracket pairing
// of UniLib on plain ASCII input.
TEST(UniLibTest, CharacterClassesAscii) {
  CREATE_UNILIB_FOR_TESTING;

  // ')' doubles as the negative example for the whitespace, digit and
  // uppercase predicates, and as a character that lowercasing leaves alone.
  const char kCloseParen = ')';

  // Brackets and their counterparts.
  EXPECT_TRUE(unilib.IsOpeningBracket('('));
  EXPECT_TRUE(unilib.IsClosingBracket(kCloseParen));
  EXPECT_EQ(unilib.GetPairedBracket(kCloseParen), '(');
  EXPECT_EQ(unilib.GetPairedBracket('}'), '{');

  // Whitespace.
  EXPECT_TRUE(unilib.IsWhitespace(' '));
  EXPECT_FALSE(unilib.IsWhitespace(kCloseParen));

  // Digits: both ends of the ASCII digit range.
  EXPECT_TRUE(unilib.IsDigit('0'));
  EXPECT_TRUE(unilib.IsDigit('9'));
  EXPECT_FALSE(unilib.IsDigit(kCloseParen));

  // Uppercase letters: both ends of the ASCII uppercase range.
  EXPECT_TRUE(unilib.IsUpper('A'));
  EXPECT_TRUE(unilib.IsUpper('Z'));
  EXPECT_FALSE(unilib.IsUpper(kCloseParen));

  // Lowercasing maps letters and is the identity on a non-letter.
  EXPECT_EQ(unilib.ToLower('A'), 'a');
  EXPECT_EQ(unilib.ToLower('Z'), 'z');
  EXPECT_EQ(unilib.ToLower(kCloseParen), kCloseParen);
}
47
48 #ifndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
// Exercises the character-class predicates, lowercasing and bracket pairing
// of UniLib on code points outside the ASCII range.
TEST(UniLibTest, CharacterClassesUnicode) {
  CREATE_UNILIB_FOR_TESTING;

  // Brackets: each half is classified correctly and maps to the other half.
  const int kOpeningBracket = 0x0F3C;  // TIBET ANG KHANG GYON
  const int kClosingBracket = 0x0F3D;  // TIBET ANG KHANG GYAS
  EXPECT_TRUE(unilib.IsOpeningBracket(kOpeningBracket));
  EXPECT_TRUE(unilib.IsClosingBracket(kClosingBracket));
  EXPECT_EQ(unilib.GetPairedBracket(kOpeningBracket), kClosingBracket);
  EXPECT_EQ(unilib.GetPairedBracket(kClosingBracket), kOpeningBracket);

  // Whitespace.
  EXPECT_TRUE(unilib.IsWhitespace(0x2003));   // EM SPACE
  EXPECT_FALSE(unilib.IsWhitespace(0x23F0));  // ALARM CLOCK

  // Digits: the first and last digit of a range and its direct neighbours.
  EXPECT_TRUE(unilib.IsDigit(0xA620));   // VAI DIGIT ZERO
  EXPECT_TRUE(unilib.IsDigit(0xA629));   // VAI DIGIT NINE
  EXPECT_FALSE(unilib.IsDigit(0xA619));  // VAI SYMBOL JONG
  EXPECT_FALSE(unilib.IsDigit(0xA62A));  // VAI SYLLABLE NDOLE MA

  // Uppercase letters and adjacent lowercase ones.
  EXPECT_TRUE(unilib.IsUpper(0x0212));   // CAPITAL R WITH DOUBLE GRAVE
  EXPECT_TRUE(unilib.IsUpper(0x0391));   // GREEK CAPITAL ALPHA
  EXPECT_TRUE(unilib.IsUpper(0x03AB));   // GREEK CAPITAL UPSILON W DIAL
  EXPECT_FALSE(unilib.IsUpper(0x0211));  // SMALL R WITH DOUBLE GRAVE
  EXPECT_FALSE(unilib.IsUpper(0x03AC));  // GREEK SMALL ALPHA WITH TONOS

  // Lowercasing maps capitals and leaves an already-small letter unchanged.
  EXPECT_EQ(unilib.ToLower(0x0391), 0x03B1);  // GREEK ALPHA
  EXPECT_EQ(unilib.ToLower(0x03AB), 0x03CB);  // GREEK UPSILON WITH DIALYTIKA
  EXPECT_EQ(unilib.ToLower(0x03C0), 0x03C0);  // GREEK SMALL PI
}
71 #endif // ndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
72
// Smoke test of the regex API surface: every matcher entry point is called
// once and its result is only logged, not asserted.
TEST(UniLibTest, RegexInterface) {
  CREATE_UNILIB_FOR_TESTING;

  const UnicodeText digits_regex =
      UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true);
  const UnicodeText haystack =
      UTF8ToUnicodeText("hello 0123", /*do_copy=*/false);

  const std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib.CreateRegexPattern(digits_regex);
  const std::unique_ptr<UniLib::RegexMatcher> matcher =
      compiled->Matcher(haystack);

  // Out-parameter shared by all calls below; its value is never inspected.
  int status;
  TC_LOG(INFO) << matcher->Matches(&status);
  TC_LOG(INFO) << matcher->Find(&status);
  TC_LOG(INFO) << matcher->Start(0, &status);
  TC_LOG(INFO) << matcher->End(0, &status);
  TC_LOG(INFO) << matcher->Group(0, &status).size_codepoints();
}
88
89 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Verifies whole-input matching, searching, and the code point offsets
// reported for a match when the text contains 4-byte UTF-8 characters.
TEST(UniLibTest, Regex) {
  CREATE_UNILIB_FOR_TESTING;

  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly. It is written below as its UTF-8 byte sequence
  // "\xF0\x9F\x98\x8B" so that the literals survive tools that drop
  // characters outside the Basic Multilingual Plane.
  const UnicodeText regex_pattern =
      UTF8ToUnicodeText("[0-9]+\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib.CreateRegexPattern(regex_pattern);
  // Fail this test cleanly instead of dereferencing a null pattern below.
  ASSERT_TRUE(pattern != nullptr);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  // The whole input is digits followed by the smiley, so it matches fully.
  matcher = pattern->Matcher(
      UTF8ToUnicodeText("0123\xF0\x9F\x98\x8B", /*do_copy=*/false));
  EXPECT_TRUE(matcher->Matches(&status));
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Input is: hello, two smileys, space, 0123, smiley, space, world. The
  // pattern only occurs in the middle, so a whole-input match must fail.
  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_FALSE(matcher->Matches(&status));
  EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Searching finds the occurrence. Offsets are in code points: "hello" (5)
  // + two smileys (2) + space (1) puts the match start at 8, and the four
  // digits plus the trailing smiley put its end at 13.
  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
128 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU
129
130 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Verifies the code point offsets and contents reported for named capturing
// groups when the text contains 4-byte UTF-8 characters.
TEST(UniLibTest, RegexGroups) {
  CREATE_UNILIB_FOR_TESTING;

  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly. It is written below as its UTF-8 byte sequence
  // "\xF0\x9F\x98\x8B" so that the literals survive tools that drop
  // characters outside the Basic Multilingual Plane.
  const UnicodeText regex_pattern = UTF8ToUnicodeText(
      "(?<group1>[0-9])(?<group2>[0-9]+)\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib.CreateRegexPattern(regex_pattern);
  // Fail this test cleanly instead of dereferencing a null pattern below.
  ASSERT_TRUE(pattern != nullptr);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  // Input is: hello, two smileys, space, 0123, smiley, space, world.
  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Start offsets, in code points: the whole match and group1 begin at the
  // first digit (8), group2 at the second digit (9).
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(1, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(2, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // End offsets: the whole match includes the trailing smiley (13), group1
  // is a single digit (9), group2 stops right before the smiley (12).
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(1, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(2, &status), 12);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Group contents.
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
167 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU
168
169 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
170
// Collects every boundary the break iterator reports for a short ASCII
// phrase and compares the sequence against the expected offsets.
TEST(UniLibTest, BreakIterator) {
  CREATE_UNILIB_FOR_TESTING;
  const UnicodeText phrase = UTF8ToUnicodeText("some text", /*do_copy=*/false);
  const std::unique_ptr<UniLib::BreakIterator> breaker =
      unilib.CreateBreakIterator(phrase);

  // Drain the iterator until it signals exhaustion with kDone.
  std::vector<int> boundaries;
  for (int position = breaker->Next();
       position != UniLib::BreakIterator::kDone;
       position = breaker->Next()) {
    boundaries.push_back(position);
  }

  EXPECT_THAT(boundaries, ElementsAre(4, 5, 9));
}
183 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU
184
185 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Verifies that break positions are reported in code points, not bytes, when
// every character of the text is a 4-byte UTF-8 sequence.
TEST(UniLibTest, BreakIterator4ByteUTF8) {
  CREATE_UNILIB_FOR_TESTING;
  // Three emoji, U+1F600, U+1F602 and U+1F60B, written as their UTF-8 byte
  // sequences so that the literal survives tools that drop characters outside
  // the Basic Multilingual Plane. The text is 12 bytes but 3 code points.
  const UnicodeText text = UTF8ToUnicodeText(
      "\xF0\x9F\x98\x80\xF0\x9F\x98\x82\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::BreakIterator> iterator =
      unilib.CreateBreakIterator(text);
  // Fail this test cleanly instead of dereferencing a null iterator below.
  ASSERT_TRUE(iterator != nullptr);
  std::vector<int> break_indices;
  int break_index = 0;
  while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    break_indices.push_back(break_index);
  }
  // One break after each emoji, counted in code points.
  EXPECT_THAT(break_indices, ElementsAre(1, 2, 3));
}
198 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU
199
200 #ifndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
// Parses a plain ASCII decimal string and checks both the success flag and
// the resulting value.
TEST(UniLibTest, IntegerParse) {
  CREATE_UNILIB_FOR_TESTING;
  const UnicodeText digits = UTF8ToUnicodeText("123", /*do_copy=*/false);
  int parsed;
  const bool parse_succeeded = unilib.ParseInt32(digits, &parsed);
  EXPECT_TRUE(parse_succeeded);
  EXPECT_EQ(parsed, 123);
}
208 #endif // ndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
209
210 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Verifies that integers written with full-width digits are parsed.
TEST(UniLibTest, IntegerParseFullWidth) {
  CREATE_UNILIB_FOR_TESTING;
  // Initialised so the value check below never reads an indeterminate value
  // if parsing fails without writing the output.
  int result = 0;
  // The input string here is full width: U+FF11 U+FF12 U+FF13 (FULLWIDTH
  // DIGIT ONE, TWO, THREE), written as UTF-8 byte sequences so that the
  // literal cannot be silently folded to ASCII "123".
  EXPECT_TRUE(unilib.ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  EXPECT_EQ(result, 123);
}
219 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU
220
221 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Verifies that parsing is rejected when a letter is embedded between
// full-width digits.
TEST(UniLibTest, IntegerParseFullWidthWithAlpha) {
  CREATE_UNILIB_FOR_TESTING;
  int result = 0;
  // The input string here is full width: U+FF11 (FULLWIDTH DIGIT ONE), an
  // ASCII 'a', then U+FF13 (FULLWIDTH DIGIT THREE). The literal is split into
  // adjacent pieces because a hex escape directly followed by 'a' would
  // absorb the 'a' as another hex digit.
  EXPECT_FALSE(unilib.ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91" "a" "\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
}
229 #endif // LIBTEXTCLASSIFIER_UNILIB_ICU
230
231 } // namespace
232 } // namespace libtextclassifier2
233