• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "util/utf8/unilib.h"
18 
19 #include "util/base/logging.h"
20 #include "util/utf8/unicodetext.h"
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23 
24 namespace libtextclassifier2 {
25 namespace {
26 
27 using ::testing::ElementsAre;
28 
// Checks the character-class predicates, lower-casing and bracket pairing of
// UniLib on plain ASCII input.
TEST(UniLibTest, CharacterClassesAscii) {
  CREATE_UNILIB_FOR_TESTING;

  // ')' doubles as the negative example for every non-bracket predicate.
  const char close_paren = ')';

  // Brackets and their counterparts.
  EXPECT_TRUE(unilib.IsOpeningBracket('('));
  EXPECT_TRUE(unilib.IsClosingBracket(close_paren));
  EXPECT_EQ(unilib.GetPairedBracket(close_paren), '(');
  EXPECT_EQ(unilib.GetPairedBracket('}'), '{');

  // Whitespace.
  EXPECT_TRUE(unilib.IsWhitespace(' '));
  EXPECT_FALSE(unilib.IsWhitespace(close_paren));

  // Digits: both ends of the ASCII digit range.
  EXPECT_TRUE(unilib.IsDigit('0'));
  EXPECT_TRUE(unilib.IsDigit('9'));
  EXPECT_FALSE(unilib.IsDigit(close_paren));

  // Upper case: both ends of the ASCII upper-case range.
  EXPECT_TRUE(unilib.IsUpper('A'));
  EXPECT_TRUE(unilib.IsUpper('Z'));
  EXPECT_FALSE(unilib.IsUpper(close_paren));

  // Lower-casing maps letters and leaves a non-letter untouched.
  EXPECT_EQ(unilib.ToLower('A'), 'a');
  EXPECT_EQ(unilib.ToLower('Z'), 'z');
  EXPECT_EQ(unilib.ToLower(close_paren), close_paren);
}
47 
48 #ifndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
// Checks the character-class predicates, lower-casing and bracket pairing of
// UniLib on code points outside of ASCII.
TEST(UniLibTest, CharacterClassesUnicode) {
  CREATE_UNILIB_FOR_TESTING;

  // Code points that are referenced by more than one expectation below.
  constexpr int kTibetanOpeningBracket = 0x0F3C;  // TIBET ANG KHANG GYON
  constexpr int kTibetanClosingBracket = 0x0F3D;  // TIBET ANG KHANG GYAS
  constexpr int kGreekCapitalAlpha = 0x0391;      // GREEK CAPITAL ALPHA
  constexpr int kGreekCapitalUpsilonDialytika =
      0x03AB;  // GREEK CAPITAL UPSILON W DIAL

  // Brackets and their counterparts (pairing works in both directions).
  EXPECT_TRUE(unilib.IsOpeningBracket(kTibetanOpeningBracket));
  EXPECT_TRUE(unilib.IsClosingBracket(kTibetanClosingBracket));
  EXPECT_EQ(unilib.GetPairedBracket(kTibetanOpeningBracket),
            kTibetanClosingBracket);
  EXPECT_EQ(unilib.GetPairedBracket(kTibetanClosingBracket),
            kTibetanOpeningBracket);

  // Whitespace.
  EXPECT_FALSE(unilib.IsWhitespace(0x23F0));  // ALARM CLOCK
  EXPECT_TRUE(unilib.IsWhitespace(0x2003));   // EM SPACE

  // Digits: the Vai digit range and its immediate neighbours.
  EXPECT_FALSE(unilib.IsDigit(0xA619));  // VAI SYMBOL JONG
  EXPECT_TRUE(unilib.IsDigit(0xA620));   // VAI DIGIT ZERO
  EXPECT_TRUE(unilib.IsDigit(0xA629));   // VAI DIGIT NINE
  EXPECT_FALSE(unilib.IsDigit(0xA62A));  // VAI SYLLABLE NDOLE MA

  // Upper case.
  EXPECT_FALSE(unilib.IsUpper(0x0211));  // SMALL R WITH DOUBLE GRAVE
  EXPECT_TRUE(unilib.IsUpper(0x0212));   // CAPITAL R WITH DOUBLE GRAVE
  EXPECT_TRUE(unilib.IsUpper(kGreekCapitalAlpha));
  EXPECT_TRUE(unilib.IsUpper(kGreekCapitalUpsilonDialytika));
  EXPECT_FALSE(unilib.IsUpper(0x03AC));  // GREEK SMALL ALPHA WITH TONOS

  // Lower-casing maps capitals and leaves a small letter untouched.
  EXPECT_EQ(unilib.ToLower(kGreekCapitalAlpha), 0x03B1);  // GREEK ALPHA
  EXPECT_EQ(unilib.ToLower(kGreekCapitalUpsilonDialytika),
            0x03CB);                          // GREEK UPSILON WITH DIALYTIKA
  EXPECT_EQ(unilib.ToLower(0x03C0), 0x03C0);  // GREEK SMALL PI
}
71 #endif  // ndef LIBTEXTCLASSIFIER_UNILIB_DUMMY
72 
// Smoke test of the regex API: every RegexMatcher entry point is called once
// and its result is only logged, not asserted.
TEST(UniLibTest, RegexInterface) {
  CREATE_UNILIB_FOR_TESTING;

  const UnicodeText digits_regex =
      UTF8ToUnicodeText("[0-9]+", /*do_copy=*/true);
  const UnicodeText haystack =
      UTF8ToUnicodeText("hello 0123", /*do_copy=*/false);

  std::unique_ptr<UniLib::RegexPattern> compiled =
      unilib.CreateRegexPattern(digits_regex);
  std::unique_ptr<UniLib::RegexMatcher> matcher = compiled->Matcher(haystack);

  // Receives the status reported by each call; its value is never inspected.
  int status;
  TC_LOG(INFO) << matcher->Matches(&status);
  TC_LOG(INFO) << matcher->Find(&status);
  TC_LOG(INFO) << matcher->Start(0, &status);
  TC_LOG(INFO) << matcher->End(0, &status);
  TC_LOG(INFO) << matcher->Group(0, &status).size_codepoints();
}
88 
89 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Exercises Matches / ApproximatelyMatches / Find / Start / End / Group on
// text that contains a 4-byte UTF-8 character.
TEST(UniLibTest, Regex) {
  CREATE_UNILIB_FOR_TESTING;

  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly.
  //
  // The smiley is spelled as explicit UTF-8 bytes (F0 9F 98 8B) so that the
  // literals cannot be silently mangled by an editor or a re-encoding step.
  const UnicodeText regex_pattern =
      UTF8ToUnicodeText("[0-9]+\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib.CreateRegexPattern(regex_pattern);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  // The whole input is exactly one match.
  matcher = pattern->Matcher(
      UTF8ToUnicodeText("0123\xF0\x9F\x98\x8B", /*do_copy=*/false));
  EXPECT_TRUE(matcher->Matches(&status));
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_TRUE(matcher->Matches(&status));  // Check that the state is reset.
  EXPECT_TRUE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // The match is surrounded by other text, so a full match must fail.
  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_FALSE(matcher->Matches(&status));
  EXPECT_FALSE(matcher->ApproximatelyMatches(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Searching finds the embedded match. The expected offsets count code
  // points: "hello" (5) + two smileys (2) + one space (1) = 8 for the start,
  // plus four digits and one smiley = 13 for the end.
  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
128 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
129 
130 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Exercises Start / End / Group for the whole match and for two named capture
// groups on text that contains a 4-byte UTF-8 character.
TEST(UniLibTest, RegexGroups) {
  CREATE_UNILIB_FOR_TESTING;

  // The smiley face is a 4-byte UTF8 codepoint 0x1F60B, and it's important to
  // test the regex functionality with it to verify we are handling the indices
  // correctly.
  //
  // The smiley is spelled as explicit UTF-8 bytes (F0 9F 98 8B) so that the
  // literals cannot be silently mangled by an editor or a re-encoding step.
  const UnicodeText regex_pattern = UTF8ToUnicodeText(
      "(?<group1>[0-9])(?<group2>[0-9]+)\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::RegexPattern> pattern =
      unilib.CreateRegexPattern(regex_pattern);
  int status;
  std::unique_ptr<UniLib::RegexMatcher> matcher;

  matcher = pattern->Matcher(UTF8ToUnicodeText(
      "hello\xF0\x9F\x98\x8B\xF0\x9F\x98\x8B 0123\xF0\x9F\x98\x8B world",
      /*do_copy=*/false));
  EXPECT_TRUE(matcher->Find(&status));
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Start offsets. The expected values count code points: "hello" (5) + two
  // smileys (2) + one space (1) puts the first digit at offset 8.
  EXPECT_EQ(matcher->Start(0, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(1, &status), 8);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Start(2, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // End offsets. Group 0 also covers the trailing smiley, group 2 stops
  // right before it.
  EXPECT_EQ(matcher->End(0, &status), 13);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(1, &status), 9);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->End(2, &status), 12);
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);

  // Matched text of the whole match and of each group.
  EXPECT_EQ(matcher->Group(0, &status).ToUTF8String(),
            "0123\xF0\x9F\x98\x8B");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(1, &status).ToUTF8String(), "0");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
  EXPECT_EQ(matcher->Group(2, &status).ToUTF8String(), "123");
  EXPECT_EQ(status, UniLib::RegexMatcher::kNoError);
}
167 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
168 
169 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
170 
// Collects every index reported by the break iterator for a short ASCII
// phrase and compares the sequence with the expected one.
TEST(UniLibTest, BreakIterator) {
  CREATE_UNILIB_FOR_TESTING;
  const UnicodeText phrase = UTF8ToUnicodeText("some text", /*do_copy=*/false);
  std::unique_ptr<UniLib::BreakIterator> breaker =
      unilib.CreateBreakIterator(phrase);

  // Keep pulling indices until the iterator reports kDone.
  std::vector<int> found_breaks;
  for (int next_break = breaker->Next();
       next_break != UniLib::BreakIterator::kDone;
       next_break = breaker->Next()) {
    found_breaks.push_back(next_break);
  }

  EXPECT_THAT(found_breaks, ElementsAre(4, 5, 9));
}
183 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
184 
185 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Checks the indices reported by the break iterator for text that consists
// only of 4-byte UTF-8 characters.
TEST(UniLibTest, BreakIterator4ByteUTF8) {
  CREATE_UNILIB_FOR_TESTING;
  // Three 4-byte UTF-8 code points, spelled as explicit bytes so that the
  // literal cannot be silently mangled by an editor or a re-encoding step:
  // U+1F600 (F0 9F 98 80), U+1F602 (F0 9F 98 82), U+1F60B (F0 9F 98 8B).
  // NOTE(review): the characters originally used here were lost to
  // mis-encoding; these are stand-ins that are each a single code point of
  // four bytes - confirm against the upstream test if the exact characters
  // matter.
  const UnicodeText text = UTF8ToUnicodeText(
      "\xF0\x9F\x98\x80\xF0\x9F\x98\x82\xF0\x9F\x98\x8B", /*do_copy=*/false);
  std::unique_ptr<UniLib::BreakIterator> iterator =
      unilib.CreateBreakIterator(text);
  std::vector<int> break_indices;
  int break_index = 0;
  while ((break_index = iterator->Next()) != UniLib::BreakIterator::kDone) {
    break_indices.push_back(break_index);
  }
  // One break is expected after each of the three characters; the expected
  // values 1, 2, 3 are code point offsets rather than byte offsets.
  EXPECT_THAT(break_indices, ElementsAre(1, 2, 3));
}
198 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
199 
200 #ifndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
// Parses a plain ASCII decimal number and checks the resulting value.
TEST(UniLibTest, IntegerParse) {
  CREATE_UNILIB_FOR_TESTING;
  const UnicodeText digits = UTF8ToUnicodeText("123", /*do_copy=*/false);

  // Output parameter of ParseInt32.
  int parsed_value;
  EXPECT_TRUE(unilib.ParseInt32(digits, &parsed_value));
  EXPECT_EQ(parsed_value, 123);
}
208 #endif  // ndef LIBTEXTCLASSIFIER_UNILIB_JAVAICU
209 
210 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Parses a number written with full-width digits and checks that it yields
// the same value as its ASCII counterpart.
TEST(UniLibTest, IntegerParseFullWidth) {
  CREATE_UNILIB_FOR_TESTING;
  int result;
  // The input string here is full width: FULLWIDTH DIGIT ONE, TWO and THREE
  // (U+FF11, U+FF12, U+FF13). The digits are spelled as explicit UTF-8 bytes
  // so that they cannot be silently normalized to ASCII "123", which would
  // turn this test into a duplicate of the plain ASCII parsing test.
  EXPECT_TRUE(unilib.ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
  EXPECT_EQ(result, 123);
}
219 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
220 
221 #ifdef LIBTEXTCLASSIFIER_UNILIB_ICU
// Checks that parsing is rejected when a letter is mixed into a number
// written with full-width characters.
TEST(UniLibTest, IntegerParseFullWidthWithAlpha) {
  CREATE_UNILIB_FOR_TESTING;
  int result;
  // The input string here is full width: FULLWIDTH DIGIT ONE (U+FF11),
  // FULLWIDTH LATIN SMALL LETTER A (U+FF41), FULLWIDTH DIGIT THREE (U+FF13).
  // The characters are spelled as explicit UTF-8 bytes so that they cannot be
  // silently normalized to ASCII "1a3".
  EXPECT_FALSE(unilib.ParseInt32(
      UTF8ToUnicodeText("\xEF\xBC\x91\xEF\xBD\x81\xEF\xBC\x93",
                        /*do_copy=*/false),
      &result));
}
229 #endif  // LIBTEXTCLASSIFIER_UNILIB_ICU
230 
231 }  // namespace
232 }  // namespace libtextclassifier2
233