1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8/unicodetext.h"
18
19 #include "utils/strings/stringpiece.h"
20 #include "gtest/gtest.h"
21
22 namespace libtextclassifier3 {
23 namespace {
24
25 class UnicodeTextTest : public testing::Test {
26 protected:
UnicodeTextTest()27 UnicodeTextTest() : empty_text_() {
28 text_.push_back(0x1C0);
29 text_.push_back(0x4E8C);
30 text_.push_back(0xD7DB);
31 text_.push_back(0x34);
32 text_.push_back(0x1D11E);
33 }
34
35 UnicodeText empty_text_;
36 UnicodeText text_;
37 };
38
// Constructing a UnicodeText from another UnicodeText, with the default
// arguments or with do_copy=false, must yield the same UTF-8 contents.
TEST(UnicodeTextTest, ConstructionFromUnicodeText) {
  UnicodeText source = UTF8ToUnicodeText("1234hello", /*do_copy=*/false);

  UnicodeText default_constructed(source);
  EXPECT_EQ(default_constructed.ToUTF8String(), "1234hello");

  UnicodeText constructed_without_copy(source, /*do_copy=*/false);
  EXPECT_EQ(constructed_without_copy.ToUTF8String(), "1234hello");
}
44
45 // Tests for our modifications of UnicodeText.
// Checks size_codepoints(), size_bytes() and UTF8Substring() on text that
// mixes ASCII with one 4-byte UTF-8 character.
TEST(UnicodeTextTest, Custom) {
  // "1234" + U+1F60B (UTF-8: f0 9f 98 8b) + "hello" is 10 codepoints stored in
  // 4 + 4 + 5 = 13 bytes. The 4-byte character is required: a plain
  // "1234hello" is only 9 codepoints / 9 bytes and cannot satisfy the size
  // expectations below. The literal is split so the hex escape cannot absorb
  // the characters that follow it.
  UnicodeText text = UTF8ToUnicodeText(
      "1234" "\xf0\x9f\x98\x8b" "hello", /*do_copy=*/false);
  EXPECT_EQ(text.ToUTF8String(), "1234" "\xf0\x9f\x98\x8b" "hello");
  EXPECT_EQ(text.size_codepoints(), 10);
  EXPECT_EQ(text.size_bytes(), 13);

  // Codepoint range [4, 6) is the 4-byte character followed by 'h'.
  auto it_begin = text.begin();
  std::advance(it_begin, 4);
  auto it_end = text.begin();
  std::advance(it_end, 6);
  EXPECT_EQ(text.UTF8Substring(it_begin, it_end), "\xf0\x9f\x98\x8b" "h");
}
58
// Same checks as the Custom test, but the UnicodeText is created from a
// StringPiece over a std::string, without copying.
TEST(UnicodeTextTest, StringPieceView) {
  // "1234" + U+1F60B (UTF-8: f0 9f 98 8b) + "hello": 10 codepoints in
  // 4 + 4 + 5 = 13 bytes. Without the 4-byte character the string would be
  // 9 codepoints / 9 bytes and the size expectations below could not hold.
  // raw_text must outlive text because do_copy is false.
  std::string raw_text = "1234" "\xf0\x9f\x98\x8b" "hello";
  UnicodeText text =
      UTF8ToUnicodeText(StringPiece(raw_text), /*do_copy=*/false);
  EXPECT_EQ(text.ToUTF8String(), "1234" "\xf0\x9f\x98\x8b" "hello");
  EXPECT_EQ(text.size_codepoints(), 10);
  EXPECT_EQ(text.size_bytes(), 13);

  // Codepoint range [4, 6) is the 4-byte character followed by 'h'.
  auto it_begin = text.begin();
  std::advance(it_begin, 4);
  auto it_end = text.begin();
  std::advance(it_end, 6);
  EXPECT_EQ(text.UTF8Substring(it_begin, it_end), "\xf0\x9f\x98\x8b" "h");
}
73
// Exercises the iterator-based and the index-based UnicodeText::Substring()
// overloads, each with and without copying.
TEST(UnicodeTextTest, Substring) {
  // "1234" + U+1F60B (UTF-8: f0 9f 98 8b) + "hello". Codepoint range [4, 6)
  // is therefore the 4-byte character followed by 'h'. With a plain
  // "1234hello" that range would be "he", so the 4-byte character is
  // required for the expectations to be meaningful.
  UnicodeText text = UTF8ToUnicodeText(
      "1234" "\xf0\x9f\x98\x8b" "hello", /*do_copy=*/false);

  // Iterator-based overload.
  EXPECT_EQ(
      UnicodeText::Substring(std::next(text.begin(), 4),
                             std::next(text.begin(), 6), /*do_copy=*/true),
      UTF8ToUnicodeText("\xf0\x9f\x98\x8b" "h"));
  EXPECT_EQ(
      UnicodeText::Substring(std::next(text.begin(), 4),
                             std::next(text.begin(), 6), /*do_copy=*/false),
      UTF8ToUnicodeText("\xf0\x9f\x98\x8b" "h"));

  // Index-based overload over the same codepoint range.
  EXPECT_EQ(UnicodeText::Substring(text, 4, 6, /*do_copy=*/true),
            UTF8ToUnicodeText("\xf0\x9f\x98\x8b" "h"));
  EXPECT_EQ(UnicodeText::Substring(text, 4, 6, /*do_copy=*/false),
            UTF8ToUnicodeText("\xf0\x9f\x98\x8b" "h"));
}
90
// PointToUTF8() must expose the caller's buffer directly, while copy
// initialization from such a text must end up with different storage.
TEST(UnicodeTextTest, Ownership) {
  const std::string utf8 = "\u304A\u00B0\u106B";

  UnicodeText view;
  view.PointToUTF8(utf8.data(), utf8.size());
  // Same address: the text points at utf8's bytes.
  EXPECT_EQ(view.data(), utf8.data());

  // Iterating the view yields the three codepoints and then reaches end().
  const int kExpectedCodepoints[] = {0x304A, 0x00B0, 0x106B};
  UnicodeText::const_iterator cursor = view.begin();
  for (const int expected : kExpectedCodepoints) {
    EXPECT_EQ(*cursor++, expected);
  }
  EXPECT_EQ(cursor, view.end());

  UnicodeText duplicate = view;  // Copy initialization copies the data.
  EXPECT_NE(duplicate.data(), view.data());
}
106
// is_valid() must accept well-formed UTF-8 and reject truncated or malformed
// multi-byte sequences.
TEST(UnicodeTextTest, Validation) {
  // Builds a UnicodeText over the given bytes (no copy) and reports its
  // validity.
  const auto is_valid_utf8 = [](const char* utf8) {
    return UTF8ToUnicodeText(utf8, /*do_copy=*/false).is_valid();
  };

  // Well-formed inputs.
  EXPECT_TRUE(is_valid_utf8("1234hello"));
  EXPECT_TRUE(is_valid_utf8("\u304A\u00B0\u106B"));
  EXPECT_TRUE(is_valid_utf8("this is a test"));
  EXPECT_TRUE(is_valid_utf8("\xf0\x9f\x98\x8b"));

  // Too short (string is too short).
  EXPECT_FALSE(is_valid_utf8("\xf0\x9f"));
  // Too long (too many trailing bytes).
  EXPECT_FALSE(is_valid_utf8("\xf0\x9f\x98\x8b\x8b"));
  // Too short (too few trailing bytes).
  EXPECT_FALSE(is_valid_utf8("\xf0\x9f\x98\x61\x61"));
  // Invalid with context.
  EXPECT_FALSE(is_valid_utf8("hello \xf0\x9f\x98\x61\x61 world1"));
}
128
129 class IteratorTest : public UnicodeTextTest {};
130
// Walks text_ forward using both pre- and post-increment and checks every
// codepoint on the way to end().
TEST_F(IteratorTest, Iterates) {
  UnicodeText::const_iterator cursor = text_.begin();
  EXPECT_EQ(0x1C0, *cursor);
  // Pre-increment must hand back the very same iterator object.
  EXPECT_EQ(&cursor, &++cursor);
  EXPECT_EQ(0x4E8C, *cursor++);
  EXPECT_EQ(0xD7DB, *cursor);
  // Dereferencing a second time must give the same codepoint.
  EXPECT_EQ(0xD7DB, *cursor);
  EXPECT_EQ(0x34, *++cursor);
  EXPECT_EQ(0x1D11E, *++cursor);
  // The last codepoint is not end(); one more step is.
  ASSERT_TRUE(cursor != text_.end());
  cursor++;
  EXPECT_TRUE(cursor == text_.end());
}
145
// A copy of an iterator must advance independently of the iterator it was
// copied from.
TEST_F(IteratorTest, MultiPass) {
  // Default construction followed by assignment is deliberate: it also covers
  // Default Constructible and Assignable.
  UnicodeText::const_iterator leader, follower;
  leader = text_.begin();
  follower = leader;

  // Moving the leader must not move the follower.
  EXPECT_EQ(0x4E8C, *++leader);
  EXPECT_TRUE(leader != follower);
  EXPECT_EQ(0x1C0, *follower);

  // After catching up, both compare equal and see the same codepoint.
  ++follower;
  EXPECT_TRUE(leader == follower);
  EXPECT_EQ(0x4E8C, *follower);
}
158
// Walks text_ backwards from end() to begin() using both pre- and
// post-decrement.
TEST_F(IteratorTest, ReverseIterates) {
  UnicodeText::const_iterator cursor = text_.end();
  EXPECT_TRUE(cursor == text_.end());
  cursor--;
  ASSERT_TRUE(cursor != text_.end());
  EXPECT_EQ(0x1D11E, *cursor--);
  EXPECT_EQ(0x34, *cursor);
  EXPECT_EQ(0xD7DB, *--cursor);
  // Dereferencing a second time must give the same codepoint.
  EXPECT_EQ(0xD7DB, *cursor);
  --cursor;
  EXPECT_EQ(0x4E8C, *cursor--);
  EXPECT_EQ(0x1C0, *cursor);
  // Having read the first codepoint, the iterator is back at begin().
  EXPECT_TRUE(cursor == text_.begin());
}
174
// Relational operators on iterators must follow position within the text.
TEST_F(IteratorTest, Comparable) {
  UnicodeText::const_iterator first, second;
  first = text_.begin();
  second = first;
  ++second;  // second is now one codepoint past first.

  EXPECT_TRUE(first < second);
  EXPECT_TRUE(text_.begin() <= first);
  EXPECT_FALSE(first >= second);
  EXPECT_FALSE(first > text_.end());
}
186
// std::advance() must move the iterator by whole codepoints.
TEST_F(IteratorTest, Advance) {
  UnicodeText::const_iterator cursor = text_.begin();
  EXPECT_EQ(0x1C0, *cursor);
  // Four steps from the first codepoint lands on the fifth (last) one.
  std::advance(cursor, 4);
  EXPECT_EQ(0x1D11E, *cursor);
  ++cursor;
  EXPECT_TRUE(cursor == text_.end());
}
195
// std::distance() must count codepoints between two iterators.
TEST_F(IteratorTest, Distance) {
  UnicodeText::const_iterator cursor = text_.begin();
  // At the start: nothing behind, all five codepoints ahead.
  EXPECT_EQ(0, std::distance(text_.begin(), cursor));
  EXPECT_EQ(5, std::distance(cursor, text_.end()));

  // Two steps in.
  ++cursor;
  ++cursor;
  EXPECT_EQ(2, std::distance(text_.begin(), cursor));
  EXPECT_EQ(3, std::distance(cursor, text_.end()));

  // Four steps in, then one more to reach end().
  ++cursor;
  ++cursor;
  EXPECT_EQ(4, std::distance(text_.begin(), cursor));
  ++cursor;
  EXPECT_EQ(0, std::distance(cursor, text_.end()));
}
210
211 class OperatorTest : public UnicodeTextTest {};
212
// After clear(), a populated text must compare equal to a text built from the
// empty string.
TEST_F(OperatorTest, Clear) {
  UnicodeText from_empty_string(UTF8ToUnicodeText("", /*do_copy=*/false));
  // text_ starts out populated by the fixture, so it differs at first.
  EXPECT_FALSE(text_ == from_empty_string);
  text_.clear();
  EXPECT_TRUE(text_ == from_empty_string);
}
219
// empty() must be true for a default-constructed text and for a text that has
// just been cleared, and false for a populated one.
TEST_F(OperatorTest, Empty) {
  // The fixture filled text_, so it starts non-empty.
  EXPECT_FALSE(text_.empty());
  EXPECT_TRUE(empty_text_.empty());

  text_.clear();
  EXPECT_TRUE(text_.empty());
}
226
227 } // namespace
228 } // namespace libtextclassifier3
229