1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/character-iterator.h"
16
17 #include <string_view>
18
19 #include "icing/util/i18n-utils.h"
20 #include "unicode/utypes.h"
21
22 namespace icing {
23 namespace lib {
24
25 namespace {
26
27 // Returns the lead byte of the UTF-8 character that includes the byte at
28 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)29 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
30 while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
31 --current_byte_index;
32 }
33 return current_byte_index;
34 }
35
36 } // namespace
37
GetCurrentChar() const38 UChar32 CharacterIterator::GetCurrentChar() const {
39 if (utf8_index_ > text_.length() || utf8_index_ < 0) {
40 return i18n_utils::kInvalidUChar32;
41 }
42
43 if (utf8_index_ == text_.length()) {
44 // This is allowed and it means the iterator is at the end. Since
45 // std::string_view is not guaranteed to be null-terminated, we cannot read
46 // any bytes out of bound.
47 // Therefore, return 0 (null character) directly here.
48 return 0;
49 }
50
51 if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
52 // Our indices point to the right character, we just need to read that
53 // character. No need to worry about an error. If GetUChar32At fails, then
54 // current_char will be i18n_utils::kInvalidUChar32.
55 cached_current_char_ =
56 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
57 }
58 return cached_current_char_;
59 }
60
MoveToUtf8(int desired_utf8_index)61 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
62 return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
63 : RewindToUtf8(desired_utf8_index);
64 }
65
AdvanceToUtf8(int desired_utf8_index)66 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
67 // Check the boundary first to ensure we only handle desired_utf8_index in
68 // range [0, text_.length()].
69 //
70 // Note that desired_utf8_index == text_.length() is allowed.
71 if (desired_utf8_index > text_.length() || desired_utf8_index < 0) {
72 return false;
73 }
74
75 ResetToStartIfNecessary();
76
77 // Need to work forwards.
78 UChar32 uchar32 = cached_current_char_;
79 while (utf8_index_ < desired_utf8_index) {
80 // At this point, utf8_index_ is a valid index in range [0, text_length() -
81 // 1], so we can call GetUChar32At safely.
82 uchar32 =
83 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
84 if (uchar32 == i18n_utils::kInvalidUChar32) {
85 // Unable to retrieve a valid UTF-32 character at the previous position.
86 cached_current_char_ = i18n_utils::kInvalidUChar32;
87 return false;
88 }
89 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
90 if (utf8_index_ + utf8_length > desired_utf8_index) {
91 // Ah! Don't go too far!
92 break;
93 }
94 utf8_index_ += utf8_length;
95 utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
96 ++utf32_index_;
97 }
98
99 if (utf8_index_ == text_.length()) {
100 // This is allowed and it means the iterator is at the end. Since
101 // std::string_view is not guaranteed to be null-terminated, we cannot read
102 // any bytes out of bound.
103 // Therefore, return 0 (null character) directly here.
104 cached_current_char_ = 0;
105 } else {
106 // At this point, utf8_index_ is a valid index in range [0, text_length() -
107 // 1], so we can call GetUChar32At safely.
108 cached_current_char_ =
109 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
110 }
111 return true;
112 }
113
RewindToUtf8(int desired_utf8_index)114 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
115 if (desired_utf8_index < 0) {
116 // Enforce the requirement.
117 return false;
118 }
119 // Need to work backwards.
120 UChar32 uchar32 = cached_current_char_;
121 while (utf8_index_ > desired_utf8_index) {
122 int utf8_index = utf8_index_ - 1;
123 utf8_index = GetUTF8StartPosition(text_, utf8_index);
124 if (utf8_index < 0) {
125 // Somehow, there wasn't a single UTF-8 lead byte at
126 // requested_byte_index or an earlier byte.
127 cached_current_char_ = i18n_utils::kInvalidUChar32;
128 return false;
129 }
130 // We've found the start of a unicode char!
131 uchar32 =
132 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
133 int expected_length = utf8_index_ - utf8_index;
134 if (uchar32 == i18n_utils::kInvalidUChar32 ||
135 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
136 // Either unable to retrieve a valid UTF-32 character at the previous
137 // position or we skipped past an invalid sequence while seeking the
138 // previous start position.
139 cached_current_char_ = i18n_utils::kInvalidUChar32;
140 return false;
141 }
142 cached_current_char_ = uchar32;
143 utf8_index_ = utf8_index;
144 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
145 --utf32_index_;
146 }
147 return true;
148 }
149
MoveToUtf16(int desired_utf16_index)150 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
151 return (desired_utf16_index > utf16_index_)
152 ? AdvanceToUtf16(desired_utf16_index)
153 : RewindToUtf16(desired_utf16_index);
154 }
155
AdvanceToUtf16(int desired_utf16_index)156 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
157 ResetToStartIfNecessary();
158
159 UChar32 uchar32 = cached_current_char_;
160 while (utf16_index_ < desired_utf16_index) {
161 uchar32 =
162 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
163 if (uchar32 == i18n_utils::kInvalidUChar32) {
164 // Unable to retrieve a valid UTF-32 character at the previous position.
165 cached_current_char_ = i18n_utils::kInvalidUChar32;
166 return false;
167 }
168 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
169 if (utf16_index_ + utf16_length > desired_utf16_index) {
170 // Ah! Don't go too far!
171 break;
172 }
173 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
174 if (utf8_index_ + utf8_length > text_.length()) {
175 // Enforce the requirement.
176 cached_current_char_ = i18n_utils::kInvalidUChar32;
177 return false;
178 }
179 utf8_index_ += utf8_length;
180 utf16_index_ += utf16_length;
181 ++utf32_index_;
182 }
183 cached_current_char_ =
184 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
185 return true;
186 }
187
RewindToUtf16(int desired_utf16_index)188 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
189 if (desired_utf16_index < 0) {
190 return false;
191 }
192 UChar32 uchar32 = cached_current_char_;
193 while (utf16_index_ > desired_utf16_index) {
194 int utf8_index = utf8_index_ - 1;
195 utf8_index = GetUTF8StartPosition(text_, utf8_index);
196 if (utf8_index < 0) {
197 // Somehow, there wasn't a single UTF-8 lead byte at
198 // requested_byte_index or an earlier byte.
199 cached_current_char_ = i18n_utils::kInvalidUChar32;
200 return false;
201 }
202 // We've found the start of a unicode char!
203 uchar32 =
204 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
205 int expected_length = utf8_index_ - utf8_index;
206 if (uchar32 == i18n_utils::kInvalidUChar32 ||
207 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
208 // Either unable to retrieve a valid UTF-32 character at the previous
209 // position or we skipped past an invalid sequence while seeking the
210 // previous start position.
211 cached_current_char_ = i18n_utils::kInvalidUChar32;
212 return false;
213 }
214 cached_current_char_ = uchar32;
215 utf8_index_ = utf8_index;
216 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
217 --utf32_index_;
218 }
219 return true;
220 }
221
MoveToUtf32(int desired_utf32_index)222 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
223 return (desired_utf32_index > utf32_index_)
224 ? AdvanceToUtf32(desired_utf32_index)
225 : RewindToUtf32(desired_utf32_index);
226 }
227
AdvanceToUtf32(int desired_utf32_index)228 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
229 ResetToStartIfNecessary();
230
231 UChar32 uchar32 = cached_current_char_;
232 while (utf32_index_ < desired_utf32_index) {
233 uchar32 =
234 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
235 if (uchar32 == i18n_utils::kInvalidUChar32) {
236 // Unable to retrieve a valid UTF-32 character at the previous position.
237 cached_current_char_ = i18n_utils::kInvalidUChar32;
238 return false;
239 }
240 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
241 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
242 if (utf8_index_ + utf8_length > text_.length()) {
243 // Enforce the requirement.
244 cached_current_char_ = i18n_utils::kInvalidUChar32;
245 return false;
246 }
247 utf8_index_ += utf8_length;
248 utf16_index_ += utf16_length;
249 ++utf32_index_;
250 }
251 cached_current_char_ =
252 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
253 return true;
254 }
255
RewindToUtf32(int desired_utf32_index)256 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
257 if (desired_utf32_index < 0) {
258 return false;
259 }
260 UChar32 uchar32 = cached_current_char_;
261 while (utf32_index_ > desired_utf32_index) {
262 int utf8_index = utf8_index_ - 1;
263 utf8_index = GetUTF8StartPosition(text_, utf8_index);
264 if (utf8_index < 0) {
265 // Somehow, there wasn't a single UTF-8 lead byte at
266 // requested_byte_index or an earlier byte.
267 cached_current_char_ = i18n_utils::kInvalidUChar32;
268 return false;
269 }
270 // We've found the start of a unicode char!
271 uchar32 =
272 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
273 int expected_length = utf8_index_ - utf8_index;
274 if (uchar32 == i18n_utils::kInvalidUChar32 ||
275 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
276 // Either unable to retrieve a valid UTF-32 character at the previous
277 // position or we skipped past an invalid sequence while seeking the
278 // previous start position.
279 cached_current_char_ = i18n_utils::kInvalidUChar32;
280 return false;
281 }
282 cached_current_char_ = uchar32;
283 utf8_index_ = utf8_index;
284 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
285 --utf32_index_;
286 }
287 return true;
288 }
289
ResetToStartIfNecessary()290 void CharacterIterator::ResetToStartIfNecessary() {
291 if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
292 utf8_index_ = 0;
293 utf16_index_ = 0;
294 utf32_index_ = 0;
295 if (!text_.empty()) {
296 cached_current_char_ =
297 i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
298 } else {
299 cached_current_char_ = i18n_utils::kInvalidUChar32;
300 }
301 }
302 }
303
304 } // namespace lib
305 } // namespace icing
306