• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/character-iterator.h"
16 
17 #include <string_view>
18 
19 #include "icing/util/i18n-utils.h"
20 #include "unicode/utypes.h"
21 
22 namespace icing {
23 namespace lib {
24 
25 namespace {
26 
27 // Returns the lead byte of the UTF-8 character that includes the byte at
28 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)29 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
30   while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
31     --current_byte_index;
32   }
33   return current_byte_index;
34 }
35 
36 }  // namespace
37 
GetCurrentChar() const38 UChar32 CharacterIterator::GetCurrentChar() const {
39   if (utf8_index_ > text_.length() || utf8_index_ < 0) {
40     return i18n_utils::kInvalidUChar32;
41   }
42 
43   if (utf8_index_ == text_.length()) {
44     // This is allowed and it means the iterator is at the end. Since
45     // std::string_view is not guaranteed to be null-terminated, we cannot read
46     // any bytes out of bound.
47     // Therefore, return 0 (null character) directly here.
48     return 0;
49   }
50 
51   if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
52     // Our indices point to the right character, we just need to read that
53     // character. No need to worry about an error. If GetUChar32At fails, then
54     // current_char will be i18n_utils::kInvalidUChar32.
55     cached_current_char_ =
56         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
57   }
58   return cached_current_char_;
59 }
60 
MoveToUtf8(int desired_utf8_index)61 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
62   return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
63                                             : RewindToUtf8(desired_utf8_index);
64 }
65 
AdvanceToUtf8(int desired_utf8_index)66 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
67   // Check the boundary first to ensure we only handle desired_utf8_index in
68   // range [0, text_.length()].
69   //
70   // Note that desired_utf8_index == text_.length() is allowed.
71   if (desired_utf8_index > text_.length() || desired_utf8_index < 0) {
72     return false;
73   }
74 
75   ResetToStartIfNecessary();
76 
77   // Need to work forwards.
78   UChar32 uchar32 = cached_current_char_;
79   while (utf8_index_ < desired_utf8_index) {
80     // At this point, utf8_index_ is a valid index in range [0, text_length() -
81     // 1], so we can call GetUChar32At safely.
82     uchar32 =
83         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
84     if (uchar32 == i18n_utils::kInvalidUChar32) {
85       // Unable to retrieve a valid UTF-32 character at the previous position.
86       cached_current_char_ = i18n_utils::kInvalidUChar32;
87       return false;
88     }
89     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
90     if (utf8_index_ + utf8_length > desired_utf8_index) {
91       // Ah! Don't go too far!
92       break;
93     }
94     utf8_index_ += utf8_length;
95     utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
96     ++utf32_index_;
97   }
98 
99   if (utf8_index_ == text_.length()) {
100     // This is allowed and it means the iterator is at the end. Since
101     // std::string_view is not guaranteed to be null-terminated, we cannot read
102     // any bytes out of bound.
103     // Therefore, return 0 (null character) directly here.
104     cached_current_char_ = 0;
105   } else {
106     // At this point, utf8_index_ is a valid index in range [0, text_length() -
107     // 1], so we can call GetUChar32At safely.
108     cached_current_char_ =
109         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
110   }
111   return true;
112 }
113 
RewindToUtf8(int desired_utf8_index)114 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
115   if (desired_utf8_index < 0) {
116     // Enforce the requirement.
117     return false;
118   }
119   // Need to work backwards.
120   UChar32 uchar32 = cached_current_char_;
121   while (utf8_index_ > desired_utf8_index) {
122     int utf8_index = utf8_index_ - 1;
123     utf8_index = GetUTF8StartPosition(text_, utf8_index);
124     if (utf8_index < 0) {
125       // Somehow, there wasn't a single UTF-8 lead byte at
126       // requested_byte_index or an earlier byte.
127       cached_current_char_ = i18n_utils::kInvalidUChar32;
128       return false;
129     }
130     // We've found the start of a unicode char!
131     uchar32 =
132         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
133     int expected_length = utf8_index_ - utf8_index;
134     if (uchar32 == i18n_utils::kInvalidUChar32 ||
135         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
136       // Either unable to retrieve a valid UTF-32 character at the previous
137       // position or we skipped past an invalid sequence while seeking the
138       // previous start position.
139       cached_current_char_ = i18n_utils::kInvalidUChar32;
140       return false;
141     }
142     cached_current_char_ = uchar32;
143     utf8_index_ = utf8_index;
144     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
145     --utf32_index_;
146   }
147   return true;
148 }
149 
MoveToUtf16(int desired_utf16_index)150 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
151   return (desired_utf16_index > utf16_index_)
152              ? AdvanceToUtf16(desired_utf16_index)
153              : RewindToUtf16(desired_utf16_index);
154 }
155 
AdvanceToUtf16(int desired_utf16_index)156 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
157   ResetToStartIfNecessary();
158 
159   UChar32 uchar32 = cached_current_char_;
160   while (utf16_index_ < desired_utf16_index) {
161     uchar32 =
162         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
163     if (uchar32 == i18n_utils::kInvalidUChar32) {
164       // Unable to retrieve a valid UTF-32 character at the previous position.
165       cached_current_char_ = i18n_utils::kInvalidUChar32;
166       return false;
167     }
168     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
169     if (utf16_index_ + utf16_length > desired_utf16_index) {
170       // Ah! Don't go too far!
171       break;
172     }
173     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
174     if (utf8_index_ + utf8_length > text_.length()) {
175       // Enforce the requirement.
176       cached_current_char_ = i18n_utils::kInvalidUChar32;
177       return false;
178     }
179     utf8_index_ += utf8_length;
180     utf16_index_ += utf16_length;
181     ++utf32_index_;
182   }
183   cached_current_char_ =
184       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
185   return true;
186 }
187 
RewindToUtf16(int desired_utf16_index)188 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
189   if (desired_utf16_index < 0) {
190     return false;
191   }
192   UChar32 uchar32 = cached_current_char_;
193   while (utf16_index_ > desired_utf16_index) {
194     int utf8_index = utf8_index_ - 1;
195     utf8_index = GetUTF8StartPosition(text_, utf8_index);
196     if (utf8_index < 0) {
197       // Somehow, there wasn't a single UTF-8 lead byte at
198       // requested_byte_index or an earlier byte.
199       cached_current_char_ = i18n_utils::kInvalidUChar32;
200       return false;
201     }
202     // We've found the start of a unicode char!
203     uchar32 =
204         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
205     int expected_length = utf8_index_ - utf8_index;
206     if (uchar32 == i18n_utils::kInvalidUChar32 ||
207         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
208       // Either unable to retrieve a valid UTF-32 character at the previous
209       // position or we skipped past an invalid sequence while seeking the
210       // previous start position.
211       cached_current_char_ = i18n_utils::kInvalidUChar32;
212       return false;
213     }
214     cached_current_char_ = uchar32;
215     utf8_index_ = utf8_index;
216     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
217     --utf32_index_;
218   }
219   return true;
220 }
221 
MoveToUtf32(int desired_utf32_index)222 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
223   return (desired_utf32_index > utf32_index_)
224              ? AdvanceToUtf32(desired_utf32_index)
225              : RewindToUtf32(desired_utf32_index);
226 }
227 
AdvanceToUtf32(int desired_utf32_index)228 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
229   ResetToStartIfNecessary();
230 
231   UChar32 uchar32 = cached_current_char_;
232   while (utf32_index_ < desired_utf32_index) {
233     uchar32 =
234         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
235     if (uchar32 == i18n_utils::kInvalidUChar32) {
236       // Unable to retrieve a valid UTF-32 character at the previous position.
237       cached_current_char_ = i18n_utils::kInvalidUChar32;
238       return false;
239     }
240     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
241     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
242     if (utf8_index_ + utf8_length > text_.length()) {
243       // Enforce the requirement.
244       cached_current_char_ = i18n_utils::kInvalidUChar32;
245       return false;
246     }
247     utf8_index_ += utf8_length;
248     utf16_index_ += utf16_length;
249     ++utf32_index_;
250   }
251   cached_current_char_ =
252       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
253   return true;
254 }
255 
RewindToUtf32(int desired_utf32_index)256 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
257   if (desired_utf32_index < 0) {
258     return false;
259   }
260   UChar32 uchar32 = cached_current_char_;
261   while (utf32_index_ > desired_utf32_index) {
262     int utf8_index = utf8_index_ - 1;
263     utf8_index = GetUTF8StartPosition(text_, utf8_index);
264     if (utf8_index < 0) {
265       // Somehow, there wasn't a single UTF-8 lead byte at
266       // requested_byte_index or an earlier byte.
267       cached_current_char_ = i18n_utils::kInvalidUChar32;
268       return false;
269     }
270     // We've found the start of a unicode char!
271     uchar32 =
272         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
273     int expected_length = utf8_index_ - utf8_index;
274     if (uchar32 == i18n_utils::kInvalidUChar32 ||
275         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
276       // Either unable to retrieve a valid UTF-32 character at the previous
277       // position or we skipped past an invalid sequence while seeking the
278       // previous start position.
279       cached_current_char_ = i18n_utils::kInvalidUChar32;
280       return false;
281     }
282     cached_current_char_ = uchar32;
283     utf8_index_ = utf8_index;
284     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
285     --utf32_index_;
286   }
287   return true;
288 }
289 
ResetToStartIfNecessary()290 void CharacterIterator::ResetToStartIfNecessary() {
291   if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
292     utf8_index_ = 0;
293     utf16_index_ = 0;
294     utf32_index_ = 0;
295     if (!text_.empty()) {
296       cached_current_char_ =
297           i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
298     } else {
299       cached_current_char_ = i18n_utils::kInvalidUChar32;
300     }
301   }
302 }
303 
304 }  // namespace lib
305 }  // namespace icing
306