• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/character-iterator.h"
16 
17 #include "icing/util/i18n-utils.h"
18 
19 namespace icing {
20 namespace lib {
21 
22 namespace {
23 
24 // Returns the lead byte of the UTF-8 character that includes the byte at
25 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)26 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
27   while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
28     --current_byte_index;
29   }
30   return current_byte_index;
31 }
32 
33 }  // namespace
34 
GetCurrentChar()35 UChar32 CharacterIterator::GetCurrentChar() {
36   if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
37     // Our indices point to the right character, we just need to read that
38     // character. No need to worry about an error. If GetUChar32At fails, then
39     // current_char will be i18n_utils::kInvalidUChar32.
40     cached_current_char_ =
41         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
42   }
43   return cached_current_char_;
44 }
45 
MoveToUtf8(int desired_utf8_index)46 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
47   return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
48                                             : RewindToUtf8(desired_utf8_index);
49 }
50 
AdvanceToUtf8(int desired_utf8_index)51 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
52   ResetToStartIfNecessary();
53 
54   if (desired_utf8_index > text_.length()) {
55     // Enforce the requirement.
56     return false;
57   }
58   // Need to work forwards.
59   UChar32 uchar32 = cached_current_char_;
60   while (utf8_index_ < desired_utf8_index) {
61     uchar32 =
62         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
63     if (uchar32 == i18n_utils::kInvalidUChar32) {
64       // Unable to retrieve a valid UTF-32 character at the previous position.
65       cached_current_char_ = i18n_utils::kInvalidUChar32;
66       return false;
67     }
68     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
69     if (utf8_index_ + utf8_length > desired_utf8_index) {
70       // Ah! Don't go too far!
71       break;
72     }
73     utf8_index_ += utf8_length;
74     utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
75     ++utf32_index_;
76   }
77   cached_current_char_ =
78       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
79   return true;
80 }
81 
RewindToUtf8(int desired_utf8_index)82 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
83   if (desired_utf8_index < 0) {
84     // Enforce the requirement.
85     return false;
86   }
87   // Need to work backwards.
88   UChar32 uchar32 = cached_current_char_;
89   while (utf8_index_ > desired_utf8_index) {
90     int utf8_index = utf8_index_ - 1;
91     utf8_index = GetUTF8StartPosition(text_, utf8_index);
92     if (utf8_index < 0) {
93       // Somehow, there wasn't a single UTF-8 lead byte at
94       // requested_byte_index or an earlier byte.
95       cached_current_char_ = i18n_utils::kInvalidUChar32;
96       return false;
97     }
98     // We've found the start of a unicode char!
99     uchar32 =
100         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
101     int expected_length = utf8_index_ - utf8_index;
102     if (uchar32 == i18n_utils::kInvalidUChar32 ||
103         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
104       // Either unable to retrieve a valid UTF-32 character at the previous
105       // position or we skipped past an invalid sequence while seeking the
106       // previous start position.
107       cached_current_char_ = i18n_utils::kInvalidUChar32;
108       return false;
109     }
110     cached_current_char_ = uchar32;
111     utf8_index_ = utf8_index;
112     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
113     --utf32_index_;
114   }
115   return true;
116 }
117 
MoveToUtf16(int desired_utf16_index)118 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
119   return (desired_utf16_index > utf16_index_)
120              ? AdvanceToUtf16(desired_utf16_index)
121              : RewindToUtf16(desired_utf16_index);
122 }
123 
AdvanceToUtf16(int desired_utf16_index)124 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
125   ResetToStartIfNecessary();
126 
127   UChar32 uchar32 = cached_current_char_;
128   while (utf16_index_ < desired_utf16_index) {
129     uchar32 =
130         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
131     if (uchar32 == i18n_utils::kInvalidUChar32) {
132       // Unable to retrieve a valid UTF-32 character at the previous position.
133       cached_current_char_ = i18n_utils::kInvalidUChar32;
134       return false;
135     }
136     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
137     if (utf16_index_ + utf16_length > desired_utf16_index) {
138       // Ah! Don't go too far!
139       break;
140     }
141     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
142     if (utf8_index_ + utf8_length > text_.length()) {
143       // Enforce the requirement.
144       cached_current_char_ = i18n_utils::kInvalidUChar32;
145       return false;
146     }
147     utf8_index_ += utf8_length;
148     utf16_index_ += utf16_length;
149     ++utf32_index_;
150   }
151   cached_current_char_ =
152       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
153   return true;
154 }
155 
RewindToUtf16(int desired_utf16_index)156 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
157   if (desired_utf16_index < 0) {
158     return false;
159   }
160   UChar32 uchar32 = cached_current_char_;
161   while (utf16_index_ > desired_utf16_index) {
162     int utf8_index = utf8_index_ - 1;
163     utf8_index = GetUTF8StartPosition(text_, utf8_index);
164     if (utf8_index < 0) {
165       // Somehow, there wasn't a single UTF-8 lead byte at
166       // requested_byte_index or an earlier byte.
167       cached_current_char_ = i18n_utils::kInvalidUChar32;
168       return false;
169     }
170     // We've found the start of a unicode char!
171     uchar32 =
172         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
173     int expected_length = utf8_index_ - utf8_index;
174     if (uchar32 == i18n_utils::kInvalidUChar32 ||
175         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
176       // Either unable to retrieve a valid UTF-32 character at the previous
177       // position or we skipped past an invalid sequence while seeking the
178       // previous start position.
179       cached_current_char_ = i18n_utils::kInvalidUChar32;
180       return false;
181     }
182     cached_current_char_ = uchar32;
183     utf8_index_ = utf8_index;
184     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
185     --utf32_index_;
186   }
187   return true;
188 }
189 
MoveToUtf32(int desired_utf32_index)190 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
191   return (desired_utf32_index > utf32_index_)
192              ? AdvanceToUtf32(desired_utf32_index)
193              : RewindToUtf32(desired_utf32_index);
194 }
195 
AdvanceToUtf32(int desired_utf32_index)196 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
197   ResetToStartIfNecessary();
198 
199   UChar32 uchar32 = cached_current_char_;
200   while (utf32_index_ < desired_utf32_index) {
201     uchar32 =
202         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
203     if (uchar32 == i18n_utils::kInvalidUChar32) {
204       // Unable to retrieve a valid UTF-32 character at the previous position.
205       cached_current_char_ = i18n_utils::kInvalidUChar32;
206       return false;
207     }
208     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
209     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
210     if (utf8_index_ + utf8_length > text_.length()) {
211       // Enforce the requirement.
212       cached_current_char_ = i18n_utils::kInvalidUChar32;
213       return false;
214     }
215     utf8_index_ += utf8_length;
216     utf16_index_ += utf16_length;
217     ++utf32_index_;
218   }
219   cached_current_char_ =
220       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
221   return true;
222 }
223 
RewindToUtf32(int desired_utf32_index)224 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
225   if (desired_utf32_index < 0) {
226     return false;
227   }
228   UChar32 uchar32 = cached_current_char_;
229   while (utf32_index_ > desired_utf32_index) {
230     int utf8_index = utf8_index_ - 1;
231     utf8_index = GetUTF8StartPosition(text_, utf8_index);
232     if (utf8_index < 0) {
233       // Somehow, there wasn't a single UTF-8 lead byte at
234       // requested_byte_index or an earlier byte.
235       cached_current_char_ = i18n_utils::kInvalidUChar32;
236       return false;
237     }
238     // We've found the start of a unicode char!
239     uchar32 =
240         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
241     int expected_length = utf8_index_ - utf8_index;
242     if (uchar32 == i18n_utils::kInvalidUChar32 ||
243         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
244       // Either unable to retrieve a valid UTF-32 character at the previous
245       // position or we skipped past an invalid sequence while seeking the
246       // previous start position.
247       cached_current_char_ = i18n_utils::kInvalidUChar32;
248       return false;
249     }
250     cached_current_char_ = uchar32;
251     utf8_index_ = utf8_index;
252     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
253     --utf32_index_;
254   }
255   return true;
256 }
257 
ResetToStartIfNecessary()258 void CharacterIterator::ResetToStartIfNecessary() {
259   if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
260     utf8_index_ = 0;
261     utf16_index_ = 0;
262     utf32_index_ = 0;
263     cached_current_char_ =
264         i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
265   }
266 }
267 
268 }  // namespace lib
269 }  // namespace icing
270