1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/character-iterator.h"
16
17 #include "icing/util/i18n-utils.h"
18
19 namespace icing {
20 namespace lib {
21
22 namespace {
23
24 // Returns the lead byte of the UTF-8 character that includes the byte at
25 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)26 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
27 while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
28 --current_byte_index;
29 }
30 return current_byte_index;
31 }
32
33 } // namespace
34
GetCurrentChar()35 UChar32 CharacterIterator::GetCurrentChar() {
36 if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
37 // Our indices point to the right character, we just need to read that
38 // character. No need to worry about an error. If GetUChar32At fails, then
39 // current_char will be i18n_utils::kInvalidUChar32.
40 cached_current_char_ =
41 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
42 }
43 return cached_current_char_;
44 }
45
MoveToUtf8(int desired_utf8_index)46 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
47 return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
48 : RewindToUtf8(desired_utf8_index);
49 }
50
AdvanceToUtf8(int desired_utf8_index)51 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
52 ResetToStartIfNecessary();
53
54 if (desired_utf8_index > text_.length()) {
55 // Enforce the requirement.
56 return false;
57 }
58 // Need to work forwards.
59 UChar32 uchar32 = cached_current_char_;
60 while (utf8_index_ < desired_utf8_index) {
61 uchar32 =
62 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
63 if (uchar32 == i18n_utils::kInvalidUChar32) {
64 // Unable to retrieve a valid UTF-32 character at the previous position.
65 cached_current_char_ = i18n_utils::kInvalidUChar32;
66 return false;
67 }
68 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
69 if (utf8_index_ + utf8_length > desired_utf8_index) {
70 // Ah! Don't go too far!
71 break;
72 }
73 utf8_index_ += utf8_length;
74 utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
75 ++utf32_index_;
76 }
77 cached_current_char_ =
78 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
79 return true;
80 }
81
RewindToUtf8(int desired_utf8_index)82 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
83 if (desired_utf8_index < 0) {
84 // Enforce the requirement.
85 return false;
86 }
87 // Need to work backwards.
88 UChar32 uchar32 = cached_current_char_;
89 while (utf8_index_ > desired_utf8_index) {
90 int utf8_index = utf8_index_ - 1;
91 utf8_index = GetUTF8StartPosition(text_, utf8_index);
92 if (utf8_index < 0) {
93 // Somehow, there wasn't a single UTF-8 lead byte at
94 // requested_byte_index or an earlier byte.
95 cached_current_char_ = i18n_utils::kInvalidUChar32;
96 return false;
97 }
98 // We've found the start of a unicode char!
99 uchar32 =
100 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
101 int expected_length = utf8_index_ - utf8_index;
102 if (uchar32 == i18n_utils::kInvalidUChar32 ||
103 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
104 // Either unable to retrieve a valid UTF-32 character at the previous
105 // position or we skipped past an invalid sequence while seeking the
106 // previous start position.
107 cached_current_char_ = i18n_utils::kInvalidUChar32;
108 return false;
109 }
110 cached_current_char_ = uchar32;
111 utf8_index_ = utf8_index;
112 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
113 --utf32_index_;
114 }
115 return true;
116 }
117
MoveToUtf16(int desired_utf16_index)118 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
119 return (desired_utf16_index > utf16_index_)
120 ? AdvanceToUtf16(desired_utf16_index)
121 : RewindToUtf16(desired_utf16_index);
122 }
123
AdvanceToUtf16(int desired_utf16_index)124 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
125 ResetToStartIfNecessary();
126
127 UChar32 uchar32 = cached_current_char_;
128 while (utf16_index_ < desired_utf16_index) {
129 uchar32 =
130 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
131 if (uchar32 == i18n_utils::kInvalidUChar32) {
132 // Unable to retrieve a valid UTF-32 character at the previous position.
133 cached_current_char_ = i18n_utils::kInvalidUChar32;
134 return false;
135 }
136 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
137 if (utf16_index_ + utf16_length > desired_utf16_index) {
138 // Ah! Don't go too far!
139 break;
140 }
141 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
142 if (utf8_index_ + utf8_length > text_.length()) {
143 // Enforce the requirement.
144 cached_current_char_ = i18n_utils::kInvalidUChar32;
145 return false;
146 }
147 utf8_index_ += utf8_length;
148 utf16_index_ += utf16_length;
149 ++utf32_index_;
150 }
151 cached_current_char_ =
152 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
153 return true;
154 }
155
RewindToUtf16(int desired_utf16_index)156 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
157 if (desired_utf16_index < 0) {
158 return false;
159 }
160 UChar32 uchar32 = cached_current_char_;
161 while (utf16_index_ > desired_utf16_index) {
162 int utf8_index = utf8_index_ - 1;
163 utf8_index = GetUTF8StartPosition(text_, utf8_index);
164 if (utf8_index < 0) {
165 // Somehow, there wasn't a single UTF-8 lead byte at
166 // requested_byte_index or an earlier byte.
167 cached_current_char_ = i18n_utils::kInvalidUChar32;
168 return false;
169 }
170 // We've found the start of a unicode char!
171 uchar32 =
172 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
173 int expected_length = utf8_index_ - utf8_index;
174 if (uchar32 == i18n_utils::kInvalidUChar32 ||
175 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
176 // Either unable to retrieve a valid UTF-32 character at the previous
177 // position or we skipped past an invalid sequence while seeking the
178 // previous start position.
179 cached_current_char_ = i18n_utils::kInvalidUChar32;
180 return false;
181 }
182 cached_current_char_ = uchar32;
183 utf8_index_ = utf8_index;
184 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
185 --utf32_index_;
186 }
187 return true;
188 }
189
MoveToUtf32(int desired_utf32_index)190 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
191 return (desired_utf32_index > utf32_index_)
192 ? AdvanceToUtf32(desired_utf32_index)
193 : RewindToUtf32(desired_utf32_index);
194 }
195
AdvanceToUtf32(int desired_utf32_index)196 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
197 ResetToStartIfNecessary();
198
199 UChar32 uchar32 = cached_current_char_;
200 while (utf32_index_ < desired_utf32_index) {
201 uchar32 =
202 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
203 if (uchar32 == i18n_utils::kInvalidUChar32) {
204 // Unable to retrieve a valid UTF-32 character at the previous position.
205 cached_current_char_ = i18n_utils::kInvalidUChar32;
206 return false;
207 }
208 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
209 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
210 if (utf8_index_ + utf8_length > text_.length()) {
211 // Enforce the requirement.
212 cached_current_char_ = i18n_utils::kInvalidUChar32;
213 return false;
214 }
215 utf8_index_ += utf8_length;
216 utf16_index_ += utf16_length;
217 ++utf32_index_;
218 }
219 cached_current_char_ =
220 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
221 return true;
222 }
223
RewindToUtf32(int desired_utf32_index)224 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
225 if (desired_utf32_index < 0) {
226 return false;
227 }
228 UChar32 uchar32 = cached_current_char_;
229 while (utf32_index_ > desired_utf32_index) {
230 int utf8_index = utf8_index_ - 1;
231 utf8_index = GetUTF8StartPosition(text_, utf8_index);
232 if (utf8_index < 0) {
233 // Somehow, there wasn't a single UTF-8 lead byte at
234 // requested_byte_index or an earlier byte.
235 cached_current_char_ = i18n_utils::kInvalidUChar32;
236 return false;
237 }
238 // We've found the start of a unicode char!
239 uchar32 =
240 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
241 int expected_length = utf8_index_ - utf8_index;
242 if (uchar32 == i18n_utils::kInvalidUChar32 ||
243 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
244 // Either unable to retrieve a valid UTF-32 character at the previous
245 // position or we skipped past an invalid sequence while seeking the
246 // previous start position.
247 cached_current_char_ = i18n_utils::kInvalidUChar32;
248 return false;
249 }
250 cached_current_char_ = uchar32;
251 utf8_index_ = utf8_index;
252 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
253 --utf32_index_;
254 }
255 return true;
256 }
257
ResetToStartIfNecessary()258 void CharacterIterator::ResetToStartIfNecessary() {
259 if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
260 utf8_index_ = 0;
261 utf16_index_ = 0;
262 utf32_index_ = 0;
263 cached_current_char_ =
264 i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
265 }
266 }
267
268 } // namespace lib
269 } // namespace icing
270