1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/character-iterator.h"
16
17 namespace icing {
18 namespace lib {
19
20 namespace {
21
22 // Returns the lead byte of the UTF-8 character that includes the byte at
23 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)24 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
25 while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
26 --current_byte_index;
27 }
28 return current_byte_index;
29 }
30
31 } // namespace
32
MoveToUtf8(int desired_utf8_index)33 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
34 return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
35 : RewindToUtf8(desired_utf8_index);
36 }
37
AdvanceToUtf8(int desired_utf8_index)38 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
39 if (desired_utf8_index > text_.length()) {
40 // Enforce the requirement.
41 return false;
42 }
43 // Need to work forwards.
44 while (utf8_index_ < desired_utf8_index) {
45 UChar32 uchar32 =
46 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
47 if (uchar32 == i18n_utils::kInvalidUChar32) {
48 // Unable to retrieve a valid UTF-32 character at the previous position.
49 return false;
50 }
51 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
52 if (utf8_index_ + utf8_length > desired_utf8_index) {
53 // Ah! Don't go too far!
54 break;
55 }
56 utf8_index_ += utf8_length;
57 utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
58 ++utf32_index_;
59 }
60 return true;
61 }
62
RewindToUtf8(int desired_utf8_index)63 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
64 if (desired_utf8_index < 0) {
65 // Enforce the requirement.
66 return false;
67 }
68 // Need to work backwards.
69 while (utf8_index_ > desired_utf8_index) {
70 --utf8_index_;
71 utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
72 if (utf8_index_ < 0) {
73 // Somehow, there wasn't a single UTF-8 lead byte at
74 // requested_byte_index or an earlier byte.
75 return false;
76 }
77 // We've found the start of a unicode char!
78 UChar32 uchar32 =
79 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
80 if (uchar32 == i18n_utils::kInvalidUChar32) {
81 // Unable to retrieve a valid UTF-32 character at the previous position.
82 return false;
83 }
84 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
85 --utf32_index_;
86 }
87 return true;
88 }
89
MoveToUtf16(int desired_utf16_index)90 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
91 return (desired_utf16_index > utf16_index_)
92 ? AdvanceToUtf16(desired_utf16_index)
93 : RewindToUtf16(desired_utf16_index);
94 }
95
AdvanceToUtf16(int desired_utf16_index)96 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
97 while (utf16_index_ < desired_utf16_index) {
98 UChar32 uchar32 =
99 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
100 if (uchar32 == i18n_utils::kInvalidUChar32) {
101 // Unable to retrieve a valid UTF-32 character at the previous position.
102 return false;
103 }
104 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
105 if (utf16_index_ + utf16_length > desired_utf16_index) {
106 // Ah! Don't go too far!
107 break;
108 }
109 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
110 if (utf8_index_ + utf8_length > text_.length()) {
111 // Enforce the requirement.
112 return false;
113 }
114 utf8_index_ += utf8_length;
115 utf16_index_ += utf16_length;
116 ++utf32_index_;
117 }
118 return true;
119 }
120
RewindToUtf16(int desired_utf16_index)121 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
122 if (desired_utf16_index < 0) {
123 return false;
124 }
125 while (utf16_index_ > desired_utf16_index) {
126 --utf8_index_;
127 utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
128 if (utf8_index_ < 0) {
129 // Somehow, there wasn't a single UTF-8 lead byte at
130 // requested_byte_index or an earlier byte.
131 return false;
132 }
133 // We've found the start of a unicode char!
134 UChar32 uchar32 =
135 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
136 if (uchar32 == i18n_utils::kInvalidUChar32) {
137 // Unable to retrieve a valid UTF-32 character at the previous position.
138 return false;
139 }
140 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
141 --utf32_index_;
142 }
143 return true;
144 }
145
MoveToUtf32(int desired_utf32_index)146 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
147 return (desired_utf32_index > utf32_index_)
148 ? AdvanceToUtf32(desired_utf32_index)
149 : RewindToUtf32(desired_utf32_index);
150 }
151
AdvanceToUtf32(int desired_utf32_index)152 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
153 while (utf32_index_ < desired_utf32_index) {
154 UChar32 uchar32 =
155 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
156 if (uchar32 == i18n_utils::kInvalidUChar32) {
157 // Unable to retrieve a valid UTF-32 character at the previous position.
158 return false;
159 }
160 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
161 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
162 if (utf8_index_ + utf8_length > text_.length()) {
163 // Enforce the requirement.
164 return false;
165 }
166 utf8_index_ += utf8_length;
167 utf16_index_ += utf16_length;
168 ++utf32_index_;
169 }
170 return true;
171 }
172
RewindToUtf32(int desired_utf32_index)173 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
174 if (desired_utf32_index < 0) {
175 return false;
176 }
177 while (utf32_index_ > desired_utf32_index) {
178 --utf8_index_;
179 utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
180 if (utf8_index_ < 0) {
181 // Somehow, there wasn't a single UTF-8 lead byte at
182 // requested_byte_index or an earlier byte.
183 return false;
184 }
185 // We've found the start of a unicode char!
186 UChar32 uchar32 =
187 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
188 if (uchar32 == i18n_utils::kInvalidUChar32) {
189 // Unable to retrieve a valid UTF-32 character at the previous position.
190 return false;
191 }
192 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
193 --utf32_index_;
194 }
195 return true;
196 }
197
198 } // namespace lib
199 } // namespace icing
200