• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/character-iterator.h"
16 
17 namespace icing {
18 namespace lib {
19 
20 namespace {
21 
22 // Returns the lead byte of the UTF-8 character that includes the byte at
23 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)24 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
25   while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
26     --current_byte_index;
27   }
28   return current_byte_index;
29 }
30 
31 }  // namespace
32 
MoveToUtf8(int desired_utf8_index)33 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
34   return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
35                                             : RewindToUtf8(desired_utf8_index);
36 }
37 
AdvanceToUtf8(int desired_utf8_index)38 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
39   if (desired_utf8_index > text_.length()) {
40     // Enforce the requirement.
41     return false;
42   }
43   // Need to work forwards.
44   while (utf8_index_ < desired_utf8_index) {
45     UChar32 uchar32 =
46         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
47     if (uchar32 == i18n_utils::kInvalidUChar32) {
48       // Unable to retrieve a valid UTF-32 character at the previous position.
49       return false;
50     }
51     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
52     if (utf8_index_ + utf8_length > desired_utf8_index) {
53       // Ah! Don't go too far!
54       break;
55     }
56     utf8_index_ += utf8_length;
57     utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
58     ++utf32_index_;
59   }
60   return true;
61 }
62 
RewindToUtf8(int desired_utf8_index)63 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
64   if (desired_utf8_index < 0) {
65     // Enforce the requirement.
66     return false;
67   }
68   // Need to work backwards.
69   while (utf8_index_ > desired_utf8_index) {
70     --utf8_index_;
71     utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
72     if (utf8_index_ < 0) {
73       // Somehow, there wasn't a single UTF-8 lead byte at
74       // requested_byte_index or an earlier byte.
75       return false;
76     }
77     // We've found the start of a unicode char!
78     UChar32 uchar32 =
79         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
80     if (uchar32 == i18n_utils::kInvalidUChar32) {
81       // Unable to retrieve a valid UTF-32 character at the previous position.
82       return false;
83     }
84     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
85     --utf32_index_;
86   }
87   return true;
88 }
89 
MoveToUtf16(int desired_utf16_index)90 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
91   return (desired_utf16_index > utf16_index_)
92              ? AdvanceToUtf16(desired_utf16_index)
93              : RewindToUtf16(desired_utf16_index);
94 }
95 
AdvanceToUtf16(int desired_utf16_index)96 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
97   while (utf16_index_ < desired_utf16_index) {
98     UChar32 uchar32 =
99         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
100     if (uchar32 == i18n_utils::kInvalidUChar32) {
101       // Unable to retrieve a valid UTF-32 character at the previous position.
102       return false;
103     }
104     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
105     if (utf16_index_ + utf16_length > desired_utf16_index) {
106       // Ah! Don't go too far!
107       break;
108     }
109     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
110     if (utf8_index_ + utf8_length > text_.length()) {
111       // Enforce the requirement.
112       return false;
113     }
114     utf8_index_ += utf8_length;
115     utf16_index_ += utf16_length;
116     ++utf32_index_;
117   }
118   return true;
119 }
120 
RewindToUtf16(int desired_utf16_index)121 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
122   if (desired_utf16_index < 0) {
123     return false;
124   }
125   while (utf16_index_ > desired_utf16_index) {
126     --utf8_index_;
127     utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
128     if (utf8_index_ < 0) {
129       // Somehow, there wasn't a single UTF-8 lead byte at
130       // requested_byte_index or an earlier byte.
131       return false;
132     }
133     // We've found the start of a unicode char!
134     UChar32 uchar32 =
135         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
136     if (uchar32 == i18n_utils::kInvalidUChar32) {
137       // Unable to retrieve a valid UTF-32 character at the previous position.
138       return false;
139     }
140     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
141     --utf32_index_;
142   }
143   return true;
144 }
145 
MoveToUtf32(int desired_utf32_index)146 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
147   return (desired_utf32_index > utf32_index_)
148              ? AdvanceToUtf32(desired_utf32_index)
149              : RewindToUtf32(desired_utf32_index);
150 }
151 
AdvanceToUtf32(int desired_utf32_index)152 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
153   while (utf32_index_ < desired_utf32_index) {
154     UChar32 uchar32 =
155         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
156     if (uchar32 == i18n_utils::kInvalidUChar32) {
157       // Unable to retrieve a valid UTF-32 character at the previous position.
158       return false;
159     }
160     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
161     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
162     if (utf8_index_ + utf8_length > text_.length()) {
163       // Enforce the requirement.
164       return false;
165     }
166     utf8_index_ += utf8_length;
167     utf16_index_ += utf16_length;
168     ++utf32_index_;
169   }
170   return true;
171 }
172 
RewindToUtf32(int desired_utf32_index)173 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
174   if (desired_utf32_index < 0) {
175     return false;
176   }
177   while (utf32_index_ > desired_utf32_index) {
178     --utf8_index_;
179     utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
180     if (utf8_index_ < 0) {
181       // Somehow, there wasn't a single UTF-8 lead byte at
182       // requested_byte_index or an earlier byte.
183       return false;
184     }
185     // We've found the start of a unicode char!
186     UChar32 uchar32 =
187         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
188     if (uchar32 == i18n_utils::kInvalidUChar32) {
189       // Unable to retrieve a valid UTF-32 character at the previous position.
190       return false;
191     }
192     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
193     --utf32_index_;
194   }
195   return true;
196 }
197 
198 }  // namespace lib
199 }  // namespace icing
200