1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8/unicodetext.h"
18
19 #include <string.h>
20
21 #include <algorithm>
22
23 #include "utils/base/logging.h"
24 #include "utils/strings/utf8.h"
25
26 namespace libtextclassifier3 {
27
28 // *************** Data representation **********
29 // Note: the copy constructor is undefined.
30
operator =(Repr && src)31 UnicodeText::Repr& UnicodeText::Repr::operator=(Repr&& src) {
32 if (ours_ && data_) delete[] data_;
33 data_ = src.data_;
34 size_ = src.size_;
35 capacity_ = src.capacity_;
36 ours_ = src.ours_;
37 src.ours_ = false;
38 return *this;
39 }
40
PointTo(const char * data,int size)41 void UnicodeText::Repr::PointTo(const char* data, int size) {
42 if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
43 data_ = const_cast<char*>(data);
44 size_ = size;
45 capacity_ = size;
46 ours_ = false;
47 }
48
Copy(const char * data,int size)49 void UnicodeText::Repr::Copy(const char* data, int size) {
50 resize(size);
51 memcpy(data_, data, size);
52 }
53
resize(int new_size)54 void UnicodeText::Repr::resize(int new_size) {
55 if (new_size == 0) {
56 clear();
57 } else {
58 if (!ours_ || new_size > capacity_) reserve(new_size);
59 // Clear the memory in the expanded part.
60 if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
61 size_ = new_size;
62 ours_ = true;
63 }
64 }
65
reserve(int new_capacity)66 void UnicodeText::Repr::reserve(int new_capacity) {
67 // If there's already enough capacity, and we're an owner, do nothing.
68 if (capacity_ >= new_capacity && ours_) return;
69
70 // Otherwise, allocate a new buffer.
71 capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
72 char* new_data = new char[capacity_];
73
74 // If there is an old buffer, copy it into the new buffer.
75 if (data_) {
76 memcpy(new_data, data_, size_);
77 if (ours_) delete[] data_; // If we owned the old buffer, free it.
78 }
79 data_ = new_data;
80 ours_ = true; // We own the new buffer.
81 // size_ is unchanged.
82 }
83
append(const char * bytes,int byte_length)84 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
85 reserve(size_ + byte_length);
86 memcpy(data_ + size_, bytes, byte_length);
87 size_ += byte_length;
88 }
89
clear()90 void UnicodeText::Repr::clear() {
91 if (ours_) delete[] data_;
92 data_ = nullptr;
93 size_ = capacity_ = 0;
94 ours_ = true;
95 }
96
97 // *************** UnicodeText ******************
98
UnicodeText()99 UnicodeText::UnicodeText() {}
100
// Constructs a text from |src|. With |do_copy| set the bytes are copied;
// otherwise the new text merely points at the buffer of |src|.
UnicodeText::UnicodeText(const UnicodeText& src, bool do_copy) {
  if (!do_copy) {
    // Alias the source bytes without taking ownership.
    repr_.PointTo(src.repr_.data_, src.repr_.size_);
    return;
  }
  Copy(src);
}
108
operator =(UnicodeText && src)109 UnicodeText& UnicodeText::operator=(UnicodeText&& src) {
110 this->repr_ = std::move(src.repr_);
111 return *this;
112 }
113
// Replaces the contents of this text with a copy of the bytes of |src|.
// Returns a reference to this text.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const Repr& source = src.repr_;
  repr_.Copy(source.data_, source.size_);
  return *this;
}
118
PointToUTF8(const char * buffer,int byte_length)119 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
120 repr_.PointTo(buffer, byte_length);
121 return *this;
122 }
123
CopyUTF8(const char * buffer,int byte_length)124 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
125 repr_.Copy(buffer, byte_length);
126 return *this;
127 }
128
AppendUTF8(const char * utf8,int len)129 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
130 repr_.append(utf8, len);
131 return *this;
132 }
133
data() const134 const char* UnicodeText::data() const { return repr_.data_; }
135
size_bytes() const136 int UnicodeText::size_bytes() const { return repr_.size_; }
137
138 namespace {
139
140 enum {
141 RuneError = 0xFFFD, // Decoding error in UTF.
142 RuneMax = 0x10FFFF, // Maximum rune value.
143 };
144
runetochar(const char32 rune,char * dest)145 int runetochar(const char32 rune, char* dest) {
146 // Convert to unsigned for range check.
147 uint32 c;
148
149 // 1 char 00-7F
150 c = rune;
151 if (c <= 0x7F) {
152 dest[0] = static_cast<char>(c);
153 return 1;
154 }
155
156 // 2 char 0080-07FF
157 if (c <= 0x07FF) {
158 dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
159 dest[1] = 0x80 | (c & 0x3F);
160 return 2;
161 }
162
163 // Range check
164 if (c > RuneMax) {
165 c = RuneError;
166 }
167
168 // 3 char 0800-FFFF
169 if (c <= 0xFFFF) {
170 dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
171 dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
172 dest[2] = 0x80 | (c & 0x3F);
173 return 3;
174 }
175
176 // 4 char 10000-1FFFFF
177 dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
178 dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
179 dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
180 dest[3] = 0x80 | (c & 0x3F);
181 return 4;
182 }
183
184 } // namespace
185
push_back(char32 ch)186 UnicodeText& UnicodeText::push_back(char32 ch) {
187 char str[4];
188 int char_len = runetochar(ch, str);
189 repr_.append(str, char_len);
190 return *this;
191 }
192
clear()193 void UnicodeText::clear() { repr_.clear(); }
194
size_codepoints() const195 int UnicodeText::size_codepoints() const {
196 return std::distance(begin(), end());
197 }
198
empty() const199 bool UnicodeText::empty() const { return size_bytes() == 0; }
200
is_valid() const201 bool UnicodeText::is_valid() const {
202 return IsValidUTF8(repr_.data_, repr_.size_);
203 }
204
operator ==(const UnicodeText & other) const205 bool UnicodeText::operator==(const UnicodeText& other) const {
206 if (repr_.size_ != other.repr_.size_) {
207 return false;
208 }
209 return memcmp(repr_.data_, other.repr_.data_, repr_.size_) == 0;
210 }
211
ToUTF8String() const212 std::string UnicodeText::ToUTF8String() const {
213 return UTF8Substring(begin(), end());
214 }
215
UTF8Substring(int begin_codepoint,int end_codepoint) const216 std::string UnicodeText::UTF8Substring(int begin_codepoint,
217 int end_codepoint) const {
218 auto span_begin = begin();
219 std::advance(span_begin, begin_codepoint);
220 auto span_end = span_begin;
221 std::advance(span_end, end_codepoint - begin_codepoint);
222 return UTF8Substring(span_begin, span_end);
223 }
224
UTF8Substring(const const_iterator & it_begin,const const_iterator & it_end)225 std::string UnicodeText::UTF8Substring(const const_iterator& it_begin,
226 const const_iterator& it_end) {
227 return std::string(it_begin.it_, it_end.it_ - it_begin.it_);
228 }
229
// Returns the part of |text| covering the codepoints in the half-open range
// [begin_codepoint, end_codepoint). With |do_copy| the bytes are copied;
// otherwise the result points into the buffer of |text|.
// No bounds checking is performed here.
UnicodeText UnicodeText::Substring(const UnicodeText& text, int begin_codepoint,
                                   int end_codepoint, bool do_copy) {
  auto it_begin = text.begin();
  std::advance(it_begin, begin_codepoint);

  // Continue from the start of the span rather than re-walking the text from
  // its beginning, as UTF8Substring(int, int) does.
  auto it_end = it_begin;
  std::advance(it_end, end_codepoint - begin_codepoint);

  return Substring(it_begin, it_end, do_copy);
}
239
Substring(const const_iterator & it_begin,const const_iterator & it_end,bool do_copy)240 UnicodeText UnicodeText::Substring(const const_iterator& it_begin,
241 const const_iterator& it_end, bool do_copy) {
242 if (do_copy) {
243 UnicodeText result;
244 result.repr_.Copy(it_begin.it_, it_end.it_ - it_begin.it_);
245 return result;
246 } else {
247 UnicodeText result;
248 result.repr_.PointTo(it_begin.it_, it_end.it_ - it_begin.it_);
249 return result;
250 }
251 }
252
~UnicodeText()253 UnicodeText::~UnicodeText() {}
254
255 // ******************* UnicodeText::const_iterator *********************
256
257 // The implementation of const_iterator would be nicer if it
258 // inherited from boost::iterator_facade
259 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
260
const_iterator()261 UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
262
operator =(const const_iterator & other)263 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=(
264 const const_iterator& other) {
265 if (&other != this) it_ = other.it_;
266 return *this;
267 }
268
begin() const269 UnicodeText::const_iterator UnicodeText::begin() const {
270 return const_iterator(repr_.data_);
271 }
272
end() const273 UnicodeText::const_iterator UnicodeText::end() const {
274 return const_iterator(repr_.data_ + repr_.size_);
275 }
276
operator <(const UnicodeText::const_iterator & lhs,const UnicodeText::const_iterator & rhs)277 bool operator<(const UnicodeText::const_iterator& lhs,
278 const UnicodeText::const_iterator& rhs) {
279 return lhs.it_ < rhs.it_;
280 }
281
operator *() const282 char32 UnicodeText::const_iterator::operator*() const {
283 // (We could call chartorune here, but that does some
284 // error-checking, and we're guaranteed that our data is valid
285 // UTF-8. Also, we expect this routine to be called very often. So
286 // for speed, we do the calculation ourselves.)
287 return ValidCharToRune(it_);
288 }
289
operator ++()290 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
291 it_ += GetNumBytesForUTF8Char(it_);
292 return *this;
293 }
294
operator --()295 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
296 while (IsTrailByte(*--it_)) {
297 }
298 return *this;
299 }
300
UTF8ToUnicodeText(const char * utf8_buf,int len,bool do_copy)301 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
302 UnicodeText t;
303 if (do_copy) {
304 t.CopyUTF8(utf8_buf, len);
305 } else {
306 t.PointToUTF8(utf8_buf, len);
307 }
308 return t;
309 }
310
UTF8ToUnicodeText(const char * utf8_buf,bool do_copy)311 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy) {
312 return UTF8ToUnicodeText(utf8_buf, strlen(utf8_buf), do_copy);
313 }
314
UTF8ToUnicodeText(const std::string & str,bool do_copy)315 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
316 return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
317 }
318
UTF8ToUnicodeText(StringPiece str,bool do_copy)319 UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy) {
320 return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
321 }
322
323 } // namespace libtextclassifier3
324