• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "util/utf8/unicodetext.h"
18 
19 #include "base.h"
20 #include "util/strings/utf8.h"
21 
22 namespace libtextclassifier {
23 
24 // *************** Data representation **********
25 // Note: the copy constructor is undefined.
26 
PointTo(const char * data,int size)27 void UnicodeText::Repr::PointTo(const char* data, int size) {
28   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
29   data_ = const_cast<char*>(data);
30   size_ = size;
31   capacity_ = size;
32   ours_ = false;
33 }
34 
Copy(const char * data,int size)35 void UnicodeText::Repr::Copy(const char* data, int size) {
36   resize(size);
37   memcpy(data_, data, size);
38 }
39 
resize(int new_size)40 void UnicodeText::Repr::resize(int new_size) {
41   if (new_size == 0) {
42     clear();
43   } else {
44     if (!ours_ || new_size > capacity_) reserve(new_size);
45     // Clear the memory in the expanded part.
46     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
47     size_ = new_size;
48     ours_ = true;
49   }
50 }
51 
reserve(int new_capacity)52 void UnicodeText::Repr::reserve(int new_capacity) {
53   // If there's already enough capacity, and we're an owner, do nothing.
54   if (capacity_ >= new_capacity && ours_) return;
55 
56   // Otherwise, allocate a new buffer.
57   capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
58   char* new_data = new char[capacity_];
59 
60   // If there is an old buffer, copy it into the new buffer.
61   if (data_) {
62     memcpy(new_data, data_, size_);
63     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
64   }
65   data_ = new_data;
66   ours_ = true;  // We own the new buffer.
67   // size_ is unchanged.
68 }
69 
append(const char * bytes,int byte_length)70 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
71   reserve(size_ + byte_length);
72   memcpy(data_ + size_, bytes, byte_length);
73   size_ += byte_length;
74 }
75 
clear()76 void UnicodeText::Repr::clear() {
77   if (ours_) delete[] data_;
78   data_ = nullptr;
79   size_ = capacity_ = 0;
80   ours_ = true;
81 }
82 
83 // *************** UnicodeText ******************
84 
UnicodeText()85 UnicodeText::UnicodeText() {}
86 
// Copy constructor: duplicates the bytes held by `src` into this object's own
// representation (the same operation UnicodeText::Copy performs).
UnicodeText::UnicodeText(const UnicodeText& src) {
  repr_.Copy(src.repr_.data_, src.repr_.size_);
}
88 
// Replaces this text with a copy of the bytes held by `src`.
// Returns *this so calls can be chained.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const auto& source = src.repr_;
  repr_.Copy(source.data_, source.size_);
  return *this;
}
93 
PointToUTF8(const char * buffer,int byte_length)94 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
95   repr_.PointTo(buffer, byte_length);
96   return *this;
97 }
98 
CopyUTF8(const char * buffer,int byte_length)99 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
100   repr_.Copy(buffer, byte_length);
101   return *this;
102 }
103 
AppendUTF8(const char * utf8,int len)104 UnicodeText& UnicodeText::AppendUTF8(const char* utf8, int len) {
105   repr_.append(utf8, len);
106   return *this;
107 }
108 
clear()109 void UnicodeText::clear() { repr_.clear(); }
110 
UTF8Substring(const const_iterator & first,const const_iterator & last)111 std::string UnicodeText::UTF8Substring(const const_iterator& first,
112                                        const const_iterator& last) {
113   return std::string(first.it_, last.it_ - first.it_);
114 }
115 
~UnicodeText()116 UnicodeText::~UnicodeText() {}
117 
118 // ******************* UnicodeText::const_iterator *********************
119 
120 // The implementation of const_iterator would be nicer if it
121 // inherited from boost::iterator_facade
122 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
123 
const_iterator()124 UnicodeText::const_iterator::const_iterator() : it_(0) {}
125 
operator =(const const_iterator & other)126 UnicodeText::const_iterator& UnicodeText::const_iterator::operator=(
127     const const_iterator& other) {
128   if (&other != this) it_ = other.it_;
129   return *this;
130 }
131 
begin() const132 UnicodeText::const_iterator UnicodeText::begin() const {
133   return const_iterator(repr_.data_);
134 }
135 
end() const136 UnicodeText::const_iterator UnicodeText::end() const {
137   return const_iterator(repr_.data_ + repr_.size_);
138 }
139 
operator <(const UnicodeText::const_iterator & lhs,const UnicodeText::const_iterator & rhs)140 bool operator<(const UnicodeText::const_iterator& lhs,
141                const UnicodeText::const_iterator& rhs) {
142   return lhs.it_ < rhs.it_;
143 }
144 
operator *() const145 char32 UnicodeText::const_iterator::operator*() const {
146   // (We could call chartorune here, but that does some
147   // error-checking, and we're guaranteed that our data is valid
148   // UTF-8. Also, we expect this routine to be called very often. So
149   // for speed, we do the calculation ourselves.)
150 
151   // Convert from UTF-8
152   unsigned char byte1 = static_cast<unsigned char>(it_[0]);
153   if (byte1 < 0x80) return byte1;
154 
155   unsigned char byte2 = static_cast<unsigned char>(it_[1]);
156   if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
157 
158   unsigned char byte3 = static_cast<unsigned char>(it_[2]);
159   if (byte1 < 0xF0) {
160     return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
161   }
162 
163   unsigned char byte4 = static_cast<unsigned char>(it_[3]);
164   return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
165          ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
166 }
167 
operator ++()168 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
169   it_ += GetNumBytesForNonZeroUTF8Char(it_);
170   return *this;
171 }
172 
operator --()173 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
174   while (IsTrailByte(*--it_)) {
175   }
176   return *this;
177 }
178 
UTF8ToUnicodeText(const char * utf8_buf,int len,bool do_copy)179 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy) {
180   UnicodeText t;
181   if (do_copy) {
182     t.CopyUTF8(utf8_buf, len);
183   } else {
184     t.PointToUTF8(utf8_buf, len);
185   }
186   return t;
187 }
188 
UTF8ToUnicodeText(const std::string & str,bool do_copy)189 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy) {
190   return UTF8ToUnicodeText(str.data(), str.size(), do_copy);
191 }
192 
193 }  // namespace libtextclassifier
194