1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_ 18 #define LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_ 19 20 #include <iterator> 21 #include <string> 22 #include <utility> 23 24 #include "util/base/integral_types.h" 25 26 namespace libtextclassifier2 { 27 28 // ***************************** UnicodeText ************************** 29 // 30 // A UnicodeText object is a wrapper around a sequence of Unicode 31 // codepoint values that allows iteration over these values. 32 // 33 // The internal representation of the text is UTF-8. Since UTF-8 is a 34 // variable-width format, UnicodeText does not provide random access 35 // to the text, and changes to the text are permitted only at the end. 36 // 37 // The UnicodeText class defines a const_iterator. The dereferencing 38 // operator (*) returns a codepoint (int32). The iterator is a 39 // read-only iterator. It becomes invalid if the text is changed. 40 // 41 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, 42 // 0x10FFFF], but UnicodeText has the additional restriction that it 43 // can contain only those characters that are valid for interchange on 44 // the Web. This excludes all of the control codes except for carriage 45 // return, line feed, and horizontal tab. It also excludes 46 // non-characters, but codepoints that are in the Private Use regions 47 // are allowed, as are codepoints that are unassigned. (See the 48 // Unicode reference for details.) 49 // 50 // MEMORY MANAGEMENT: 51 // 52 // PointToUTF8(buffer, size) creates an alias pointing to buffer. 53 // 54 // The purpose of an alias is to avoid making an unnecessary copy of a 55 // UTF-8 buffer while still providing access to the Unicode values 56 // within that text through iterators. The lifetime of an alias must not 57 // exceed the lifetime of the buffer from which it was constructed. 58 // 59 // Aliases should be used with care. If the source from which an alias 60 // was created is freed, or if the contents are changed, while the 61 // alias is still in use, fatal errors could result. But it can be 62 // quite useful to have a UnicodeText "window" through which to see a 63 // UTF-8 buffer without having to pay the price of making a copy. 64 65 class UnicodeText { 66 public: 67 class const_iterator; 68 69 UnicodeText(); // Create an empty text. 70 UnicodeText(const UnicodeText& src); 71 UnicodeText& operator=(UnicodeText&& src); 72 ~UnicodeText(); 73 74 class const_iterator { 75 typedef const_iterator CI; 76 77 public: 78 typedef std::input_iterator_tag iterator_category; 79 typedef char32 value_type; 80 typedef int difference_type; 81 typedef void pointer; // (Not needed.) 82 typedef const char32 reference; // (Needed for const_reverse_iterator) 83 84 // Iterators are default-constructible. 85 const_iterator(); 86 87 // It's safe to make multiple passes over a UnicodeText. 88 const_iterator& operator=(const const_iterator& other); 89 90 char32 operator*() const; // Dereference 91 92 const_iterator& operator++(); // Advance (++iter) 93 const_iterator operator++(int) { // (iter++) 94 const_iterator result(*this); 95 ++*this; 96 return result; 97 } 98 99 const_iterator& operator--(); // Retreat (--iter) 100 const_iterator operator--(int) { // (iter--) 101 const_iterator result(*this); 102 --*this; 103 return result; 104 } 105 106 friend bool operator==(const CI& lhs, const CI& rhs) { 107 return lhs.it_ == rhs.it_; 108 } 109 friend bool operator!=(const CI& lhs, const CI& rhs) { 110 return !(lhs == rhs); 111 } 112 friend bool operator<(const CI& lhs, const CI& rhs); 113 friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; } 114 friend bool operator<=(const CI& lhs, const CI& rhs) { 115 return !(rhs < lhs); 116 } 117 friend bool operator>=(const CI& lhs, const CI& rhs) { 118 return !(lhs < rhs); 119 } 120 utf8_length()121 int utf8_length() const { 122 if (it_[0] < 0x80) { 123 return 1; 124 } else if (it_[0] < 0xE0) { 125 return 2; 126 } else if (it_[0] < 0xF0) { 127 return 3; 128 } else { 129 return 4; 130 } 131 } utf8_data()132 const char* utf8_data() const { return it_; } 133 134 private: 135 friend class UnicodeText; const_iterator(const char * it)136 explicit const_iterator(const char* it) : it_(it) {} 137 138 const char* it_; 139 }; 140 141 const_iterator begin() const; 142 const_iterator end() const; 143 144 // Gets pointer to the underlying utf8 data. 145 const char* data() const; 146 147 // Gets length (in bytes) of the underlying utf8 data. 148 int size_bytes() const; 149 150 // Computes length (in number of Unicode codepoints) of the underlying utf8 151 // data. 152 // NOTE: Complexity O(n). 153 int size_codepoints() const; 154 155 bool empty() const; 156 157 // Checks whether the underlying data is valid utf8 data. 158 bool is_valid() const; 159 160 bool operator==(const UnicodeText& other) const; 161 162 // x.PointToUTF8(buf,len) changes x so that it points to buf 163 // ("becomes an alias"). It does not take ownership or copy buf. 164 // This function assumes that the input is interchange valid UTF8. 165 UnicodeText& Copy(const UnicodeText& src); 166 UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); 167 UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); 168 169 // Calling this may invalidate pointers to underlying data. 170 UnicodeText& AppendUTF8(const char* utf8, int len); 171 UnicodeText& AppendCodepoint(char32 ch); 172 void clear(); 173 174 std::string ToUTF8String() const; 175 static std::string UTF8Substring(const const_iterator& first, 176 const const_iterator& last); 177 178 private: 179 friend class const_iterator; 180 181 class Repr { // A byte-string. 182 public: 183 char* data_; 184 int size_; 185 int capacity_; 186 bool ours_; // Do we own data_? 187 Repr()188 Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {} 189 Repr& operator=(Repr&& src); ~Repr()190 ~Repr() { 191 if (ours_) delete[] data_; 192 } 193 194 void clear(); 195 void reserve(int capacity); 196 void resize(int size); 197 198 void append(const char* bytes, int byte_length); 199 void Copy(const char* data, int size); 200 void PointTo(const char* data, int size); 201 202 private: 203 Repr& operator=(const Repr&); 204 Repr(const Repr& other); 205 }; 206 207 Repr repr_; 208 }; 209 210 typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator> 211 UnicodeTextRange; 212 213 // NOTE: The following are needed to avoid implicit conversion from char* to 214 // std::string, or from ::string to std::string, because if this happens it 215 // often results in invalid memory access to a temporary object created during 216 // such conversion (if do_copy == false). 217 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy); 218 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy); 219 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy); 220 UnicodeText UTF8ToUnicodeText(const std::string& str); 221 222 } // namespace libtextclassifier2 223 224 #endif // LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_ 225