1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_ 19 20 #include <iterator> 21 #include <string> 22 #include <utility> 23 #include <vector> 24 25 #include "utils/base/integral_types.h" 26 #include "utils/base/logging.h" 27 #include "utils/strings/stringpiece.h" 28 #include "absl/strings/string_view.h" 29 30 namespace libtextclassifier3 { 31 32 // ***************************** UnicodeText ************************** 33 // 34 // A UnicodeText object is a wrapper around a sequence of Unicode 35 // codepoint values that allows iteration over these values. 36 // 37 // The internal representation of the text is UTF-8. Since UTF-8 is a 38 // variable-width format, UnicodeText does not provide random access 39 // to the text, and changes to the text are permitted only at the end. 40 // 41 // The UnicodeText class defines a const_iterator. The dereferencing 42 // operator (*) returns a codepoint (int32). The iterator is a 43 // read-only iterator. It becomes invalid if the text is changed. 44 // 45 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, 46 // 0x10FFFF], but UnicodeText has the additional restriction that it 47 // can contain only those characters that are valid for interchange on 48 // the Web. This excludes all of the control codes except for carriage 49 // return, line feed, and horizontal tab. It also excludes 50 // non-characters, but codepoints that are in the Private Use regions 51 // are allowed, as are codepoints that are unassigned. (See the 52 // Unicode reference for details.) 53 // 54 // MEMORY MANAGEMENT: 55 // 56 // PointToUTF8(buffer, size) creates an alias pointing to buffer. 57 // 58 // The purpose of an alias is to avoid making an unnecessary copy of a 59 // UTF-8 buffer while still providing access to the Unicode values 60 // within that text through iterators. The lifetime of an alias must not 61 // exceed the lifetime of the buffer from which it was constructed. 62 // 63 // Aliases should be used with care. If the source from which an alias 64 // was created is freed, or if the contents are changed, while the 65 // alias is still in use, fatal errors could result. But it can be 66 // quite useful to have a UnicodeText "window" through which to see a 67 // UTF-8 buffer without having to pay the price of making a copy. 68 69 class UnicodeText { 70 public: 71 class const_iterator; 72 73 UnicodeText(); // Create an empty text. 74 UnicodeText(const UnicodeText& src, bool do_copy = true); 75 UnicodeText& operator=(UnicodeText&& src); 76 ~UnicodeText(); 77 78 class const_iterator { 79 typedef const_iterator CI; 80 81 public: 82 typedef std::bidirectional_iterator_tag iterator_category; 83 typedef char32 value_type; 84 typedef int difference_type; 85 typedef void pointer; // (Not needed.) 86 typedef const char32 reference; // (Needed for const_reverse_iterator) 87 88 // Iterators are default-constructible. 89 const_iterator(); 90 91 // It's safe to make multiple passes over a UnicodeText. 92 const_iterator(const const_iterator&) = default; 93 const_iterator& operator=(const const_iterator&) = default; 94 95 char32 operator*() const; // Dereference 96 97 const_iterator& operator++(); // Advance (++iter) 98 const_iterator operator++(int) { // (iter++) 99 const_iterator result(*this); 100 ++*this; 101 return result; 102 } 103 104 const_iterator& operator--(); // Retreat (--iter) 105 const_iterator operator--(int) { // (iter--) 106 const_iterator result(*this); 107 --*this; 108 return result; 109 } 110 111 friend bool operator==(const CI& lhs, const CI& rhs) { 112 return lhs.it_ == rhs.it_; 113 } 114 friend bool operator!=(const CI& lhs, const CI& rhs) { 115 return !(lhs == rhs); 116 } 117 friend bool operator<(const CI& lhs, const CI& rhs); 118 friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; } 119 friend bool operator<=(const CI& lhs, const CI& rhs) { 120 return !(rhs < lhs); 121 } 122 friend bool operator>=(const CI& lhs, const CI& rhs) { 123 return !(lhs < rhs); 124 } 125 utf8_length()126 int utf8_length() const { 127 const unsigned char byte = static_cast<unsigned char>(it_[0]); 128 if (byte < 0x80) { 129 return 1; 130 } else if (byte < 0xE0) { 131 return 2; 132 } else if (byte < 0xF0) { 133 return 3; 134 } else { 135 return 4; 136 } 137 } utf8_data()138 const char* utf8_data() const { return it_; } 139 140 private: 141 friend class UnicodeText; const_iterator(const char * it)142 explicit const_iterator(const char* it) : it_(it) {} 143 144 const char* it_; 145 }; 146 147 const_iterator begin() const; 148 const_iterator end() const; 149 150 // Gets pointer to the underlying utf8 data. 151 const char* data() const; 152 153 // Gets length (in bytes) of the underlying utf8 data. 154 int size_bytes() const; 155 156 // Computes length (in number of Unicode codepoints) of the underlying utf8 157 // data. 158 // NOTE: Complexity O(n). 159 int size_codepoints() const; 160 161 bool empty() const; 162 163 // Checks whether the underlying data is valid utf8 data. 164 bool is_valid() const; 165 166 bool operator==(const UnicodeText& other) const; 167 168 // x.PointToUTF8(buf,len) changes x so that it points to buf 169 // ("becomes an alias"). It does not take ownership or copy buf. 170 // This function assumes that the input is interchange valid UTF8. 171 UnicodeText& Copy(const UnicodeText& src); 172 UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); 173 UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); 174 175 // Calling this may invalidate pointers to underlying data. 176 UnicodeText& AppendUTF8(const char* utf8, int len); 177 UnicodeText& push_back(char32 ch); 178 void clear(); 179 180 // Returns an iterator for each codepoint. 181 std::vector<const_iterator> Codepoints() const; 182 183 // Returns the list of codepoints of the UnicodeText. 184 std::vector<char32> CodepointsChar32() const; 185 186 std::string ToUTF8String() const; 187 std::string UTF8Substring(int begin_codepoint, int end_codepoint) const; 188 static std::string UTF8Substring(const const_iterator& it_begin, 189 const const_iterator& it_end); 190 static UnicodeText Substring(const UnicodeText& text, int begin_codepoint, 191 int end_codepoint, bool do_copy = true); 192 static UnicodeText Substring(const const_iterator& it_begin, 193 const const_iterator& it_end, 194 bool do_copy = true); 195 196 private: 197 friend class const_iterator; 198 199 class Repr { // A byte-string. 200 public: 201 char* data_; 202 int size_; 203 int capacity_; 204 bool ours_; // Do we own data_? 205 Repr()206 Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {} 207 Repr& operator=(Repr&& src); ~Repr()208 ~Repr() { 209 if (ours_) delete[] data_; 210 } 211 212 void clear(); 213 void reserve(int capacity); 214 void resize(int size); 215 216 void append(const char* bytes, int byte_length); 217 void Copy(const char* data, int size); 218 void PointTo(const char* data, int size); 219 220 private: 221 Repr& operator=(const Repr&); 222 Repr(const Repr& other); 223 }; 224 225 Repr repr_; 226 }; 227 228 typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator> 229 UnicodeTextRange; 230 231 // NOTE: The following are needed to avoid implicit conversion from char* to 232 // std::string, or from ::string to std::string, because if this happens it 233 // often results in invalid memory access to a temporary object created during 234 // such conversion (if do_copy == false). 235 // NOTE: These methods don't check if the input string is UTF8 well formed, for 236 // efficiency reasons. Use UnicodeText::is_valid() when explicitly needed. 237 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, 238 bool do_copy = true); 239 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy = true); 240 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy = true); 241 UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy = true); 242 UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy = true); 243 244 inline logging::LoggingStringStream& operator<<( 245 logging::LoggingStringStream& stream, const UnicodeText& message) { 246 stream.message.append(message.data(), message.size_bytes()); 247 return stream; 248 } 249 250 } // namespace libtextclassifier3 251 252 #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_ 253