1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_ 18 #define LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_ 19 20 #include <utility> 21 22 #include "base.h" 23 24 namespace libtextclassifier { 25 26 // ***************************** UnicodeText ************************** 27 // 28 // A UnicodeText object is a wrapper around a sequence of Unicode 29 // codepoint values that allows iteration over these values. 30 // 31 // The internal representation of the text is UTF-8. Since UTF-8 is a 32 // variable-width format, UnicodeText does not provide random access 33 // to the text, and changes to the text are permitted only at the end. 34 // 35 // The UnicodeText class defines a const_iterator. The dereferencing 36 // operator (*) returns a codepoint (int32). The iterator is a 37 // read-only iterator. It becomes invalid if the text is changed. 38 // 39 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000, 40 // 0x10FFFF], but UnicodeText has the additional restriction that it 41 // can contain only those characters that are valid for interchange on 42 // the Web. This excludes all of the control codes except for carriage 43 // return, line feed, and horizontal tab. It also excludes 44 // non-characters, but codepoints that are in the Private Use regions 45 // are allowed, as are codepoints that are unassigned. (See the 46 // Unicode reference for details.) 47 // 48 // MEMORY MANAGEMENT: 49 // 50 // PointToUTF8(buffer, size) creates an alias pointing to buffer. 51 // 52 // The purpose of an alias is to avoid making an unnecessary copy of a 53 // UTF-8 buffer while still providing access to the Unicode values 54 // within that text through iterators. The lifetime of an alias must not 55 // exceed the lifetime of the buffer from which it was constructed. 56 // 57 // Aliases should be used with care. If the source from which an alias 58 // was created is freed, or if the contents are changed, while the 59 // alias is still in use, fatal errors could result. But it can be 60 // quite useful to have a UnicodeText "window" through which to see a 61 // UTF-8 buffer without having to pay the price of making a copy. 62 63 class UnicodeText { 64 public: 65 class const_iterator; 66 67 UnicodeText(); // Create an empty text. 68 UnicodeText(const UnicodeText& src); 69 ~UnicodeText(); 70 71 class const_iterator { 72 typedef const_iterator CI; 73 74 public: 75 typedef std::input_iterator_tag iterator_category; 76 typedef char32 value_type; 77 typedef int difference_type; 78 typedef void pointer; // (Not needed.) 79 typedef const char32 reference; // (Needed for const_reverse_iterator) 80 81 // Iterators are default-constructible. 82 const_iterator(); 83 84 // It's safe to make multiple passes over a UnicodeText. 85 const_iterator& operator=(const const_iterator& other); 86 87 char32 operator*() const; // Dereference 88 89 const_iterator& operator++(); // Advance (++iter) 90 const_iterator operator++(int) { // (iter++) 91 const_iterator result(*this); 92 ++*this; 93 return result; 94 } 95 96 const_iterator& operator--(); // Retreat (--iter) 97 const_iterator operator--(int) { // (iter--) 98 const_iterator result(*this); 99 --*this; 100 return result; 101 } 102 103 friend bool operator==(const CI& lhs, const CI& rhs) { 104 return lhs.it_ == rhs.it_; 105 } 106 friend bool operator!=(const CI& lhs, const CI& rhs) { 107 return !(lhs == rhs); 108 } 109 friend bool operator<(const CI& lhs, const CI& rhs); 110 friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; } 111 friend bool operator<=(const CI& lhs, const CI& rhs) { 112 return !(rhs < lhs); 113 } 114 friend bool operator>=(const CI& lhs, const CI& rhs) { 115 return !(lhs < rhs); 116 } 117 utf8_length()118 int utf8_length() const { 119 if (it_[0] < 0x80) { 120 return 1; 121 } else if (it_[0] < 0xE0) { 122 return 2; 123 } else if (it_[0] < 0xF0) { 124 return 3; 125 } else { 126 return 4; 127 } 128 } utf8_data()129 const char* utf8_data() const { return it_; } 130 131 private: 132 friend class UnicodeText; const_iterator(const char * it)133 explicit const_iterator(const char *it) : it_(it) {} 134 135 const char *it_; 136 }; 137 138 const_iterator begin() const; 139 const_iterator end() const; 140 141 // x.PointToUTF8(buf,len) changes x so that it points to buf 142 // ("becomes an alias"). It does not take ownership or copy buf. 143 // This function assumes that the input is interchange valid UTF8. 144 UnicodeText& Copy(const UnicodeText& src); 145 UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); 146 UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); 147 148 // Calling this may invalidate pointers to underlying data. 149 UnicodeText& AppendUTF8(const char* utf8, int len); 150 void clear(); 151 152 static std::string UTF8Substring(const const_iterator& first, 153 const const_iterator& last); 154 155 private: 156 friend class const_iterator; 157 158 class Repr { // A byte-string. 159 public: 160 char* data_; 161 int size_; 162 int capacity_; 163 bool ours_; // Do we own data_? 164 Repr()165 Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} ~Repr()166 ~Repr() { 167 if (ours_) delete[] data_; 168 } 169 170 void clear(); 171 void reserve(int capacity); 172 void resize(int size); 173 174 void append(const char* bytes, int byte_length); 175 void Copy(const char* data, int size); 176 void TakeOwnershipOf(char* data, int size, int capacity); 177 void PointTo(const char* data, int size); 178 179 private: 180 Repr& operator=(const Repr&); 181 Repr(const Repr& other); 182 }; 183 184 Repr repr_; 185 }; 186 187 typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator> 188 UnicodeTextRange; 189 190 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy); 191 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy); 192 193 } // namespace libtextclassifier 194 195 #endif // LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_ 196