• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
18 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
19 
20 #include <iterator>
21 #include <string>
22 #include <utility>
23 #include <vector>
24 
25 #include "utils/base/integral_types.h"
26 #include "utils/base/logging.h"
27 #include "utils/strings/stringpiece.h"
28 #include "absl/strings/string_view.h"
29 
30 namespace libtextclassifier3 {
31 
32 // ***************************** UnicodeText **************************
33 //
34 // A UnicodeText object is a wrapper around a sequence of Unicode
35 // codepoint values that allows iteration over these values.
36 //
37 // The internal representation of the text is UTF-8. Since UTF-8 is a
38 // variable-width format, UnicodeText does not provide random access
39 // to the text, and changes to the text are permitted only at the end.
40 //
41 // The UnicodeText class defines a const_iterator. The dereferencing
42 // operator (*) returns a codepoint (int32). The iterator is a
43 // read-only iterator. It becomes invalid if the text is changed.
44 //
45 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
46 // 0x10FFFF], but UnicodeText has the additional restriction that it
47 // can contain only those characters that are valid for interchange on
48 // the Web. This excludes all of the control codes except for carriage
49 // return, line feed, and horizontal tab.  It also excludes
50 // non-characters, but codepoints that are in the Private Use regions
51 // are allowed, as are codepoints that are unassigned. (See the
52 // Unicode reference for details.)
53 //
54 // MEMORY MANAGEMENT:
55 //
56 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
57 //
58 // The purpose of an alias is to avoid making an unnecessary copy of a
59 // UTF-8 buffer while still providing access to the Unicode values
60 // within that text through iterators. The lifetime of an alias must not
61 // exceed the lifetime of the buffer from which it was constructed.
62 //
63 // Aliases should be used with care. If the source from which an alias
64 // was created is freed, or if the contents are changed, while the
65 // alias is still in use, fatal errors could result. But it can be
66 // quite useful to have a UnicodeText "window" through which to see a
67 // UTF-8 buffer without having to pay the price of making a copy.
68 
69 class UnicodeText {
70  public:
71   class const_iterator;
72 
73   UnicodeText();  // Create an empty text.
74   UnicodeText(const UnicodeText& src, bool do_copy = true);
75   UnicodeText& operator=(UnicodeText&& src);
76   ~UnicodeText();
77 
78   class const_iterator {
79     typedef const_iterator CI;
80 
81    public:
82     typedef std::bidirectional_iterator_tag iterator_category;
83     typedef char32 value_type;
84     typedef int difference_type;
85     typedef void pointer;            // (Not needed.)
86     typedef const char32 reference;  // (Needed for const_reverse_iterator)
87 
88     // Iterators are default-constructible.
89     const_iterator();
90 
91     // It's safe to make multiple passes over a UnicodeText.
92     const_iterator(const const_iterator&) = default;
93     const_iterator& operator=(const const_iterator&) = default;
94 
95     char32 operator*() const;  // Dereference
96 
97     const_iterator& operator++();     // Advance (++iter)
98     const_iterator operator++(int) {  // (iter++)
99       const_iterator result(*this);
100       ++*this;
101       return result;
102     }
103 
104     const_iterator& operator--();     // Retreat (--iter)
105     const_iterator operator--(int) {  // (iter--)
106       const_iterator result(*this);
107       --*this;
108       return result;
109     }
110 
111     friend bool operator==(const CI& lhs, const CI& rhs) {
112       return lhs.it_ == rhs.it_;
113     }
114     friend bool operator!=(const CI& lhs, const CI& rhs) {
115       return !(lhs == rhs);
116     }
117     friend bool operator<(const CI& lhs, const CI& rhs);
118     friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
119     friend bool operator<=(const CI& lhs, const CI& rhs) {
120       return !(rhs < lhs);
121     }
122     friend bool operator>=(const CI& lhs, const CI& rhs) {
123       return !(lhs < rhs);
124     }
125 
utf8_length()126     int utf8_length() const {
127       const unsigned char byte = static_cast<unsigned char>(it_[0]);
128       if (byte < 0x80) {
129         return 1;
130       } else if (byte < 0xE0) {
131         return 2;
132       } else if (byte < 0xF0) {
133         return 3;
134       } else {
135         return 4;
136       }
137     }
utf8_data()138     const char* utf8_data() const { return it_; }
139 
140    private:
141     friend class UnicodeText;
const_iterator(const char * it)142     explicit const_iterator(const char* it) : it_(it) {}
143 
144     const char* it_;
145   };
146 
147   const_iterator begin() const;
148   const_iterator end() const;
149 
150   // Gets pointer to the underlying utf8 data.
151   const char* data() const;
152 
153   // Gets length (in bytes) of the underlying utf8 data.
154   int size_bytes() const;
155 
156   // Computes length (in number of Unicode codepoints) of the underlying utf8
157   // data.
158   // NOTE: Complexity O(n).
159   int size_codepoints() const;
160 
161   bool empty() const;
162 
163   // Checks whether the underlying data is valid utf8 data.
164   bool is_valid() const;
165 
166   bool operator==(const UnicodeText& other) const;
167 
168   // x.PointToUTF8(buf,len) changes x so that it points to buf
169   // ("becomes an alias"). It does not take ownership or copy buf.
170   // This function assumes that the input is interchange valid UTF8.
171   UnicodeText& Copy(const UnicodeText& src);
172   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
173   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
174 
175   // Calling this may invalidate pointers to underlying data.
176   UnicodeText& AppendUTF8(const char* utf8, int len);
177   UnicodeText& push_back(char32 ch);
178   void clear();
179 
180   // Returns an iterator for each codepoint.
181   std::vector<const_iterator> Codepoints() const;
182 
183   // Returns the list of codepoints of the UnicodeText.
184   std::vector<char32> CodepointsChar32() const;
185 
186   std::string ToUTF8String() const;
187   std::string UTF8Substring(int begin_codepoint, int end_codepoint) const;
188   static std::string UTF8Substring(const const_iterator& it_begin,
189                                    const const_iterator& it_end);
190   static UnicodeText Substring(const UnicodeText& text, int begin_codepoint,
191                                int end_codepoint, bool do_copy = true);
192   static UnicodeText Substring(const const_iterator& it_begin,
193                                const const_iterator& it_end,
194                                bool do_copy = true);
195 
196  private:
197   friend class const_iterator;
198 
199   class Repr {  // A byte-string.
200    public:
201     char* data_;
202     int size_;
203     int capacity_;
204     bool ours_;  // Do we own data_?
205 
Repr()206     Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
207     Repr& operator=(Repr&& src);
~Repr()208     ~Repr() {
209       if (ours_) delete[] data_;
210     }
211 
212     void clear();
213     void reserve(int capacity);
214     void resize(int size);
215 
216     void append(const char* bytes, int byte_length);
217     void Copy(const char* data, int size);
218     void PointTo(const char* data, int size);
219 
220    private:
221     Repr& operator=(const Repr&);
222     Repr(const Repr& other);
223   };
224 
225   Repr repr_;
226 };
227 
228 typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>
229     UnicodeTextRange;
230 
231 // NOTE: The following are needed to avoid implicit conversion from char* to
232 // std::string, or from ::string to std::string, because if this happens it
233 // often results in invalid memory access to a temporary object created during
234 // such conversion (if do_copy == false).
235 // NOTE: These methods don't check if the input string is UTF8 well formed, for
236 // efficiency reasons. Use UnicodeText::is_valid() when explicitly needed.
237 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
238                               bool do_copy = true);
239 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy = true);
240 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy = true);
241 UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy = true);
242 UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy = true);
243 
244 inline logging::LoggingStringStream& operator<<(
245     logging::LoggingStringStream& stream, const UnicodeText& message) {
246   stream.message.append(message.data(), message.size_bytes());
247   return stream;
248 }
249 
250 }  // namespace libtextclassifier3
251 
252 #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
253