• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
18 #define LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
19 
20 #include <iterator>
21 #include <string>
22 #include <utility>
23 
24 #include "util/base/integral_types.h"
25 
26 namespace libtextclassifier2 {
27 
28 // ***************************** UnicodeText **************************
29 //
30 // A UnicodeText object is a wrapper around a sequence of Unicode
31 // codepoint values that allows iteration over these values.
32 //
33 // The internal representation of the text is UTF-8. Since UTF-8 is a
34 // variable-width format, UnicodeText does not provide random access
35 // to the text, and changes to the text are permitted only at the end.
36 //
37 // The UnicodeText class defines a const_iterator. The dereferencing
38 // operator (*) returns a codepoint (int32). The iterator is a
39 // read-only iterator. It becomes invalid if the text is changed.
40 //
41 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
42 // 0x10FFFF], but UnicodeText has the additional restriction that it
43 // can contain only those characters that are valid for interchange on
44 // the Web. This excludes all of the control codes except for carriage
45 // return, line feed, and horizontal tab.  It also excludes
46 // non-characters, but codepoints that are in the Private Use regions
47 // are allowed, as are codepoints that are unassigned. (See the
48 // Unicode reference for details.)
49 //
50 // MEMORY MANAGEMENT:
51 //
52 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
53 //
54 // The purpose of an alias is to avoid making an unnecessary copy of a
55 // UTF-8 buffer while still providing access to the Unicode values
56 // within that text through iterators. The lifetime of an alias must not
57 // exceed the lifetime of the buffer from which it was constructed.
58 //
59 // Aliases should be used with care. If the source from which an alias
60 // was created is freed, or if the contents are changed, while the
61 // alias is still in use, fatal errors could result. But it can be
62 // quite useful to have a UnicodeText "window" through which to see a
63 // UTF-8 buffer without having to pay the price of making a copy.
64 
65 class UnicodeText {
66  public:
67   class const_iterator;
68 
69   UnicodeText();  // Create an empty text.
70   UnicodeText(const UnicodeText& src);
71   UnicodeText& operator=(UnicodeText&& src);
72   ~UnicodeText();
73 
74   class const_iterator {
75     typedef const_iterator CI;
76 
77    public:
78     typedef std::input_iterator_tag iterator_category;
79     typedef char32 value_type;
80     typedef int difference_type;
81     typedef void pointer;            // (Not needed.)
82     typedef const char32 reference;  // (Needed for const_reverse_iterator)
83 
84     // Iterators are default-constructible.
85     const_iterator();
86 
87     // It's safe to make multiple passes over a UnicodeText.
88     const_iterator& operator=(const const_iterator& other);
89 
90     char32 operator*() const;  // Dereference
91 
92     const_iterator& operator++();     // Advance (++iter)
93     const_iterator operator++(int) {  // (iter++)
94       const_iterator result(*this);
95       ++*this;
96       return result;
97     }
98 
99     const_iterator& operator--();     // Retreat (--iter)
100     const_iterator operator--(int) {  // (iter--)
101       const_iterator result(*this);
102       --*this;
103       return result;
104     }
105 
106     friend bool operator==(const CI& lhs, const CI& rhs) {
107       return lhs.it_ == rhs.it_;
108     }
109     friend bool operator!=(const CI& lhs, const CI& rhs) {
110       return !(lhs == rhs);
111     }
112     friend bool operator<(const CI& lhs, const CI& rhs);
113     friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
114     friend bool operator<=(const CI& lhs, const CI& rhs) {
115       return !(rhs < lhs);
116     }
117     friend bool operator>=(const CI& lhs, const CI& rhs) {
118       return !(lhs < rhs);
119     }
120 
utf8_length()121     int utf8_length() const {
122       if (it_[0] < 0x80) {
123         return 1;
124       } else if (it_[0] < 0xE0) {
125         return 2;
126       } else if (it_[0] < 0xF0) {
127         return 3;
128       } else {
129         return 4;
130       }
131     }
utf8_data()132     const char* utf8_data() const { return it_; }
133 
134    private:
135     friend class UnicodeText;
const_iterator(const char * it)136     explicit const_iterator(const char* it) : it_(it) {}
137 
138     const char* it_;
139   };
140 
141   const_iterator begin() const;
142   const_iterator end() const;
143 
144   // Gets pointer to the underlying utf8 data.
145   const char* data() const;
146 
147   // Gets length (in bytes) of the underlying utf8 data.
148   int size_bytes() const;
149 
150   // Computes length (in number of Unicode codepoints) of the underlying utf8
151   // data.
152   // NOTE: Complexity O(n).
153   int size_codepoints() const;
154 
155   bool empty() const;
156 
157   // Checks whether the underlying data is valid utf8 data.
158   bool is_valid() const;
159 
160   bool operator==(const UnicodeText& other) const;
161 
162   // x.PointToUTF8(buf,len) changes x so that it points to buf
163   // ("becomes an alias"). It does not take ownership or copy buf.
164   // This function assumes that the input is interchange valid UTF8.
165   UnicodeText& Copy(const UnicodeText& src);
166   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
167   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
168 
169   // Calling this may invalidate pointers to underlying data.
170   UnicodeText& AppendUTF8(const char* utf8, int len);
171   UnicodeText& AppendCodepoint(char32 ch);
172   void clear();
173 
174   std::string ToUTF8String() const;
175   static std::string UTF8Substring(const const_iterator& first,
176                                    const const_iterator& last);
177 
178  private:
179   friend class const_iterator;
180 
181   class Repr {  // A byte-string.
182    public:
183     char* data_;
184     int size_;
185     int capacity_;
186     bool ours_;  // Do we own data_?
187 
Repr()188     Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
189     Repr& operator=(Repr&& src);
~Repr()190     ~Repr() {
191       if (ours_) delete[] data_;
192     }
193 
194     void clear();
195     void reserve(int capacity);
196     void resize(int size);
197 
198     void append(const char* bytes, int byte_length);
199     void Copy(const char* data, int size);
200     void PointTo(const char* data, int size);
201 
202    private:
203     Repr& operator=(const Repr&);
204     Repr(const Repr& other);
205   };
206 
207   Repr repr_;
208 };
209 
210 typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>
211     UnicodeTextRange;
212 
213 // NOTE: The following are needed to avoid implicit conversion from char* to
214 // std::string, or from ::string to std::string, because if this happens it
215 // often results in invalid memory access to a temporary object created during
216 // such conversion (if do_copy == false).
217 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy);
218 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy);
219 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy);
220 UnicodeText UTF8ToUnicodeText(const std::string& str);
221 
222 }  // namespace libtextclassifier2
223 
224 #endif  // LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
225