• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
18 #define LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
19 
20 #include <utility>
21 
22 #include "base.h"
23 
24 namespace libtextclassifier {
25 
26 // ***************************** UnicodeText **************************
27 //
28 // A UnicodeText object is a wrapper around a sequence of Unicode
29 // codepoint values that allows iteration over these values.
30 //
31 // The internal representation of the text is UTF-8. Since UTF-8 is a
32 // variable-width format, UnicodeText does not provide random access
33 // to the text, and changes to the text are permitted only at the end.
34 //
35 // The UnicodeText class defines a const_iterator. The dereferencing
36 // operator (*) returns a codepoint (int32). The iterator is a
37 // read-only iterator. It becomes invalid if the text is changed.
38 //
39 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
40 // 0x10FFFF], but UnicodeText has the additional restriction that it
41 // can contain only those characters that are valid for interchange on
42 // the Web. This excludes all of the control codes except for carriage
43 // return, line feed, and horizontal tab.  It also excludes
44 // non-characters, but codepoints that are in the Private Use regions
45 // are allowed, as are codepoints that are unassigned. (See the
46 // Unicode reference for details.)
47 //
48 // MEMORY MANAGEMENT:
49 //
50 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
51 //
52 // The purpose of an alias is to avoid making an unnecessary copy of a
53 // UTF-8 buffer while still providing access to the Unicode values
54 // within that text through iterators. The lifetime of an alias must not
55 // exceed the lifetime of the buffer from which it was constructed.
56 //
57 // Aliases should be used with care. If the source from which an alias
58 // was created is freed, or if the contents are changed, while the
59 // alias is still in use, fatal errors could result. But it can be
60 // quite useful to have a UnicodeText "window" through which to see a
61 // UTF-8 buffer without having to pay the price of making a copy.
62 
63 class UnicodeText {
64  public:
65   class const_iterator;
66 
67   UnicodeText();  // Create an empty text.
68   UnicodeText(const UnicodeText& src);
69   ~UnicodeText();
70 
71   class const_iterator {
72     typedef const_iterator CI;
73 
74    public:
75     typedef std::input_iterator_tag iterator_category;
76     typedef char32 value_type;
77     typedef int difference_type;
78     typedef void pointer;  // (Not needed.)
79     typedef const char32 reference;  // (Needed for const_reverse_iterator)
80 
81     // Iterators are default-constructible.
82     const_iterator();
83 
84     // It's safe to make multiple passes over a UnicodeText.
85     const_iterator& operator=(const const_iterator& other);
86 
87     char32 operator*() const;  // Dereference
88 
89     const_iterator& operator++();  // Advance (++iter)
90     const_iterator operator++(int) {  // (iter++)
91       const_iterator result(*this);
92       ++*this;
93       return result;
94     }
95 
96     const_iterator& operator--();     // Retreat (--iter)
97     const_iterator operator--(int) {  // (iter--)
98       const_iterator result(*this);
99       --*this;
100       return result;
101     }
102 
103     friend bool operator==(const CI& lhs, const CI& rhs) {
104       return lhs.it_ == rhs.it_;
105     }
106     friend bool operator!=(const CI& lhs, const CI& rhs) {
107       return !(lhs == rhs);
108     }
109     friend bool operator<(const CI& lhs, const CI& rhs);
110     friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
111     friend bool operator<=(const CI& lhs, const CI& rhs) {
112       return !(rhs < lhs);
113     }
114     friend bool operator>=(const CI& lhs, const CI& rhs) {
115       return !(lhs < rhs);
116     }
117 
utf8_length()118     int utf8_length() const {
119       if (it_[0] < 0x80) {
120         return 1;
121       } else if (it_[0] < 0xE0) {
122         return 2;
123       } else if (it_[0] < 0xF0) {
124         return 3;
125       } else {
126         return 4;
127       }
128     }
utf8_data()129     const char* utf8_data() const { return it_; }
130 
131    private:
132     friend class UnicodeText;
const_iterator(const char * it)133     explicit const_iterator(const char *it) : it_(it) {}
134 
135     const char *it_;
136   };
137 
138   const_iterator begin() const;
139   const_iterator end() const;
140 
141   // x.PointToUTF8(buf,len) changes x so that it points to buf
142   // ("becomes an alias"). It does not take ownership or copy buf.
143   // This function assumes that the input is interchange valid UTF8.
144   UnicodeText& Copy(const UnicodeText& src);
145   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
146   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
147 
148   // Calling this may invalidate pointers to underlying data.
149   UnicodeText& AppendUTF8(const char* utf8, int len);
150   void clear();
151 
152   static std::string UTF8Substring(const const_iterator& first,
153                                    const const_iterator& last);
154 
155  private:
156   friend class const_iterator;
157 
158   class Repr {  // A byte-string.
159    public:
160     char* data_;
161     int size_;
162     int capacity_;
163     bool ours_;  // Do we own data_?
164 
Repr()165     Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
~Repr()166     ~Repr() {
167       if (ours_) delete[] data_;
168     }
169 
170     void clear();
171     void reserve(int capacity);
172     void resize(int size);
173 
174     void append(const char* bytes, int byte_length);
175     void Copy(const char* data, int size);
176     void TakeOwnershipOf(char* data, int size, int capacity);
177     void PointTo(const char* data, int size);
178 
179    private:
180     Repr& operator=(const Repr&);
181     Repr(const Repr& other);
182   };
183 
184   Repr repr_;
185 };
186 
187 typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>
188     UnicodeTextRange;
189 
190 UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, bool do_copy);
191 UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy);
192 
193 }  // namespace libtextclassifier
194 
195 #endif  // LIBTEXTCLASSIFIER_UTIL_UTF8_UNICODETEXT_H_
196