• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2006 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Author: Jim Meehan
16 
17 #ifndef UTIL_UTF8_UNICODETEXT_H__
18 #define UTIL_UTF8_UNICODETEXT_H__
19 
20 #include <iterator>
21 #include <string>
22 #include <utility>
23 #include "phonenumbers/base/basictypes.h"
24 
25 namespace i18n {
26 namespace phonenumbers {
27 
28 using std::string;
29 using std::bidirectional_iterator_tag;
30 using std::pair;
31 
32 // ***************************** UnicodeText **************************
33 //
34 // A UnicodeText object is a container for a sequence of Unicode
35 // codepoint values. It has a default constructor, a copy constructor,
36 // and an assignment operator. Data can be appended to it from another
37 // UnicodeText, from iterators, or from a single codepoint.
38 //
39 // The internal representation of the text is UTF-8. Since UTF-8 is a
40 // variable-width format, UnicodeText does not provide random access
41 // to the text, and changes to the text are permitted only at the end.
42 //
43 // The UnicodeText class defines a const_iterator. The dereferencing
44 // operator (*) returns a codepoint (char32). The iterator is a
45 // bidirectional, read-only iterator. It becomes invalid if the text
46 // is changed.
47 //
48 // There are methods for appending and retrieving UTF-8 data directly.
49 // The 'utf8_data' method returns a const char* that contains the
50 // UTF-8-encoded version of the text; 'utf8_length' returns the number
51 // of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores
52 // up to 4 bytes of UTF-8 data in a char array and returns the number
53 // of bytes that it stored.
54 //
55 // Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
56 // 0x10FFFF], but UnicodeText has the additional restriction that it
57 // can contain only those characters that are valid for interchange on
58 // the Web. This excludes all of the control codes except for carriage
59 // return, line feed, and horizontal tab.  It also excludes
60 // non-characters, but codepoints that are in the Private Use regions
61 // are allowed, as are codepoints that are unassigned. (See the
62 // Unicode reference for details.) The function UniLib::IsInterchangeValid
63 // can be used as a test for this property.
64 //
65 // UnicodeTexts are safe. Every method that constructs or modifies a
66 // UnicodeText tests for interchange-validity, and will substitute a
67 // space for the invalid data. Such cases are reported via
68 // LOG(WARNING).
69 //
70 // MEMORY MANAGEMENT: copy, take ownership, or point to
71 //
72 // A UnicodeText is either an "owner", meaning that it owns the memory
73 // for the data buffer and will free it when the UnicodeText is
74 // destroyed, or it is an "alias", meaning that it does not.
75 //
76 // There are three methods for storing UTF-8 data in a UnicodeText:
77 //
78 // CopyUTF8(buffer, len) copies buffer.
79 //
80 // TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
81 //
82 // PointToUTF8(buffer, size) creates an alias pointing to buffer.
83 //
84 // All three methods perform a validity check on the buffer. There are
85 // private, "unsafe" versions of these functions that bypass the
86 // validity check. They are used internally and by friend-functions
87 // that are handling UTF-8 data that has already been validated.
88 //
89 // The purpose of an alias is to avoid making an unnecessary copy of a
90 // UTF-8 buffer while still providing access to the Unicode values
91 // within that text through iterators or the fast scanners that are
92 // based on UTF-8 state tables. The lifetime of an alias must not
93 // exceed the lifetime of the buffer from which it was constructed.
94 //
95 // The semantics of an alias might be described as "copy on write or
96 // repair." The source data is never modified. If push_back() or
97 // append() is called on an alias, a copy of the data will be created,
98 // and the UnicodeText will become an owner. If clear() is called on
99 // an alias, it becomes an (empty) owner.
100 //
101 // The copy constructor and the assignment operator produce an owner.
102 // That is, after direct initialization ("UnicodeText x(y);") or copy
103 // initialization ("UnicodeText x = y;") x will be an owner, even if y
104 // was an alias. The assignment operator ("x = y;") also produces an
105 // owner unless x and y are the same object and y is an alias.
106 //
107 // Aliases should be used with care. If the source from which an alias
108 // was created is freed, or if the contents are changed, while the
109 // alias is still in use, fatal errors could result. But it can be
110 // quite useful to have a UnicodeText "window" through which to see a
111 // UTF-8 buffer without having to pay the price of making a copy.
112 //
113 // UTILITIES
114 //
115 // The interfaces in util/utf8/public/textutils.h provide higher-level
116 // utilities for dealing with UnicodeTexts, including routines for
117 // creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
118 // strings, creating strings from UnicodeTexts, normalizing text for
119 // efficient matching or display, and others.
120 
121 class UnicodeText {
122  public:
123   class const_iterator;
124 
125   typedef char32 value_type;
126 
127   // Constructors. These always produce owners.
128   UnicodeText();  // Create an empty text.
129   UnicodeText(const UnicodeText& src);  // copy constructor
130   // Construct a substring (copies the data).
131   UnicodeText(const const_iterator& first, const const_iterator& last);
132 
133   // Assignment operator. This copies the data and produces an owner
134   // unless this == &src, e.g., "x = x;", which is a no-op.
135   UnicodeText& operator=(const UnicodeText& src);
136 
137   // x.Copy(y) copies the data from y into x.
138   UnicodeText& Copy(const UnicodeText& src);
assign(const UnicodeText & src)139   inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
140 
141   // x.PointTo(y) changes x so that it points to y's data.
142   // It does not copy y or take ownership of y's data.
143   UnicodeText& PointTo(const UnicodeText& src);
144   UnicodeText& PointTo(const const_iterator& first,
145                        const const_iterator& last);
146 
147   ~UnicodeText();
148 
149   void clear();  // Clear text.
empty()150   bool empty() { return repr_.size_ == 0; }  // Test if text is empty.
151 
152   // Add a codepoint to the end of the text.
153   // If the codepoint is not interchange-valid, add a space instead
154   // and log a warning.
155   void push_back(char32 codepoint);
156 
157   // Generic appending operation.
158   // iterator_traits<ForwardIterator>::value_type must be implicitly
159   // convertible to char32. Typical uses of this method might include:
160   //     char32 chars[] = {0x1, 0x2, ...};
161   //     vector<char32> more_chars = ...;
162   //     utext.append(chars, chars+arraysize(chars));
163   //     utext.append(more_chars.begin(), more_chars.end());
164   template<typename ForwardIterator>
append(ForwardIterator first,const ForwardIterator last)165   UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
166     while (first != last) { push_back(*first++); }
167     return *this;
168   }
169 
170   // A specialization of the generic append() method.
171   UnicodeText& append(const const_iterator& first, const const_iterator& last);
172 
173   // An optimization of append(source.begin(), source.end()).
174   UnicodeText& append(const UnicodeText& source);
175 
176   int size() const;  // the number of Unicode characters (codepoints)
177 
178   friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
179   friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
180 
181   class const_iterator {
182     typedef const_iterator CI;
183    public:
184     typedef bidirectional_iterator_tag iterator_category;
185     typedef char32 value_type;
186     typedef ptrdiff_t difference_type;
187     typedef void pointer;  // (Not needed.)
188     typedef const char32 reference;  // (Needed for const_reverse_iterator)
189 
190     // Iterators are default-constructible.
191     const_iterator();
192 
193     // It's safe to make multiple passes over a UnicodeText.
194     const_iterator(const const_iterator& other);
195     const_iterator& operator=(const const_iterator& other);
196 
197     char32 operator*() const;  // Dereference
198 
199     const_iterator& operator++();  // Advance (++iter)
200     const_iterator operator++(int) {  // (iter++)
201       const_iterator result(*this);
202       ++*this;
203       return result;
204     }
205 
206     const_iterator& operator--();  // Retreat (--iter)
207     const_iterator operator--(int) {  // (iter--)
208       const_iterator result(*this);
209       --*this;
210       return result;
211     }
212 
213     // We love relational operators.
214     friend bool operator==(const CI& lhs, const CI& rhs) {
215       return lhs.it_ == rhs.it_; }
216     friend bool operator!=(const CI& lhs, const CI& rhs) {
217       return !(lhs == rhs); }
218     friend bool operator<(const CI& lhs, const CI& rhs);
219     friend bool operator>(const CI& lhs, const CI& rhs) {
220       return rhs < lhs; }
221     friend bool operator<=(const CI& lhs, const CI& rhs) {
222       return !(rhs < lhs); }
223     friend bool operator>=(const CI& lhs, const CI& rhs) {
224       return !(lhs < rhs); }
225 
226     friend difference_type distance(const CI& first, const CI& last);
227 
228     // UTF-8-specific methods
229     // Store the UTF-8 encoding of the current codepoint into buf,
230     // which must be at least 4 bytes long. Return the number of
231     // bytes written.
232     int get_utf8(char* buf) const;
233     // Return the iterator's pointer into the UTF-8 data.
utf8_data()234     const char* utf8_data() const { return it_; }
235 
236     string DebugString() const;
237 
238    private:
239     friend class UnicodeText;
240     friend class UnicodeTextUtils;
241     friend class UTF8StateTableProperty;
const_iterator(const char * it)242     explicit const_iterator(const char* it) : it_(it) {}
243 
244     const char* it_;
245   };
246 
247   const_iterator begin() const;
248   const_iterator end() const;
249 
250   class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
251    public:
const_reverse_iterator(const_iterator it)252     const_reverse_iterator(const_iterator it) :
253         std::reverse_iterator<const_iterator>(it) {}
utf8_data()254     const char* utf8_data() const {
255       const_iterator tmp_it = base();
256       return (--tmp_it).utf8_data();
257     }
get_utf8(char * buf)258     int get_utf8(char* buf) const {
259       const_iterator tmp_it = base();
260       return (--tmp_it).get_utf8(buf);
261     }
262   };
rbegin()263   const_reverse_iterator rbegin() const {
264     return const_reverse_iterator(end());
265   }
rend()266   const_reverse_iterator rend() const {
267     return const_reverse_iterator(begin());
268   }
269 
270   // Substring searching.  Returns the beginning of the first
271   // occurrence of "look", or end() if not found.
272   const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
273   // Equivalent to find(look, begin())
274   const_iterator find(const UnicodeText& look) const;
275 
276   // Returns whether this contains the character U+FFFD.  This can
277   // occur, for example, if the input to Encodings::Decode() had byte
278   // sequences that were invalid in the source encoding.
279   bool HasReplacementChar() const;
280 
281   // UTF-8-specific methods
282   //
283   // Return the data, length, and capacity of UTF-8-encoded version of
284   // the text. Length and capacity are measured in bytes.
utf8_data()285   const char* utf8_data() const { return repr_.data_; }
utf8_length()286   int utf8_length() const { return repr_.size_; }
utf8_capacity()287   int utf8_capacity() const { return repr_.capacity_; }
288 
289   // Return the UTF-8 data as a string.
290   static string UTF8Substring(const const_iterator& first,
291                               const const_iterator& last);
292 
293   // There are three methods for initializing a UnicodeText from UTF-8
294   // data. They vary in details of memory management. In all cases,
295   // the data is tested for interchange-validity. If it is not
296   // interchange-valid, a LOG(WARNING) is issued, and each
297   // structurally invalid byte and each interchange-invalid codepoint
298   // is replaced with a space. The `utf8_was_valid_` status is set
299   // appropriately and may be queried afterwards.
300 
301   // x.CopyUTF8(buf, len) copies buf into x.
302   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
303 
304   // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
305   // buf. buf is not copied.
306   UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
307                                    int byte_length,
308                                    int byte_capacity);
309 
310   // x.PointToUTF8(buf,len) changes x so that it points to buf
311   // ("becomes an alias"). It does not take ownership or copy buf.
312   // If the buffer is not valid, this has the same effect as
313   // CopyUTF8(utf8_buffer, byte_length).
314   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
315 
316   // Was this UnicodeText created from valid UTF-8?
UTF8WasValid()317   bool UTF8WasValid() const { return repr_.utf8_was_valid_; }
318 
319   // Occasionally it is necessary to use functions that operate on the
320   // pointer returned by utf8_data(). MakeIterator(p) provides a way
321   // to get back to the UnicodeText level. It uses CHECK to ensure
322   // that p is a pointer within this object's UTF-8 data, and that it
323   // points to the beginning of a character.
324   const_iterator MakeIterator(const char* p) const;
325 
326   string DebugString() const;
327 
328  private:
329   friend class const_iterator;
330   friend class UnicodeTextUtils;
331 
332   class Repr {  // A byte-string.
333    public:
334     char* data_;
335     int size_;
336     int capacity_;
337     bool ours_;  // Do we own data_?
338     bool utf8_was_valid_; // Were we created from valid UTF-8?
339 
Repr()340     Repr() : data_(NULL), size_(0), capacity_(0), ours_(true), utf8_was_valid_(true) {}
~Repr()341     ~Repr() { if (ours_) delete[] data_; }
342 
343     void clear();
344     void reserve(int capacity);
345     void resize(int size);
346 
347     void append(const char* bytes, int byte_length);
348     void Copy(const char* data, int size);
349     void TakeOwnershipOf(char* data, int size, int capacity);
350     void PointTo(const char* data, int size);
351 
352     string DebugString() const;
353 
354    private:
355     Repr& operator=(const Repr&);
356     Repr(const Repr& other);
357   };
358 
359   Repr repr_;
360 
361   // UTF-8-specific private methods.
362   // These routines do not perform a validity check when compiled
363   // in opt mode.
364   // It is an error to call these methods with UTF-8 data that
365   // is not interchange-valid.
366   //
367   UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
368   UnicodeText& UnsafeTakeOwnershipOfUTF8(
369       char* utf8_buffer, int byte_length, int byte_capacity);
370   UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
371   UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
372   const_iterator UnsafeFind(const UnicodeText& look,
373                             const_iterator start_pos) const;
374 };
375 
376 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
377 
378 inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
379   return !(lhs == rhs);
380 }
381 
382 // UnicodeTextRange is a pair of iterators, useful for specifying text
383 // segments. If the iterators are ==, the segment is empty.
384 typedef pair<UnicodeText::const_iterator,
385              UnicodeText::const_iterator> UnicodeTextRange;
386 
UnicodeTextRangeIsEmpty(const UnicodeTextRange & r)387 inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
388   return r.first == r.second;
389 }
390 
391 
392 // *************************** Utilities *************************
393 
394 // A factory function for creating a UnicodeText from a buffer of
395 // UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
396 // is an "owner.")
397 //
398 // Each byte that is structurally invalid will be replaced with a
399 // space. Each codepoint that is interchange-invalid will also be
400 // replaced with a space, even if the codepoint was represented with a
401 // multibyte sequence in the UTF-8 data.
402 //
MakeUnicodeTextAcceptingOwnership(char * utf8_buffer,int byte_length,int byte_capacity)403 inline UnicodeText MakeUnicodeTextAcceptingOwnership(
404     char* utf8_buffer, int byte_length, int byte_capacity) {
405   return UnicodeText().TakeOwnershipOfUTF8(
406       utf8_buffer, byte_length, byte_capacity);
407 }
408 
409 // A factory function for creating a UnicodeText from a buffer of
410 // UTF-8 data. The new UnicodeText does not take ownership of the
411 // buffer. (It is an "alias.")
412 //
MakeUnicodeTextWithoutAcceptingOwnership(const char * utf8_buffer,int byte_length)413 inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
414     const char* utf8_buffer, int byte_length) {
415   return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
416 }
417 
418 // Create a UnicodeText from a UTF-8 string or buffer.
419 //
420 // If do_copy is true, then a copy of the string is made. The copy is
421 // owned by the resulting UnicodeText object and will be freed when
422 // the object is destroyed. This UnicodeText object is referred to
423 // as an "owner."
424 //
425 // If do_copy is false, then no copy is made. The resulting
426 // UnicodeText object does NOT take ownership of the string; in this
427 // case, the lifetime of the UnicodeText object must not exceed the
428 // lifetime of the string. This Unicodetext object is referred to as
429 // an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
430 //
431 // If the input string does not contain valid UTF-8, then a copy is
432 // made (as if do_copy were true) and coerced to valid UTF-8 by
433 // replacing each invalid byte with a space.
434 //
UTF8ToUnicodeText(const char * utf8_buf,int len,bool do_copy)435 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
436                                      bool do_copy) {
437   UnicodeText t;
438   if (do_copy) {
439     t.CopyUTF8(utf8_buf, len);
440   } else {
441     t.PointToUTF8(utf8_buf, len);
442   }
443   return t;
444 }
445 
UTF8ToUnicodeText(const string & utf_string,bool do_copy)446 inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
447   return UTF8ToUnicodeText(utf_string.data(), static_cast<int>(utf_string.size()), do_copy);
448 }
449 
UTF8ToUnicodeText(const char * utf8_buf,int len)450 inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
451   return UTF8ToUnicodeText(utf8_buf, len, true);
452 }
UTF8ToUnicodeText(const string & utf8_string)453 inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
454   return UTF8ToUnicodeText(utf8_string, true);
455 }
456 
457 // Return a string containing the UTF-8 encoded version of all the
458 // Unicode characters in t.
UnicodeTextToUTF8(const UnicodeText & t)459 inline string UnicodeTextToUTF8(const UnicodeText& t) {
460   return string(t.utf8_data(), t.utf8_length());
461 }
462 
463 }  // namespace phonenumbers
464 }  // namespace i18n
465 
466 #endif  // UTIL_UTF8_UNICODETEXT_H__
467