• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2006 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Author: Jim Meehan
16 
17 #include <algorithm>
18 #include <sstream>
19 #include <cassert>
20 #include <cstdio>
21 
22 #include "phonenumbers/utf/unicodetext.h"
23 #include "phonenumbers/utf/stringpiece.h"
24 #include "phonenumbers/utf/utf.h"
25 #include "phonenumbers/utf/unilib.h"
26 
27 namespace i18n {
28 namespace phonenumbers {
29 
30 using std::stringstream;
31 using std::max;
32 using std::hex;
33 using std::dec;
34 
// Counts the code points encoded in the UTF-8 byte range [start, end).
// Every code point has exactly one byte that is not a trail (continuation)
// byte, so counting those bytes counts code points.
static int CodepointDistance(const char* start, const char* end) {
  int count = 0;
  const char* cursor = start;
  while (cursor < end) {
    // Trail bytes are 0x80..0xBF, i.e. below -0x40 when viewed as signed.
    const signed char byte = *reinterpret_cast<const signed char*>(cursor);
    if (byte >= -0x40) {
      ++count;
    }
    ++cursor;
  }
  return count;
}
43 
CodepointCount(const char * utf8,int len)44 static int CodepointCount(const char* utf8, int len) {
45   return CodepointDistance(utf8, utf8 + len);
46 }
47 
48 UnicodeText::const_iterator::difference_type
distance(const UnicodeText::const_iterator & first,const UnicodeText::const_iterator & last)49 distance(const UnicodeText::const_iterator& first,
50          const UnicodeText::const_iterator& last) {
51   return CodepointDistance(first.it_, last.it_);
52 }
53 
54 // ---------- Utility ----------
55 
ConvertToInterchangeValid(char * start,int len)56 static int ConvertToInterchangeValid(char* start, int len) {
57   // This routine is called only when we've discovered that a UTF-8 buffer
58   // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
59   // was not interchange valid. This indicates a bug in the caller, and
60   // a LOG(WARNING) is done in that case.
61   // This is similar to CoerceToInterchangeValid, but it replaces each
62   // structurally valid byte with a space, and each non-interchange
63   // character with a space, even when that character requires more
64   // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
65   // structurally valid UTF8, but U+FDD0 is not an interchange-valid
66   // code point. The result should contain one space, not three.
67   //
68   // Since the conversion never needs to write more data than it
69   // reads, it is safe to change the buffer in place. It returns the
70   // number of bytes written.
71   char* const in = start;
72   char* out = start;
73   char* const end = start + len;
74   while (start < end) {
75     int good = UniLib::SpanInterchangeValid(start, end - start);
76     if (good > 0) {
77       if (out != start) {
78         memmove(out, start, good);
79       }
80       out += good;
81       start += good;
82       if (start == end) {
83         break;
84       }
85     }
86     // Is the current string invalid UTF8 or just non-interchange UTF8?
87     Rune rune;
88     int n;
89     if (isvalidcharntorune(start, end - start, &rune, &n)) {
90       // structurally valid UTF8, but not interchange valid
91       start += n;  // Skip over the whole character.
92     } else {  // bad UTF8
93       start += 1;  // Skip over just one byte
94     }
95     *out++ = ' ';
96   }
97   return out - in;
98 }
99 
100 
101 // *************** Data representation **********
102 
103 // Note: the copy constructor is undefined.
104 
105 // After reserve(), resize(), or clear(), we're an owner, not an alias.
106 
reserve(int new_capacity)107 void UnicodeText::Repr::reserve(int new_capacity) {
108   // If there's already enough capacity, and we're an owner, do nothing.
109   if (capacity_ >= new_capacity && ours_) return;
110 
111   // Otherwise, allocate a new buffer.
112   capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20);
113   char* new_data = new char[capacity_];
114 
115   // If there is an old buffer, copy it into the new buffer.
116   if (data_) {
117     memcpy(new_data, data_, size_);
118     if (ours_) delete[] data_;  // If we owned the old buffer, free it.
119   }
120   data_ = new_data;
121   ours_ = true;  // We own the new buffer.
122   // size_ is unchanged.
123 }
124 
resize(int new_size)125 void UnicodeText::Repr::resize(int new_size) {
126   if (new_size == 0) {
127     clear();
128   } else {
129     if (!ours_ || new_size > capacity_) reserve(new_size);
130     // Clear the memory in the expanded part.
131     if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
132     size_ = new_size;
133     ours_ = true;
134   }
135 }
136 
137 // This implementation of clear() deallocates the buffer if we're an owner.
138 // That's not strictly necessary; we could just set size_ to 0.
clear()139 void UnicodeText::Repr::clear() {
140   if (ours_) delete[] data_;
141   data_ = NULL;
142   size_ = capacity_ = 0;
143   ours_ = true;
144 }
145 
Copy(const char * data,int size)146 void UnicodeText::Repr::Copy(const char* data, int size) {
147   resize(size);
148   memcpy(data_, data, size);
149 }
150 
TakeOwnershipOf(char * data,int size,int capacity)151 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
152   if (data == data_) return;  // We already own this memory. (Weird case.)
153   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
154   data_ = data;
155   size_ = size;
156   capacity_ = capacity;
157   ours_ = true;
158 }
159 
PointTo(const char * data,int size)160 void UnicodeText::Repr::PointTo(const char* data, int size) {
161   if (ours_ && data_) delete[] data_;  // If we owned the old buffer, free it.
162   data_ = const_cast<char*>(data);
163   size_ = size;
164   capacity_ = size;
165   ours_ = false;
166 }
167 
append(const char * bytes,int byte_length)168 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
169   reserve(size_ + byte_length);
170   memcpy(data_ + size_, bytes, byte_length);
171   size_ += byte_length;
172 }
173 
DebugString() const174 string UnicodeText::Repr::DebugString() const {
175   stringstream ss;
176 
177   ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec
178      << size_ << " capacity=" << capacity_ << " "
179      << (ours_ ? "Owned" : "Alias") << "}";
180 
181   string result;
182   ss >> result;
183 
184   return result;
185 }
186 
187 
188 
189 // *************** UnicodeText ******************
190 
191 // ----- Constructors -----
192 
193 // Default constructor
UnicodeText()194 UnicodeText::UnicodeText() {
195 }
196 
197 // Copy constructor
UnicodeText(const UnicodeText & src)198 UnicodeText::UnicodeText(const UnicodeText& src) {
199   Copy(src);
200 }
201 
202 // Substring constructor
UnicodeText(const UnicodeText::const_iterator & first,const UnicodeText::const_iterator & last)203 UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
204                          const UnicodeText::const_iterator& last) {
205   assert(first <= last && "Incompatible iterators");
206   repr_.append(first.it_, last.it_ - first.it_);
207 }
208 
UTF8Substring(const const_iterator & first,const const_iterator & last)209 string UnicodeText::UTF8Substring(const const_iterator& first,
210                                   const const_iterator& last) {
211   assert(first <= last && "Incompatible iterators");
212   return string(first.it_, last.it_ - first.it_);
213 }
214 
215 
216 // ----- Copy -----
217 
// Copy assignment. Self-assignment is a no-op; otherwise the contents are
// replaced by a copy of src's bytes.
UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
  if (this == &src) {
    return *this;
  }
  Copy(src);
  return *this;
}
224 
// Replaces the contents with a private copy of src's UTF-8 bytes and returns
// *this. No validation is done here.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const char* const source_bytes = src.repr_.data_;
  const int source_size = src.repr_.size_;
  repr_.Copy(source_bytes, source_size);
  return *this;
}
229 
CopyUTF8(const char * buffer,int byte_length)230 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
231   repr_.Copy(buffer, byte_length);
232   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
233     fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
234     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
235   }
236   return *this;
237 }
238 
UnsafeCopyUTF8(const char * buffer,int byte_length)239 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
240                                            int byte_length) {
241   repr_.Copy(buffer, byte_length);
242   return *this;
243 }
244 
245 // ----- TakeOwnershipOf  -----
246 
TakeOwnershipOfUTF8(char * buffer,int byte_length,int byte_capacity)247 UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
248                                               int byte_length,
249                                               int byte_capacity) {
250   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
251   if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
252     fprintf(stderr, "UTF-8 buffer is not interchange-valid.\n");
253     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
254   }
255   return *this;
256 }
257 
UnsafeTakeOwnershipOfUTF8(char * buffer,int byte_length,int byte_capacity)258 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
259                                                     int byte_length,
260                                                     int byte_capacity) {
261   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
262   return *this;
263 }
264 
265 // ----- PointTo -----
266 
PointToUTF8(const char * buffer,int byte_length)267 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
268   if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
269     repr_.PointTo(buffer, byte_length);
270   } else {
271     fprintf(stderr, "UTF-8 buffer is not interchange-valid.");
272     repr_.Copy(buffer, byte_length);
273     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
274   }
275   return *this;
276 }
277 
UnsafePointToUTF8(const char * buffer,int byte_length)278 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
279                                           int byte_length) {
280   repr_.PointTo(buffer, byte_length);
281   return *this;
282 }
283 
// Makes this text a non-owning alias of src's bytes; nothing is copied.
UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
  const char* const source_bytes = src.repr_.data_;
  const int source_size = src.repr_.size_;
  repr_.PointTo(source_bytes, source_size);
  return *this;
}
288 
PointTo(const const_iterator & first,const const_iterator & last)289 UnicodeText& UnicodeText::PointTo(const const_iterator &first,
290                                   const const_iterator &last) {
291   assert(first <= last && " Incompatible iterators");
292   repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
293   return *this;
294 }
295 
296 // ----- Append -----
297 
// Appends all of u's bytes to this text and returns *this.
UnicodeText& UnicodeText::append(const UnicodeText& u) {
  const char* const extra_bytes = u.repr_.data_;
  const int extra_size = u.repr_.size_;
  repr_.append(extra_bytes, extra_size);
  return *this;
}
302 
append(const const_iterator & first,const const_iterator & last)303 UnicodeText& UnicodeText::append(const const_iterator& first,
304                                  const const_iterator& last) {
305   assert(first <= last && "Incompatible iterators");
306   repr_.append(first.it_, last.it_ - first.it_);
307   return *this;
308 }
309 
UnsafeAppendUTF8(const char * utf8,int len)310 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
311   repr_.append(utf8, len);
312   return *this;
313 }
314 
315 // ----- substring searching -----
316 
// Searches for look starting at start_pos, which must lie within this text
// (checked by assert only). Returns the match position, or end() if none.
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
                                              const_iterator start_pos) const {
  assert(start_pos.utf8_data() >= utf8_data());
  assert(start_pos.utf8_data() <= utf8_data() + utf8_length());
  const const_iterator match = UnsafeFind(look, start_pos);
  return match;
}
323 
// Searches the whole text for look. Returns the match position, or end() if
// there is none.
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
  const const_iterator search_from = begin();
  return UnsafeFind(look, search_from);
}
327 
// Searches for look from start_pos onward without checking that start_pos
// belongs to this text. Returns the match position, or end() if none.
UnicodeText::const_iterator UnicodeText::UnsafeFind(
    const UnicodeText& look, const_iterator start_pos) const {
  // UTF-8 is self-synchronizing, so a plain byte-wise substring search finds
  // exactly the code-point sequence matches.
  const char* const base = utf8_data();
  StringPiece haystack(base, utf8_length());
  StringPiece needle(look.utf8_data(), look.utf8_length());
  StringPiece::size_type hit =
      haystack.find(needle, start_pos.utf8_data() - base);
  if (hit != StringPiece::npos) {
    return const_iterator(base + hit);
  }
  return end();
}
339 
HasReplacementChar() const340 bool UnicodeText::HasReplacementChar() const {
341   // Equivalent to:
342   //   UnicodeText replacement_char;
343   //   replacement_char.push_back(0xFFFD);
344   //   return find(replacement_char) != end();
345   StringPiece searching(utf8_data(), utf8_length());
346   StringPiece looking_for("\xEF\xBF\xBD", 3);
347   return searching.find(looking_for) != StringPiece::npos;
348 }
349 
350 // ----- other methods -----
351 
352 // Clear operator
clear()353 void UnicodeText::clear() {
354   repr_.clear();
355 }
356 
357 // Destructor
~UnicodeText()358 UnicodeText::~UnicodeText() {}
359 
360 
push_back(char32 c)361 void UnicodeText::push_back(char32 c) {
362   if (UniLib::IsValidCodepoint(c)) {
363     char buf[UTFmax];
364     Rune rune = c;
365     int len = runetochar(buf, &rune);
366     if (UniLib::IsInterchangeValid(buf, len)) {
367       repr_.append(buf, len);
368     } else {
369       fprintf(stderr, "Unicode value 0x%x is not valid for interchange\n", c);
370       repr_.append(" ", 1);
371     }
372   } else {
373     fprintf(stderr, "Illegal Unicode value: 0x%x\n", c);
374     repr_.append(" ", 1);
375   }
376 }
377 
size() const378 int UnicodeText::size() const {
379   return CodepointCount(repr_.data_, repr_.size_);
380 }
381 
operator ==(const UnicodeText & lhs,const UnicodeText & rhs)382 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
383   if (&lhs == &rhs) return true;
384   if (lhs.repr_.size_ != rhs.repr_.size_) return false;
385   return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
386 }
387 
DebugString() const388 string UnicodeText::DebugString() const {
389   stringstream ss;
390 
391   ss << "{UnicodeText " << hex << this << dec << " chars="
392      << size() << " repr=" << repr_.DebugString() << "}";
393 #if 0
394   return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
395                       this,
396                       size(),
397                       repr_.DebugString().c_str());
398 #endif
399   string result;
400   ss >> result;
401 
402   return result;
403 }
404 
405 
406 // ******************* UnicodeText::const_iterator *********************
407 
408 // The implementation of const_iterator would be nicer if it
409 // inherited from boost::iterator_facade
410 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
411 
const_iterator()412 UnicodeText::const_iterator::const_iterator() : it_(0) {}
413 
const_iterator(const const_iterator & other)414 UnicodeText::const_iterator::const_iterator(const const_iterator& other)
415     : it_(other.it_) {
416 }
417 
418 UnicodeText::const_iterator&
operator =(const const_iterator & other)419 UnicodeText::const_iterator::operator=(const const_iterator& other) {
420   if (&other != this)
421     it_ = other.it_;
422   return *this;
423 }
424 
begin() const425 UnicodeText::const_iterator UnicodeText::begin() const {
426   return const_iterator(repr_.data_);
427 }
428 
end() const429 UnicodeText::const_iterator UnicodeText::end() const {
430   return const_iterator(repr_.data_ + repr_.size_);
431 }
432 
operator <(const UnicodeText::const_iterator & lhs,const UnicodeText::const_iterator & rhs)433 bool operator<(const UnicodeText::const_iterator& lhs,
434                const UnicodeText::const_iterator& rhs) {
435   return lhs.it_ < rhs.it_;
436 }
437 
operator *() const438 char32 UnicodeText::const_iterator::operator*() const {
439   // (We could call chartorune here, but that does some
440   // error-checking, and we're guaranteed that our data is valid
441   // UTF-8. Also, we expect this routine to be called very often. So
442   // for speed, we do the calculation ourselves.)
443 
444   // Convert from UTF-8
445   uint8 byte1 = static_cast<uint8>(it_[0]);
446   if (byte1 < 0x80)
447     return byte1;
448 
449   uint8 byte2 = static_cast<uint8>(it_[1]);
450   if (byte1 < 0xE0)
451     return ((byte1 & 0x1F) << 6)
452           | (byte2 & 0x3F);
453 
454   uint8 byte3 = static_cast<uint8>(it_[2]);
455   if (byte1 < 0xF0)
456     return ((byte1 & 0x0F) << 12)
457          | ((byte2 & 0x3F) << 6)
458          |  (byte3 & 0x3F);
459 
460   uint8 byte4 = static_cast<uint8>(it_[3]);
461   return ((byte1 & 0x07) << 18)
462        | ((byte2 & 0x3F) << 12)
463        | ((byte3 & 0x3F) << 6)
464        |  (byte4 & 0x3F);
465 }
466 
operator ++()467 UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
468   it_ += UniLib::OneCharLen(it_);
469   return *this;
470 }
471 
operator --()472 UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
473   while (UniLib::IsTrailByte(*--it_)) { }
474   return *this;
475 }
476 
get_utf8(char * utf8_output) const477 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
478   utf8_output[0] = it_[0];
479   if (static_cast<unsigned char>(it_[0]) < 0x80)
480     return 1;
481 
482   utf8_output[1] = it_[1];
483   if (static_cast<unsigned char>(it_[0]) < 0xE0)
484     return 2;
485 
486   utf8_output[2] = it_[2];
487   if (static_cast<unsigned char>(it_[0]) < 0xF0)
488     return 3;
489 
490   utf8_output[3] = it_[3];
491   return 4;
492 }
493 
494 
MakeIterator(const char * p) const495 UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
496 #ifndef NDEBUG
497   assert(p != NULL);
498   const char* start = utf8_data();
499   int len = utf8_length();
500   const char* end = start + len;
501   assert(p >= start);
502   assert(p <= end);
503   assert(p == end || !UniLib::IsTrailByte(*p));
504 #endif
505   return const_iterator(p);
506 }
507 
DebugString() const508 string UnicodeText::const_iterator::DebugString() const {
509   stringstream ss;
510 
511   ss << "{iter " << hex << it_ << "}";
512   string result;
513   ss >> result;
514 
515   return result;
516 }
517 
518 }  // namespace phonenumbers
519 }  // namespace i18n
520