/**
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
// Routines for manipulating Unicode characters or text.
//
// The StructurallyValid routines accept buffers of arbitrary bytes.
// For CoerceToStructurallyValid(), the input and output buffers may
// point to exactly the same memory.
//
// In all other cases, the UTF-8 string must be structurally valid and
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
// Debug builds take a fatal error for invalid UTF-8 input.
// The input and output buffers may not overlap at all.
//
// The char32 routines are here only for convenience; they convert to UTF-8
// internally and use the UTF-8 routines.
30 
31 #ifndef UTIL_UTF8_UNILIB_H__
32 #define UTIL_UTF8_UNILIB_H__
33 
34 #include <string>
35 #include "phonenumbers/base/basictypes.h"
36 
37 namespace i18n {
38 namespace phonenumbers {
39 namespace UniLib {
40 
41 // Returns true unless a surrogate code point
IsValidCodepoint(char32 c)42 inline bool IsValidCodepoint(char32 c) {
43   // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
44   return (static_cast<uint32>(c) < 0xD800)
45     || (c >= 0xE000 && c <= 0x10FFFF);
46 }
47 
// Table of UTF-8 character lengths, indexed by the first byte of the
// character:
//   0x00..0x7F -> 1
//   0x80..0xBF -> 1  (trail-byte values; counted as a single byte)
//   0xC0..0xDF -> 2
//   0xE0..0xEF -> 3
//   0xF0..0xFF -> 4
// Each row below covers 32 consecutive byte values.
static const unsigned char kUTF8LenTbl[256] = {
  // 0x00..0x7F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  // 0x80..0xBF
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  // 0xC0..0xDF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  // 0xE0..0xEF, then 0xF0..0xFF
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};
60 
61 // Return length of a single UTF-8 source character
OneCharLen(const char * src)62 inline int OneCharLen(const char* src) {
63   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
64 }
65 
66 // Return length of a single UTF-8 source character
OneCharLen(const uint8 * src)67 inline int OneCharLen(const uint8* src) {
68   return kUTF8LenTbl[*src];
69 }
70 
// Returns true if x is a trailing UTF-8 byte (bit pattern 10xx xxxx, i.e. a
// value in [0x80, 0xBF]).
inline bool IsTrailByte(char x) {
  // Equivalent to (x & 0xC0) == 0x80. Viewed as a signed 8-bit value, the
  // range [0x80, 0xBF] is [-128, -65], so one comparison against -0x40 (-64)
  // selects exactly the trail bytes.
  const signed char as_signed = static_cast<signed char>(x);
  return as_signed < -0x40;
}
77 
// Returns the length in bytes of the prefix of src that is all
// interchange valid UTF-8. This overload is only declared here; its
// definition lives outside this header.
int SpanInterchangeValid(const char* src, int byte_length);

// Convenience overload for std::string: forwards the string's bytes and size
// to the pointer/length overload above.
inline int SpanInterchangeValid(const std::string& src) {
  // The pointer/length API takes an int, so the size_t length is narrowed
  // explicitly. NOTE(review): strings longer than INT_MAX bytes are not
  // handled specially here.
  return SpanInterchangeValid(src.data(), static_cast<int>(src.size()));
}
84 
85 // Returns true if the source is all interchange valid UTF-8
86 // "Interchange valid" is a stronger than structurally valid --
87 // no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
IsInterchangeValid(const char * src,int byte_length)88 inline bool IsInterchangeValid(const char* src, int byte_length) {
89   return (byte_length == SpanInterchangeValid(src, byte_length));
90 }
IsInterchangeValid(const std::string & src)91 inline bool IsInterchangeValid(const std::string& src) {
92   return IsInterchangeValid(src.data(), src.size());
93 }
94 
95 }  // namespace UniLib
96 }  // namespace phonenumbers
97 }  // namespace i18n
98 
#endif  // UTIL_UTF8_UNILIB_H__
100