1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/strings/utf8.h"
18
19 #include "utils/base/logging.h"
20
21 namespace libtextclassifier3 {
22
IsValidUTF8(const char * src,int size)23 bool IsValidUTF8(const char *src, int size) {
24 int char_length;
25 for (int i = 0; i < size;) {
26 if (!IsValidChar(src + i, size - i, &char_length)) {
27 return false;
28 }
29 i += char_length;
30 }
31 return true;
32 }
33
SafeTruncateLength(const char * str,int truncate_at)34 int SafeTruncateLength(const char *str, int truncate_at) {
35 // Always want to truncate at the start of a character, so if
36 // it's in a middle, back up toward the start
37 while (IsTrailByte(str[truncate_at]) && (truncate_at > 0)) {
38 truncate_at--;
39 }
40 return truncate_at;
41 }
42
ValidCharToRune(const char * str)43 char32 ValidCharToRune(const char *str) {
44 TC3_DCHECK(!IsTrailByte(str[0]) && GetNumBytesForUTF8Char(str) > 0);
45
46 // Convert from UTF-8
47 unsigned char byte1 = static_cast<unsigned char>(str[0]);
48 if (byte1 < 0x80) {
49 // One character sequence: 00000 - 0007F.
50 return byte1;
51 }
52
53 unsigned char byte2 = static_cast<unsigned char>(str[1]);
54 if (byte1 < 0xE0) {
55 // Two character sequence: 00080 - 007FF.
56 return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
57 }
58
59 unsigned char byte3 = static_cast<unsigned char>(str[2]);
60 if (byte1 < 0xF0) {
61 // Three character sequence: 00800 - 0FFFF.
62 return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
63 }
64
65 unsigned char byte4 = static_cast<unsigned char>(str[3]);
66 // Four character sequence: 10000 - 1FFFF.
67 return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
68 ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
69 }
70
IsValidChar(const char * str,int size,int * num_bytes)71 bool IsValidChar(const char *str, int size, int *num_bytes) {
72 // Unexpected trail byte.
73 if (IsTrailByte(str[0])) {
74 return false;
75 }
76
77 *num_bytes = GetNumBytesForUTF8Char(str);
78 if (*num_bytes <= 0 || *num_bytes > size) {
79 return false;
80 }
81
82 // Check that remaining bytes in the codepoint are trailing bytes.
83 for (int k = 1; k < *num_bytes; k++) {
84 if (!IsTrailByte(str[k])) {
85 return false;
86 }
87 }
88
89 // Exclude overlong encodings.
90 // Check that the codepoint is encoded with the minimum number of required
91 // bytes. An ascii value could be encoded in 4, 3 or 2 bytes but requires
92 // only 1. There is a unique valid encoding for each code point.
93 // This ensures that string comparisons and searches are well-defined.
94 // See: https://en.wikipedia.org/wiki/UTF-8
95 const char32 codepoint = ValidCharToRune(str);
96 switch (*num_bytes) {
97 case 1:
98 return true;
99 case 2:
100 // Everything below 128 can be encoded in one byte.
101 return (codepoint >= (1 << 7 /* num. payload bits in one byte */));
102 case 3:
103 return (codepoint >= (1 << 11 /* num. payload bits in two utf8 bytes */));
104 case 4:
105 return (codepoint >=
106 (1 << 16 /* num. payload bits in three utf8 bytes */)) &&
107 (codepoint < 0x10FFFF /* maximum rune value */);
108 }
109 return false;
110 }
111
ValidRuneToChar(const char32 rune,char * dest)112 int ValidRuneToChar(const char32 rune, char *dest) {
113 // Convert to unsigned for range check.
114 uint32 c;
115
116 // 1 char 00-7F
117 c = rune;
118 if (c <= 0x7F) {
119 dest[0] = static_cast<char>(c);
120 return 1;
121 }
122
123 // 2 char 0080-07FF
124 if (c <= 0x07FF) {
125 dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
126 dest[1] = 0x80 | (c & 0x3F);
127 return 2;
128 }
129
130 // 3 char 0800-FFFF
131 if (c <= 0xFFFF) {
132 dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
133 dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
134 dest[2] = 0x80 | (c & 0x3F);
135 return 3;
136 }
137
138 // 4 char 10000-1FFFFF
139 dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
140 dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
141 dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
142 dest[3] = 0x80 | (c & 0x3F);
143 return 4;
144 }
145
146 } // namespace libtextclassifier3
147