1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/strings/utf8.h"
18
19 #include "utils/base/logging.h"
20
21 namespace libtextclassifier3 {
IsValidUTF8(const char * src,int size)22 bool IsValidUTF8(const char *src, int size) {
23 for (int i = 0; i < size;) {
24 const int char_length = ValidUTF8CharLength(src + i, size - i);
25 if (char_length <= 0) {
26 return false;
27 }
28 i += char_length;
29 }
30 return true;
31 }
32
ValidUTF8CharLength(const char * src,int size)33 int ValidUTF8CharLength(const char *src, int size) {
34 // Unexpected trail byte.
35 if (IsTrailByte(src[0])) {
36 return -1;
37 }
38
39 const int num_codepoint_bytes = GetNumBytesForUTF8Char(&src[0]);
40 if (num_codepoint_bytes <= 0 || num_codepoint_bytes > size) {
41 return -1;
42 }
43
44 // Check that remaining bytes in the codepoint are trailing bytes.
45 for (int k = 1; k < num_codepoint_bytes; k++) {
46 if (!IsTrailByte(src[k])) {
47 return -1;
48 }
49 }
50
51 return num_codepoint_bytes;
52 }
53
SafeTruncateLength(const char * str,int truncate_at)54 int SafeTruncateLength(const char *str, int truncate_at) {
55 // Always want to truncate at the start of a character, so if
56 // it's in a middle, back up toward the start
57 while (IsTrailByte(str[truncate_at]) && (truncate_at > 0)) {
58 truncate_at--;
59 }
60 return truncate_at;
61 }
62
ValidCharToRune(const char * str)63 char32 ValidCharToRune(const char *str) {
64 TC3_DCHECK(!IsTrailByte(str[0]) && GetNumBytesForUTF8Char(str) > 0);
65
66 // Convert from UTF-8
67 unsigned char byte1 = static_cast<unsigned char>(str[0]);
68 if (byte1 < 0x80) {
69 // One character sequence: 00000 - 0007F.
70 return byte1;
71 }
72
73 unsigned char byte2 = static_cast<unsigned char>(str[1]);
74 if (byte1 < 0xE0) {
75 // Two character sequence: 00080 - 007FF.
76 return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
77 }
78
79 unsigned char byte3 = static_cast<unsigned char>(str[2]);
80 if (byte1 < 0xF0) {
81 // Three character sequence: 00800 - 0FFFF.
82 return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
83 }
84
85 unsigned char byte4 = static_cast<unsigned char>(str[3]);
86 // Four character sequence: 10000 - 1FFFF.
87 return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
88 ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
89 }
90
ValidRuneToChar(const char32 rune,char * dest)91 int ValidRuneToChar(const char32 rune, char *dest) {
92 // Convert to unsigned for range check.
93 uint32 c;
94
95 // 1 char 00-7F
96 c = rune;
97 if (c <= 0x7F) {
98 dest[0] = static_cast<char>(c);
99 return 1;
100 }
101
102 // 2 char 0080-07FF
103 if (c <= 0x07FF) {
104 dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
105 dest[1] = 0x80 | (c & 0x3F);
106 return 2;
107 }
108
109 // 3 char 0800-FFFF
110 if (c <= 0xFFFF) {
111 dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
112 dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
113 dest[2] = 0x80 | (c & 0x3F);
114 return 3;
115 }
116
117 // 4 char 10000-1FFFFF
118 dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
119 dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
120 dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
121 dest[3] = 0x80 | (c & 0x3F);
122 return 4;
123 }
124
125 } // namespace libtextclassifier3
126