• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/strings/utf8.h"
18 
19 #include "utils/base/logging.h"
20 
21 namespace libtextclassifier3 {
IsValidUTF8(const char * src,int size)22 bool IsValidUTF8(const char *src, int size) {
23   for (int i = 0; i < size;) {
24     const int char_length = ValidUTF8CharLength(src + i, size - i);
25     if (char_length <= 0) {
26       return false;
27     }
28     i += char_length;
29   }
30   return true;
31 }
32 
ValidUTF8CharLength(const char * src,int size)33 int ValidUTF8CharLength(const char *src, int size) {
34   // Unexpected trail byte.
35   if (IsTrailByte(src[0])) {
36     return -1;
37   }
38 
39   const int num_codepoint_bytes = GetNumBytesForUTF8Char(&src[0]);
40   if (num_codepoint_bytes <= 0 || num_codepoint_bytes > size) {
41     return -1;
42   }
43 
44   // Check that remaining bytes in the codepoint are trailing bytes.
45   for (int k = 1; k < num_codepoint_bytes; k++) {
46     if (!IsTrailByte(src[k])) {
47       return -1;
48     }
49   }
50 
51   return num_codepoint_bytes;
52 }
53 
SafeTruncateLength(const char * str,int truncate_at)54 int SafeTruncateLength(const char *str, int truncate_at) {
55   // Always want to truncate at the start of a character, so if
56   // it's in a middle, back up toward the start
57   while (IsTrailByte(str[truncate_at]) && (truncate_at > 0)) {
58     truncate_at--;
59   }
60   return truncate_at;
61 }
62 
ValidCharToRune(const char * str)63 char32 ValidCharToRune(const char *str) {
64   TC3_DCHECK(!IsTrailByte(str[0]) && GetNumBytesForUTF8Char(str) > 0);
65 
66   // Convert from UTF-8
67   unsigned char byte1 = static_cast<unsigned char>(str[0]);
68   if (byte1 < 0x80) {
69     // One character sequence: 00000 - 0007F.
70     return byte1;
71   }
72 
73   unsigned char byte2 = static_cast<unsigned char>(str[1]);
74   if (byte1 < 0xE0) {
75     // Two character sequence: 00080 - 007FF.
76     return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
77   }
78 
79   unsigned char byte3 = static_cast<unsigned char>(str[2]);
80   if (byte1 < 0xF0) {
81     // Three character sequence: 00800 - 0FFFF.
82     return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
83   }
84 
85   unsigned char byte4 = static_cast<unsigned char>(str[3]);
86   // Four character sequence: 10000 - 1FFFF.
87   return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
88          ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
89 }
90 
ValidRuneToChar(const char32 rune,char * dest)91 int ValidRuneToChar(const char32 rune, char *dest) {
92   // Convert to unsigned for range check.
93   uint32 c;
94 
95   // 1 char 00-7F
96   c = rune;
97   if (c <= 0x7F) {
98     dest[0] = static_cast<char>(c);
99     return 1;
100   }
101 
102   // 2 char 0080-07FF
103   if (c <= 0x07FF) {
104     dest[0] = 0xC0 | static_cast<char>(c >> 1 * 6);
105     dest[1] = 0x80 | (c & 0x3F);
106     return 2;
107   }
108 
109   // 3 char 0800-FFFF
110   if (c <= 0xFFFF) {
111     dest[0] = 0xE0 | static_cast<char>(c >> 2 * 6);
112     dest[1] = 0x80 | ((c >> 1 * 6) & 0x3F);
113     dest[2] = 0x80 | (c & 0x3F);
114     return 3;
115   }
116 
117   // 4 char 10000-1FFFFF
118   dest[0] = 0xF0 | static_cast<char>(c >> 3 * 6);
119   dest[1] = 0x80 | ((c >> 2 * 6) & 0x3F);
120   dest[2] = 0x80 | ((c >> 1 * 6) & 0x3F);
121   dest[3] = 0x80 | (c & 0x3F);
122   return 4;
123 }
124 
125 }  // namespace libtextclassifier3
126