• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2023 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #include "utf8_utils.h"
17 
namespace OHOS::Request::Utf8Utils {
namespace {
// Total number of bytes in a two-, three- and four-byte UTF-8 sequence.
static constexpr size_t TWO_OCTET = 2;
static constexpr size_t THREE_OCTET = 3;
static constexpr size_t FOUR_OCTET = 4;

// Returns true when `value` lies in the closed interval [low, high].
bool InRange(size_t value, size_t low, size_t high)
{
    return value >= low && value <= high;
}

// Advances `index` by one and, if it still addresses an element of `v`,
// stores that element in `next` and returns true. When the end of `v` has
// been reached, returns false and leaves `next` untouched (`index` has
// still been advanced).
bool GetNextByte(const std::vector<uint8_t> &v, size_t &index, uint8_t &next)
{
    ++index;
    if (index < v.size()) {
        next = v[index];
        return true;
    }
    return false;
}

// Maps a leading byte to the total length of the UTF-8 sequence it starts,
// following https://tools.ietf.org/html/rfc3629. Returns 0 for bytes that
// can never start a sequence: 0x80-0xBF (tail bytes), 0xC0-0xC1 and
// 0xF5-0xFF.
size_t Utf8CharWidth(uint8_t b)
{
    if (b <= 0x7F) {
        return 1;
    }
    if (InRange(b, 0xC2, 0xDF)) {
        return TWO_OCTET;
    }
    if (InRange(b, 0xE0, 0xEF)) {
        return THREE_OCTET;
    }
    if (InRange(b, 0xF0, 0xF4)) {
        return FOUR_OCTET;
    }
    return 0;
}

// https://tools.ietf.org/html/rfc3629
// UTF8-1      = %x00-7F
// UTF8-2      = %xC2-DF UTF8-tail
//
// Consumes the byte following `index` and reports whether it is a UTF-8
// tail byte (0x80-0xBF). On success `index` addresses that tail byte.
bool Check2Bytes(const std::vector<uint8_t> &v, size_t &index)
{
    uint8_t tail = 0;
    if (!GetNextByte(v, index, tail)) {
        return false;
    }
    return InRange(tail, 0x80, 0xBF);
}

// https://tools.ietf.org/html/rfc3629
// UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
//               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
//
// `first` is the leading byte located at `index`. The second byte is
// checked against the window permitted for that leading byte, then the
// third byte is checked as an ordinary tail byte.
bool Check3Bytes(const std::vector<uint8_t> &v, const size_t &first, size_t &index)
{
    uint8_t second = 0;
    if (!GetNextByte(v, index, second)) {
        return false;
    }

    // Default window is the plain tail range; 0xE0 and 0xED narrow it.
    size_t low = 0x80;
    size_t high = 0xBF;
    if (first == 0xE0) {
        low = 0xA0;
    } else if (first == 0xED) {
        high = 0x9F;
    } else if (!InRange(first, 0xE1, 0xEF)) {
        return false;
    }
    if (!InRange(second, low, high)) {
        return false;
    }

    return Check2Bytes(v, index);
}

// https://tools.ietf.org/html/rfc3629
// UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
//               %xF4 %x80-8F 2( UTF8-tail )
//
// `first` is the leading byte located at `index`. The second byte is
// checked against the window permitted for that leading byte, then the
// third and fourth bytes are checked as ordinary tail bytes.
bool Check4Bytes(const std::vector<uint8_t> &v, const size_t &first, size_t &index)
{
    uint8_t second = 0;
    if (!GetNextByte(v, index, second)) {
        return false;
    }

    // Default window is the plain tail range; 0xF0 and 0xF4 narrow it.
    size_t low = 0x80;
    size_t high = 0xBF;
    if (first == 0xF0) {
        low = 0x90;
    } else if (first == 0xF4) {
        high = 0x8F;
    } else if (!InRange(first, 0xF1, 0xF3)) {
        return false;
    }
    if (!InRange(second, low, high)) {
        return false;
    }

    if (!Check2Bytes(v, index)) {
        return false;
    }
    return Check2Bytes(v, index);
}
} // namespace

// Returns true when every byte of `v` belongs to a well-formed UTF-8
// sequence as defined by RFC 3629; an empty vector is reported as valid.
// Returns false at the first invalid leading byte, out-of-range
// continuation byte, or sequence cut short by the end of the input.
bool RunUtf8Validation(const std::vector<uint8_t> &v)
{
    const size_t total = v.size();
    size_t pos = 0;

    while (pos < total) {
        const uint8_t lead = v[pos];
        bool wellFormed = false;

        // Each CheckNBytes helper leaves `pos` on the last byte of the
        // sequence it accepted.
        switch (Utf8CharWidth(lead)) {
            case 1:
                wellFormed = true;
                break;
            case TWO_OCTET:
                wellFormed = Check2Bytes(v, pos);
                break;
            case THREE_OCTET:
                wellFormed = Check3Bytes(v, lead, pos);
                break;
            case FOUR_OCTET:
                wellFormed = Check4Bytes(v, lead, pos);
                break;
            default:
                wellFormed = false;
                break;
        }
        if (!wellFormed) {
            return false;
        }

        // Step past the last byte of the sequence just validated.
        ++pos;
    }
    return true;
}
} // namespace OHOS::Request::Utf8Utils