1 /*
2 * Copyright (c) 2023 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf8_utils.h"
17
namespace OHOS::Request::Utf8Utils {
namespace {
// Total length, in octets, of the multi-byte UTF-8 sequences handled below.
constexpr size_t TWO_OCTET = 2;
constexpr size_t THREE_OCTET = 3;
constexpr size_t FOUR_OCTET = 4;

// Returns true when lo <= b <= hi (both bounds inclusive).
constexpr bool InRange(uint8_t b, uint8_t lo, uint8_t hi)
{
    return b >= lo && b <= hi;
}

// Advances index by one and, when the new index is still inside v, stores v[index] in next.
// Returns false (leaving next untouched) when the advanced index is past the end of v.
// Note that index is advanced even when false is returned.
bool GetNextByte(const std::vector<uint8_t> &v, size_t &index, uint8_t &next)
{
    index += 1;
    if (index >= v.size()) {
        return false;
    }
    next = v[index];
    return true;
}

// Given a first byte, determines how many bytes are in this UTF-8 character.
// Returns 0 for bytes that can never start a character: continuation bytes (0x80-0xBF),
// 0xC0/0xC1 and 0xF5-0xFF.
size_t Utf8CharWidth(uint8_t b)
{
    // https://tools.ietf.org/html/rfc3629
    static constexpr size_t UTF8_CHAR_WIDTH[256] = {
        // 0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
    };
    return UTF8_CHAR_WIDTH[b];
}

// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
//
// Consumes the byte after index and returns true when it exists and is a
// continuation byte (UTF8-tail, 0x80-0xBF). On return index refers to that byte.
bool Check2Bytes(const std::vector<uint8_t> &v, size_t &index)
{
    uint8_t next = 0;
    return GetNextByte(v, index, next) && InRange(next, 0x80, 0xBF);
}

// https://tools.ietf.org/html/rfc3629
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
//          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
//
// Validates the two bytes that follow the lead byte `first` of a three-octet
// sequence; index must refer to the lead byte on entry.
bool Check3Bytes(const std::vector<uint8_t> &v, uint8_t first, size_t &index)
{
    uint8_t next = 0;
    if (!GetNextByte(v, index, next)) {
        return false;
    }

    // The permitted range of the second byte depends on the lead byte: the narrowed
    // ranges for E0 and ED reject overlong forms and UTF-16 surrogate code points.
    const bool secondOk = (first == 0xE0 && InRange(next, 0xA0, 0xBF)) ||
                          (InRange(first, 0xE1, 0xEC) && InRange(next, 0x80, 0xBF)) ||
                          (first == 0xED && InRange(next, 0x80, 0x9F)) ||
                          (InRange(first, 0xEE, 0xEF) && InRange(next, 0x80, 0xBF));

    // The third byte is a plain UTF8-tail.
    return secondOk && Check2Bytes(v, index);
}

// https://tools.ietf.org/html/rfc3629
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
//          %xF4 %x80-8F 2( UTF8-tail )
//
// Validates the three bytes that follow the lead byte `first` of a four-octet
// sequence; index must refer to the lead byte on entry.
bool Check4Bytes(const std::vector<uint8_t> &v, uint8_t first, size_t &index)
{
    uint8_t next = 0;
    if (!GetNextByte(v, index, next)) {
        return false;
    }

    // The narrowed ranges for F0 and F4 reject overlong forms and code points above U+10FFFF.
    const bool secondOk = (first == 0xF0 && InRange(next, 0x90, 0xBF)) ||
                          (InRange(first, 0xF1, 0xF3) && InRange(next, 0x80, 0xBF)) ||
                          (first == 0xF4 && InRange(next, 0x80, 0x8F));

    // The third and fourth bytes are plain UTF8-tails.
    return secondOk && Check2Bytes(v, index) && Check2Bytes(v, index);
}
} // namespace

// Returns true when v is a well-formed UTF-8 byte sequence according to the RFC 3629
// grammar, false otherwise. An empty vector is considered valid.
bool RunUtf8Validation(const std::vector<uint8_t> &v)
{
    const size_t len = v.size();
    size_t index = 0;

    while (index < len) {
        const uint8_t first = v[index];

        // <= 0x7F means single byte; anything else must start a multi-byte sequence.
        if (first > 0x7F) {
            bool valid = false;
            switch (Utf8CharWidth(first)) {
                case TWO_OCTET:
                    valid = Check2Bytes(v, index);
                    break;
                case THREE_OCTET:
                    valid = Check3Bytes(v, first, index);
                    break;
                case FOUR_OCTET:
                    valid = Check4Bytes(v, first, index);
                    break;
                default:
                    // Width 0: the byte cannot start a character.
                    break;
            }
            if (!valid) {
                return false;
            }
        }

        // The checkers leave index on the last byte of the sequence; step to the next character.
        index += 1;
    }
    return true;
}
} // namespace OHOS::Request::Utf8Utils