1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18 // -*- c++ -*-
19 // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
20
21 // O S C L UTF-8 S T R I N G FUNCTIONS
22
23 // = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
24
25 // - - Inclusion - - - - - - - - - - - - - - - - - - - - - - - - - - - -
26
27 #include "oscl_string_utf8.h"
28 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
29
/*
 * Bit patterns used to classify the bytes of a UTF-8 sequence.
 * A byte b occupies a given position when (b & <MASK>) == <SIG>.
 * The suffix _1.._4 is the position of the byte inside the sequence.
 */

/* One-byte sequence: 0xxx xxxx */
#define UTF8_1_BYTE_MASK        0x80    /* keep bit 7           */
#define UTF8_1_BYTE_SIG         0x00    /* bit 7 must be clear  */

/* Two-byte sequence: 110y yyyy, 10xx xxxx */
#define UTF8_2_BYTES_MASK_1     0xE0    /* keep bits 7..5       */
#define UTF8_2_BYTES_SIG_1      0xC0    /* lead byte:  110. ....*/
#define UTF8_2_BYTES_MASK_2     0xC0    /* keep bits 7..6       */
#define UTF8_2_BYTES_SIG_2      0x80    /* trail byte: 10.. ....*/

/* Three-byte sequence: 1110 zzzz, 10yy yyyy, 10xx xxxx */
#define UTF8_3_BYTES_MASK_1     0xF0    /* keep bits 7..4       */
#define UTF8_3_BYTES_SIG_1      0xE0    /* lead byte:  1110 ....*/
#define UTF8_3_BYTES_MASK_2     0xC0    /* keep bits 7..6       */
#define UTF8_3_BYTES_SIG_2      0x80    /* trail byte: 10.. ....*/
#define UTF8_3_BYTES_MASK_3     0xC0    /* keep bits 7..6       */
#define UTF8_3_BYTES_SIG_3      0x80    /* trail byte: 10.. ....*/

/* Four-byte sequence: 1111 0uuu, 10uu zzzz, 10yy yyyy, 10xx xxxx */
#define UTF8_4_BYTES_MASK_1     0xF8    /* keep bits 7..3       */
#define UTF8_4_BYTES_SIG_1      0xF0    /* lead byte:  1111 0...*/
#define UTF8_4_BYTES_MASK_2     0xC0    /* keep bits 7..6       */
#define UTF8_4_BYTES_SIG_2      0x80    /* trail byte: 10.. ....*/
#define UTF8_4_BYTES_MASK_3     0xC0    /* keep bits 7..6       */
#define UTF8_4_BYTES_SIG_3      0x80    /* trail byte: 10.. ....*/
#define UTF8_4_BYTES_MASK_4     0xC0    /* keep bits 7..6       */
#define UTF8_4_BYTES_SIG_4      0x80    /* trail byte: 10.. ....*/
55
56 /* ========================================================================
57 * Function : oscl_str_is_valid_and_truncate_utf8
58 * Date : 10/25/2002
59 * Purpose :
60 * Modified :
61 * ========================================================================
62 */
oscl_str_is_valid_and_truncate_utf8(const uint8 * str_buf,uint32 & num_valid_characters,uint32 max_bytes,uint32 & truncate_pos_index,uint32 max_char_2_valid=0)63 OSCL_EXPORT_REF bool oscl_str_is_valid_and_truncate_utf8(const uint8 *str_buf, uint32& num_valid_characters, uint32 max_bytes, uint32& truncate_pos_index,
64 uint32 max_char_2_valid = 0)
65 {
66 bool result = true, done = false;
67 uint32 bytes_left;
68 uint32 total_num_valid_characters;
69 const uint8 *ptr;
70
71 ptr = str_buf;
72 bytes_left = max_bytes;
73 total_num_valid_characters = 0;
74 truncate_pos_index = 0;
75
76 while (!done)
77 {
78 if ((*ptr & UTF8_1_BYTE_MASK) == UTF8_1_BYTE_SIG)
79 {
80 if ((*ptr == 0x0) && (max_bytes == 0)) //if need to be terminated at first null char
81 {
82 result = true;
83 done = true;
84 continue;
85 }
86 if (bytes_left > 0)
87 bytes_left--;
88 if (max_bytes && !bytes_left) //done when there is no more data
89 done = true;
90
91 ptr++;
92 total_num_valid_characters ++;
93 truncate_pos_index += 1;
94 }
95 else if ((*ptr & UTF8_2_BYTES_MASK_1) == UTF8_2_BYTES_SIG_1)
96 {
97 if ((!max_bytes) || (bytes_left >= 2))
98 {
99 if (((*(ptr + 1) & UTF8_2_BYTES_MASK_2) != UTF8_2_BYTES_SIG_2)
100 || (*ptr < 0xC2) //C0, C1 are illegal
101 )
102 { //invalid char
103 done = true;
104 result = false;
105 continue;
106 }
107 if (bytes_left)
108 bytes_left -= 2;
109
110 if (max_bytes && !bytes_left) //done when there is no more data
111 done = true;
112
113 ptr += 2;
114 total_num_valid_characters++;
115 truncate_pos_index += 2;
116 }
117 else
118 {
119 done = true;
120 result = false;
121 }
122 }
123 else if ((*ptr & UTF8_3_BYTES_MASK_1) == UTF8_3_BYTES_SIG_1)
124 {
125 if ((!max_bytes) || (bytes_left >= 3))
126 {
127 if (((*(ptr + 1) & UTF8_3_BYTES_MASK_2) != UTF8_3_BYTES_SIG_2)
128 || ((*(ptr + 2) & UTF8_3_BYTES_MASK_3) != UTF8_3_BYTES_SIG_3)
129 || ((*ptr == 0xE0) && (*(ptr + 1) < 0xA0)) //if 1st byte =E0, 2n byte has to be greater than 0xA0
130 )
131 { //invalid char
132 done = true;
133 result = false;
134 continue;
135 }
136 if (bytes_left > 0)
137 bytes_left -= 3;
138
139 if (max_bytes && !bytes_left) //done when there is no more data
140 done = true;
141
142 ptr += 3;
143 total_num_valid_characters++;
144 truncate_pos_index += 3;
145 }
146 else
147 {
148 done = true;
149 result = false;
150 }
151 }
152 else if ((*ptr & UTF8_4_BYTES_MASK_1) == UTF8_4_BYTES_SIG_1)
153 {
154 if ((!max_bytes) || (bytes_left >= 4))
155 {
156 if (((*(ptr + 1) & UTF8_4_BYTES_MASK_2) != UTF8_4_BYTES_SIG_2)
157 || ((*(ptr + 2) & UTF8_4_BYTES_MASK_3) != UTF8_4_BYTES_SIG_3)
158 || ((*(ptr + 3) & UTF8_4_BYTES_MASK_4) != UTF8_4_BYTES_SIG_4)
159 || (*ptr > 0xF4)
160 || ((*ptr == 0xF4) && (*(ptr + 1) > 0x8F))
161 )
162 { //invalid char
163 done = true;
164 result = false;
165 continue;
166 }
167 if (bytes_left > 0)
168 bytes_left -= 4;
169
170 if (max_bytes && !bytes_left) //done when there is no more data
171 done = true;
172
173 ptr += 4;
174 total_num_valid_characters++;
175 truncate_pos_index += 4;
176 }
177 else
178 {
179 done = true;
180 result = false;
181 }
182 }
183 else //found invalid char
184 {
185 done = true;
186 result = false;
187
188 if (bytes_left > 0)
189 bytes_left--;
190 }
191 if (num_valid_characters && (total_num_valid_characters == num_valid_characters))
192 done = true;
193 if ((!num_valid_characters) && max_char_2_valid && (total_num_valid_characters == max_char_2_valid))
194 done = true;
195
196 }
197 num_valid_characters = total_num_valid_characters;
198
199 return result;
200 }
201
202 /* ========================================================================
203 * Function : oscl_str_is_valid_utf8
204 * Date : 10/25/2002
205 * Purpose : see oscl_string_utf8.h
206 * Modified :
207 * ========================================================================
208 */
oscl_str_is_valid_utf8(const uint8 * str_buf,uint32 & num_valid_characters,uint32 max_bytes,uint32 max_char_2_valid,uint32 * num_byte_4_char)209 OSCL_EXPORT_REF bool oscl_str_is_valid_utf8(const uint8 *str_buf, uint32& num_valid_characters, uint32 max_bytes,
210 uint32 max_char_2_valid, uint32 *num_byte_4_char)
211 {
212 uint32 truncate_pos_index = 0;
213 num_valid_characters = 0;
214 bool result;
215
216 if (num_byte_4_char)
217 *num_byte_4_char = 0;
218 result = oscl_str_is_valid_and_truncate_utf8(str_buf, num_valid_characters, max_bytes, truncate_pos_index, max_char_2_valid);
219 if (num_byte_4_char)
220 *num_byte_4_char = truncate_pos_index;
221
222 return result;
223 }
224 /* ========================================================================
225 * Function : oscl_str_truncate_utf8
226 * Date : 10/25/2002
227 * Purpose : see oscl_string_utf8.h
228 * Modified :
229 * ========================================================================
230 */
oscl_str_truncate_utf8(uint8 * str_buf,uint32 max_chars,uint32 max_bytes)231 OSCL_EXPORT_REF int32 oscl_str_truncate_utf8(uint8 *str_buf, uint32 max_chars, uint32 max_bytes)
232 {
233 uint32 num_valid_characters, truncate_pos_index;
234
235 num_valid_characters = max_chars;
236 truncate_pos_index = 0;
237
238 oscl_str_is_valid_and_truncate_utf8(str_buf, num_valid_characters, max_bytes, truncate_pos_index);
239 if (max_bytes)
240 {
241 if (truncate_pos_index <= (max_bytes - 1))
242 *(str_buf + truncate_pos_index) = 0x00;
243 }
244 else
245 {
246 *(str_buf + truncate_pos_index) = 0x00;
247 }
248
249 return num_valid_characters;
250 }
251
252