• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
// -*- c++ -*-
// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

//               O S C L  UTF-8 S T R I N G  FUNCTIONS

// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

// - - Inclusion - - - - - - - - - - - - - - - - - - - - - - - - - - - -
26 
27 #include "oscl_string_utf8.h"
28 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
29 
30 #define UTF8_1_BYTE_MASK     0x80     //1000 0000 -> 0xxx xxxx
31 #define UTF8_1_BYTE_SIG      0x00     //0000 0000 -> 0xxx xxxx
32 
33 #define UTF8_2_BYTES_MASK_1  0xE0     //1110 0000 -> 110y yyyy
34 #define UTF8_2_BYTES_MASK_2  0xC0     //1100 0000 -> 10xx xxxx
35 #define UTF8_2_BYTES_SIG_1   0xC0     //1100 0000 -> 110y yyyy
36 #define UTF8_2_BYTES_SIG_2   0x80     //1000 0000 -> 10xx xxxx
37 
38 
39 #define UTF8_3_BYTES_MASK_1  0xF0     //1111 0000 -> 1110 zzzz
40 #define UTF8_3_BYTES_MASK_2  0xC0     //1100 0000 -> 10yy yyyy
41 #define UTF8_3_BYTES_MASK_3  0xC0     //1100 0000 -> 10xx xxxx
42 #define UTF8_3_BYTES_SIG_1   0xE0     //1110 0000 -> 1110 zzzz
43 #define UTF8_3_BYTES_SIG_2   0x80     //1000 0000 -> 10yy yyyy
44 #define UTF8_3_BYTES_SIG_3   0x80     //1000 0000 -> 10xx xxxx
45 
46 
47 #define UTF8_4_BYTES_MASK_1    0xF8     //1111 1000 -> 1111 0uuu
48 #define UTF8_4_BYTES_MASK_2    0xC0     //1100 0000 -> 10uu zzzz
49 #define UTF8_4_BYTES_MASK_3    0xC0     //1100 0000 -> 10yy yyyy
50 #define UTF8_4_BYTES_MASK_4    0xC0     //1100 0000 -> 10xx xxxx
51 #define UTF8_4_BYTES_SIG_1     0xF0     //1111 0000 -> 1111 0uuu
52 #define UTF8_4_BYTES_SIG_2     0x80     //1000 0000 -> 10uu zzzz
53 #define UTF8_4_BYTES_SIG_3     0x80     //1000 0000 -> 10yy yyyy
54 #define UTF8_4_BYTES_SIG_4     0x80     //1000 0000 -> 10xx xxxx
55 
56 /* ========================================================================
57  *  Function : oscl_str_is_valid_and_truncate_utf8
58  *  Date     : 10/25/2002
59  *  Purpose  :
60  *  Modified :
61  * ========================================================================
62  */
oscl_str_is_valid_and_truncate_utf8(const uint8 * str_buf,uint32 & num_valid_characters,uint32 max_bytes,uint32 & truncate_pos_index,uint32 max_char_2_valid=0)63 OSCL_EXPORT_REF bool  oscl_str_is_valid_and_truncate_utf8(const uint8 *str_buf, uint32& num_valid_characters, uint32 max_bytes, uint32& truncate_pos_index,
64         uint32 max_char_2_valid = 0)
65 {
66     bool result = true, done = false;
67     uint32 bytes_left;
68     uint32 total_num_valid_characters;
69     const uint8 *ptr;
70 
71     ptr = str_buf;
72     bytes_left = max_bytes;
73     total_num_valid_characters = 0;
74     truncate_pos_index = 0;
75 
76     while (!done)
77     {
78         if ((*ptr & UTF8_1_BYTE_MASK) == UTF8_1_BYTE_SIG)
79         {
80             if ((*ptr == 0x0) && (max_bytes == 0)) //if need to be terminated at first null char
81             {
82                 result = true;
83                 done = true;
84                 continue;
85             }
86             if (bytes_left > 0)
87                 bytes_left--;
88             if (max_bytes && !bytes_left) //done when there is no more data
89                 done = true;
90 
91             ptr++;
92             total_num_valid_characters ++;
93             truncate_pos_index += 1;
94         }
95         else if ((*ptr & UTF8_2_BYTES_MASK_1) == UTF8_2_BYTES_SIG_1)
96         {
97             if ((!max_bytes) || (bytes_left >= 2))
98             {
99                 if (((*(ptr + 1) & UTF8_2_BYTES_MASK_2) != UTF8_2_BYTES_SIG_2)
100                         || (*ptr < 0xC2)  //C0, C1 are illegal
101                    )
102                 { //invalid char
103                     done = true;
104                     result = false;
105                     continue;
106                 }
107                 if (bytes_left)
108                     bytes_left -= 2;
109 
110                 if (max_bytes && !bytes_left) //done when there is no more data
111                     done = true;
112 
113                 ptr += 2;
114                 total_num_valid_characters++;
115                 truncate_pos_index += 2;
116             }
117             else
118             {
119                 done = true;
120                 result = false;
121             }
122         }
123         else if ((*ptr & UTF8_3_BYTES_MASK_1) == UTF8_3_BYTES_SIG_1)
124         {
125             if ((!max_bytes) || (bytes_left >= 3))
126             {
127                 if (((*(ptr + 1) & UTF8_3_BYTES_MASK_2) != UTF8_3_BYTES_SIG_2)
128                         || ((*(ptr + 2) & UTF8_3_BYTES_MASK_3) != UTF8_3_BYTES_SIG_3)
129                         || ((*ptr == 0xE0) && (*(ptr + 1) < 0xA0)) //if 1st byte =E0, 2n byte has to be greater than 0xA0
130                    )
131                 { //invalid char
132                     done = true;
133                     result = false;
134                     continue;
135                 }
136                 if (bytes_left > 0)
137                     bytes_left -= 3;
138 
139                 if (max_bytes && !bytes_left) //done when there is no more data
140                     done = true;
141 
142                 ptr += 3;
143                 total_num_valid_characters++;
144                 truncate_pos_index += 3;
145             }
146             else
147             {
148                 done = true;
149                 result = false;
150             }
151         }
152         else if ((*ptr & UTF8_4_BYTES_MASK_1) == UTF8_4_BYTES_SIG_1)
153         {
154             if ((!max_bytes) || (bytes_left >= 4))
155             {
156                 if (((*(ptr + 1) & UTF8_4_BYTES_MASK_2) != UTF8_4_BYTES_SIG_2)
157                         || ((*(ptr + 2) & UTF8_4_BYTES_MASK_3) != UTF8_4_BYTES_SIG_3)
158                         || ((*(ptr + 3) & UTF8_4_BYTES_MASK_4) != UTF8_4_BYTES_SIG_4)
159                         || (*ptr > 0xF4)
160                         || ((*ptr == 0xF4) && (*(ptr + 1) > 0x8F))
161                    )
162                 { //invalid char
163                     done = true;
164                     result = false;
165                     continue;
166                 }
167                 if (bytes_left > 0)
168                     bytes_left -= 4;
169 
170                 if (max_bytes && !bytes_left) //done when there is no more data
171                     done = true;
172 
173                 ptr += 4;
174                 total_num_valid_characters++;
175                 truncate_pos_index += 4;
176             }
177             else
178             {
179                 done = true;
180                 result = false;
181             }
182         }
183         else  //found invalid char
184         {
185             done = true;
186             result = false;
187 
188             if (bytes_left > 0)
189                 bytes_left--;
190         }
191         if (num_valid_characters && (total_num_valid_characters == num_valid_characters))
192             done = true;
193         if ((!num_valid_characters) && max_char_2_valid && (total_num_valid_characters == max_char_2_valid))
194             done = true;
195 
196     }
197     num_valid_characters = total_num_valid_characters;
198 
199     return result;
200 }
201 
202 /* ========================================================================
203  *  Function : oscl_str_is_valid_utf8
204  *  Date     : 10/25/2002
205  *  Purpose  : see oscl_string_utf8.h
206  *  Modified :
207  * ========================================================================
208  */
oscl_str_is_valid_utf8(const uint8 * str_buf,uint32 & num_valid_characters,uint32 max_bytes,uint32 max_char_2_valid,uint32 * num_byte_4_char)209 OSCL_EXPORT_REF bool  oscl_str_is_valid_utf8(const uint8 *str_buf, uint32& num_valid_characters, uint32 max_bytes,
210         uint32 max_char_2_valid, uint32 *num_byte_4_char)
211 {
212     uint32 truncate_pos_index = 0;
213     num_valid_characters = 0;
214     bool result;
215 
216     if (num_byte_4_char)
217         *num_byte_4_char = 0;
218     result = oscl_str_is_valid_and_truncate_utf8(str_buf, num_valid_characters, max_bytes, truncate_pos_index, max_char_2_valid);
219     if (num_byte_4_char)
220         *num_byte_4_char = truncate_pos_index;
221 
222     return result;
223 }
224 /* ========================================================================
225  *  Function : oscl_str_truncate_utf8
226  *  Date     : 10/25/2002
227  *  Purpose  : see oscl_string_utf8.h
228  *  Modified :
229  * ========================================================================
230  */
oscl_str_truncate_utf8(uint8 * str_buf,uint32 max_chars,uint32 max_bytes)231 OSCL_EXPORT_REF int32  oscl_str_truncate_utf8(uint8 *str_buf, uint32 max_chars, uint32 max_bytes)
232 {
233     uint32 num_valid_characters, truncate_pos_index;
234 
235     num_valid_characters = max_chars;
236     truncate_pos_index = 0;
237 
238     oscl_str_is_valid_and_truncate_utf8(str_buf, num_valid_characters, max_bytes, truncate_pos_index);
239     if (max_bytes)
240     {
241         if (truncate_pos_index <= (max_bytes - 1))
242             *(str_buf + truncate_pos_index) = 0x00;
243     }
244     else
245     {
246         *(str_buf + truncate_pos_index) = 0x00;
247     }
248 
249     return num_valid_characters;
250 }
251 
252