1 /******************************************************************************
2
3 @File PVRTUnicode.cpp
4
5 @Title PVRTUnicode
6
7 @Version @Version
8
9 @Copyright Copyright (c) Imagination Technologies Limited.
10
11 @Platform All
12
13 @Description A small collection of functions used to decode Unicode formats to
14 individual code points.
15
16 ******************************************************************************/
17 #include "PVRTUnicode.h"
18 #include <string.h>
19
20 /****************************************************************************
21 ** Constants
22 ****************************************************************************/
23 const PVRTuint32 c_u32ReplChar = 0xFFFD;
24
25 #define VALID_ASCII 0x80
26 #define TAIL_MASK 0x3F
27 #define BYTES_PER_TAIL 6
28
29 #define UTF16_SURG_H_MARK 0xD800
30 #define UTF16_SURG_H_END 0xDBFF
31 #define UTF16_SURG_L_MARK 0xDC00
32 #define UTF16_SURG_L_END 0xDFFF
33
34 #define UNICODE_NONCHAR_MARK 0xFDD0
35 #define UNICODE_NONCHAR_END 0xFDEF
36 #define UNICODE_RESERVED 0xFFFE
37 #define UNICODE_MAX 0x10FFFF
38
39 #define MAX_LEN 0x8FFF
40
41 /****************************************************************************
42 ** A table which allows quick lookup to determine the number of bytes of a
43 ** UTF8 code point.
44 ****************************************************************************/
45 const PVRTuint8 c_u8UTF8Lengths[256] =
46 {
47 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
54 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
60 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
61 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
62 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
63 };
64
65 /****************************************************************************
66 ** A table which allows quick lookup to determine whether a UTF8 sequence
67 ** is 'overlong'.
68 ****************************************************************************/
69 const PVRTuint32 c_u32MinVals[4] =
70 {
71 0x00000000, // 0 tail bytes
72 0x00000080, // 1 tail bytes
73 0x00000800, // 2 tail bytes
74 0x00010000, // 3 tail bytes
75 };
76
77 /*!***************************************************************************
78 @Function CheckGenericUnicode
79 @Input c32 A UTF32 character/Unicode code point
80 @Returns Success or failure.
81 @Description Checks that the decoded code point is valid.
82 *****************************************************************************/
CheckGenericUnicode(PVRTuint32 c32)83 static bool CheckGenericUnicode(PVRTuint32 c32)
84 {
85 // Check that this value isn't a UTF16 surrogate mask.
86 if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
87 return false;
88 // Check non-char values
89 if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
90 return false;
91 // Check reserved values
92 if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
93 return false;
94 // Check max value.
95 if(c32 > UNICODE_MAX)
96 return false;
97
98 return true;
99 }
100
101 /*!***************************************************************************
102 @Function PVRTUnicodeUTF8ToUTF32
103 @Input pUTF8 A UTF8 string, which is null terminated.
104 @Output aUTF32 An array of Unicode code points.
105 @Returns Success or failure.
106 @Description Decodes a UTF8-encoded string in to Unicode code points
107 (UTF32). If pUTF8 is not null terminated, the results are
108 undefined.
109 *****************************************************************************/
PVRTUnicodeUTF8ToUTF32(const PVRTuint8 * const pUTF8,CPVRTArray<PVRTuint32> & aUTF32)110 EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
111 {
112 unsigned int uiTailLen, uiIndex;
113 unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
114 PVRTuint32 c32;
115
116 const PVRTuint8* pC = pUTF8;
117 while(*pC)
118 {
119 // Quick optimisation for ASCII characters
120 while(*pC && *pC < VALID_ASCII)
121 {
122 aUTF32.Append(*pC++);
123 }
124 // Done
125 if(!*pC)
126 break;
127
128 c32 = *pC++;
129 uiTailLen = c_u8UTF8Lengths[c32];
130
131 // Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
132 // Also check to make sure the tail length is inside the provided buffer.
133 if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
134 return PVR_OVERFLOW;
135
136 c32 &= (TAIL_MASK >> uiTailLen); // Get the data out of the first byte. This depends on the length of the tail.
137
138 // Get the data out of each tail byte
139 uiIndex = 0;
140 while(uiIndex < uiTailLen)
141 {
142 if((pC[uiIndex] & 0xC0) != 0x80)
143 return PVR_FAIL; // Invalid tail byte!
144
145 c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
146 uiIndex++;
147 }
148
149 pC += uiIndex;
150
151 // Check overlong values.
152 if(c32 < c_u32MinVals[uiTailLen])
153 return PVR_FAIL;
154
155 if(!CheckGenericUnicode(c32))
156 return PVR_FAIL;
157
158 // OK
159 aUTF32.Append(c32);
160 }
161
162 return PVR_SUCCESS;
163 }
164
165 /*!***************************************************************************
166 @Function PVRTUnicodeUTF16ToUTF32
167 @Input pUTF16 A UTF16 string, which is null terminated.
168 @Output aUTF32 An array of Unicode code points.
169 @Returns Success or failure.
170 @Description Decodes a UTF16-encoded string in to Unicode code points
171 (UTF32). If pUTF16 is not null terminated, the results are
172 undefined.
173 *****************************************************************************/
PVRTUnicodeUTF16ToUTF32(const PVRTuint16 * const pUTF16,CPVRTArray<PVRTuint32> & aUTF32)174 EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
175 {
176 const PVRTuint16* pC = pUTF16;
177
178 // Determine the number of shorts
179 while(*++pC && (pC - pUTF16) < MAX_LEN);
180 unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);
181
182 if(uiBufferLen == MAX_LEN)
183 return PVR_OVERFLOW; // Probably not NULL terminated.
184
185 // Reset to start.
186 pC = pUTF16;
187
188 PVRTuint32 c32;
189 while(*pC)
190 {
191 // Straight copy. We'll check for surrogate pairs next...
192 c32 = *pC++;
193
194 // Check surrogate pair
195 if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
196 {
197 // Make sure the next 2 bytes are in range...
198 if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
199 return PVR_OVERFLOW;
200
201 // Check that the next value is in the low surrogate range
202 if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
203 return PVR_FAIL;
204
205 // Decode
206 c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
207 pC++;
208 }
209
210 if(!CheckGenericUnicode(c32))
211 return PVR_FAIL;
212
213 // OK
214 aUTF32.Append(c32);
215 }
216
217 return PVR_SUCCESS;
218 }
219
220 /*!***************************************************************************
221 @Function PVRTUnicodeUTF8Length
222 @Input pUTF8 A UTF8 string, which is null terminated.
223 @Returns The length of the string, in Unicode code points.
224 @Description Calculates the length of a UTF8 string. If pUTF8 is
225 not null terminated, the results are undefined.
226 *****************************************************************************/
PVRTUnicodeUTF8Length(const PVRTuint8 * const pUTF8)227 unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
228 {
229 const PVRTuint8* pC = pUTF8;
230
231 unsigned int charCount = 0;
232 unsigned int mask;
233 while(*pC)
234 {
235 // Quick optimisation for ASCII characters
236 const PVRTuint8* pStart = pC;
237 while(*pC && *pC < VALID_ASCII)
238 pC++;
239
240 charCount += (unsigned int) (pC - pStart);
241
242 // Done
243 if(!*pC)
244 break;
245
246 mask = *pC & 0xF0;
247 switch(mask)
248 {
249 case 0xF0: pC++;
250 case 0xE0: pC++;
251 case 0xC0: pC++;
252 break;
253 default:
254 _ASSERT(!"Invalid tail byte!");
255 return 0;
256 }
257
258 pC++;
259 charCount++;
260 }
261
262 return charCount;
263 }
264
265 /*!***************************************************************************
266 @Function PVRTUnicodeUTF16Length
267 @Input pUTF16 A UTF16 string, which is null terminated.
268 @Returns The length of the string, in Unicode code points.
269 @Description Calculates the length of a UTF16 string.
270 If pUTF16 is not null terminated, the results are
271 undefined.
272 *****************************************************************************/
PVRTUnicodeUTF16Length(const PVRTuint16 * const pUTF16)273 unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
274 {
275 const PVRTuint16* pC = pUTF16;
276 unsigned int charCount = 0;
277 while(*pC && (pC - pUTF16) < MAX_LEN)
278 {
279 if( pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
280 && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
281 {
282 pC += 2;
283 }
284 else
285 {
286 pC += 1;
287 }
288
289 charCount++;
290 }
291
292 return charCount;
293 }
294
295 /*!***************************************************************************
296 @Function PVRTUnicodeValidUTF8
297 @Input pUTF8 A UTF8 string, which is null terminated.
298 @Returns true or false
299 @Description Checks whether the encoding of a UTF8 string is valid.
300 If pUTF8 is not null terminated, the results are undefined.
301 *****************************************************************************/
PVRTUnicodeValidUTF8(const PVRTuint8 * const pUTF8)302 bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
303 {
304 unsigned int uiTailLen, uiIndex;
305 unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
306 const PVRTuint8* pC = pUTF8;
307 while(*pC)
308 {
309 // Quick optimisation for ASCII characters
310 while(*pC && *pC < VALID_ASCII) pC++;
311 // Done?
312 if(!*pC)
313 break;
314
315 PVRTuint32 c32 = *pC++;
316 uiTailLen = c_u8UTF8Lengths[c32];
317
318 // Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
319 // Also check to make sure the tail length is inside the provided buffer.
320 if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
321 return false;
322
323 // Get the data out of each tail byte
324 uiIndex = 0;
325 while(uiIndex < uiTailLen)
326 {
327 if((pC[uiIndex] & 0xC0) != 0x80)
328 return false; // Invalid tail byte!
329
330 c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
331 uiIndex++;
332 }
333
334 pC += uiIndex;
335
336 // Check overlong values.
337 if(c32 < c_u32MinVals[uiTailLen])
338 return false;
339 if(!CheckGenericUnicode(c32))
340 return false;
341 }
342
343 return true;
344 }
345
346 /*****************************************************************************
347 End of file (PVRTUnicode.cpp)
348 *****************************************************************************/
349
350