1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1998-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /*
7 * File utf8tst.c
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 07/24/2000 Madhu Creation
13 *******************************************************************************
14 */
15
16 #include "unicode/utypes.h"
17 #include "unicode/utf8.h"
18 #include "cmemory.h"
19 #include "cintltst.h"
20
/*
 * Element count of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that it behaves as a single
 * operand wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
22
23 /* lenient UTF-8 ------------------------------------------------------------ */
24
25 /*
26 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
27 * code points with their "natural" encoding.
28 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
29 * single surrogates.
30 *
31 * This is not conformant with UTF-8.
32 *
33 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
34 * the macros below do not attempt to assemble such pairs.
35 */
36
/*
 * Lenient forward read: fetches the code unit at s[i] into c and post-increments i.
 * - Bytes below 0x80 are returned as they are.
 * - Lead bytes are handed to utf8_nextCharSafeBody() with -2 as the last
 *   argument (presumably the lenient mode described above - confirm against
 *   the ICU implementation).
 * - Any other byte yields U_SENTINEL.
 * i and c are evaluated more than once and must be plain lvalues.
 * Every use of s is parenthesized so that the pointer cast applies to the
 * whole argument expression.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
47
/*
 * Lenient backward read: pre-decrements i and fetches the code unit at s[i] into c.
 * - Bytes below 0x80 are returned as they are.
 * - Bytes in 0x80..0xbf are handed to utf8_prevCharSafeBody() with -2 as the
 *   last argument (presumably the lenient mode described above - confirm
 *   against the ICU implementation).
 * - Bytes above 0xbf yield U_SENTINEL.
 * i and c are evaluated more than once and must be plain lvalues.
 * s and start are parenthesized so that compound argument expressions are
 * passed on as a whole.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
58
59 /* -------------------------------------------------------------------------- */
60
61 static void printUChars(const uint8_t *uchars, int16_t len);
62
63 static void TestCodeUnitValues(void);
64 static void TestCharLength(void);
65 static void TestGetChar(void);
66 static void TestNextPrevChar(void);
67 static void TestNulTerminated(void);
68 static void TestNextPrevNonCharacters(void);
69 static void TestNextPrevCharUnsafe(void);
70 static void TestFwdBack(void);
71 static void TestFwdBackUnsafe(void);
72 static void TestSetChar(void);
73 static void TestSetCharUnsafe(void);
74 static void TestAppendChar(void);
75 static void TestAppend(void);
76 static void TestSurrogates(void);
77
78 void addUTF8Test(TestNode** root);
79
80 void
addUTF8Test(TestNode ** root)81 addUTF8Test(TestNode** root)
82 {
83 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues");
84 addTest(root, &TestCharLength, "utf8tst/TestCharLength");
85 addTest(root, &TestGetChar, "utf8tst/TestGetChar");
86 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar");
87 addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated");
88 addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters");
89 addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe");
90 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack");
91 addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
92 addTest(root, &TestSetChar, "utf8tst/TestSetChar");
93 addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
94 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
95 addTest(root, &TestAppend, "utf8tst/TestAppend");
96 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
97 }
98
TestCodeUnitValues()99 static void TestCodeUnitValues()
100 {
101 static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
102
103 int16_t i;
104 for(i=0; i<LENGTHOF(codeunit); i++){
105 uint8_t c=codeunit[i];
106 log_verbose("Testing code unit value of %x\n", c);
107 if(i<4){
108 if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){
109 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
110 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
111 }
112 } else if(i< 8){
113 if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){
114 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
115 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
116 }
117 } else if(i< 12){
118 if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
119 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
120 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
121 }
122 }
123 }
124 }
125
TestCharLength()126 static void TestCharLength()
127 {
128 static const uint32_t codepoint[]={
129 1, 0x0061,
130 1, 0x007f,
131 2, 0x016f,
132 2, 0x07ff,
133 3, 0x0865,
134 3, 0x20ac,
135 4, 0x20402,
136 4, 0x23456,
137 4, 0x24506,
138 4, 0x20402,
139 4, 0x10402,
140 3, 0xd7ff,
141 3, 0xe000,
142
143 };
144
145 int16_t i;
146 UBool multiple;
147 for(i=0; i<LENGTHOF(codepoint); i=(int16_t)(i+2)){
148 UChar32 c=codepoint[i+1];
149 if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
150 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
151 }else{
152 log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c));
153 }
154 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
155 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
156 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
157 }
158 }
159 }
160
TestGetChar()161 static void TestGetChar()
162 {
163 static const uint8_t input[]={
164 /* code unit,*/
165 0x61,
166 0x7f,
167 0xe4,
168 0xba,
169 0x8c,
170 0xF0,
171 0x90,
172 0x90,
173 0x81,
174 0xc0,
175 0x65,
176 0x31,
177 0x9a,
178 0xc9
179 };
180 static const UChar32 result[]={
181 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */
182 0x61, 0x61, 0x61,
183 0x7f, 0x7f, 0x7f,
184 0x4e8c, 0x4e8c, 0x4e8c,
185 0x4e8c, 0x4e8c, 0x4e8c ,
186 0x4e8c, 0x4e8c, 0x4e8c,
187 0x10401, 0x10401, 0x10401 ,
188 0x10401, 0x10401, 0x10401 ,
189 0x10401, 0x10401, 0x10401 ,
190 0x10401, 0x10401, 0x10401,
191 0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
192 0x65, 0x65, 0x65,
193 0x31, 0x31, 0x31,
194 0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
195 0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
196 };
197 uint16_t i=0;
198 UChar32 c, expected;
199 uint32_t offset=0;
200
201 for(offset=0; offset<sizeof(input); offset++) {
202 if (offset < sizeof(input) - 1) {
203 UTF8_GET_CHAR_UNSAFE(input, offset, c);
204 if(c != result[i]){
205 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
206
207 }
208
209 U8_GET_UNSAFE(input, offset, c);
210 if(c != result[i]){
211 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
212
213 }
214 }
215
216 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
217 expected=result[i+1];
218 if(c != expected){
219 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
220 }
221
222 U8_GET(input, 0, offset, sizeof(input), c);
223 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
224 if(c != expected){
225 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
226 }
227
228 U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
229 if(expected<0) { expected=0xfffd; }
230 if(c != expected){
231 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
232 }
233
234 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
235 if(c != result[i+2]){
236 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
237 }
238
239 i=(uint16_t)(i+3);
240 }
241 }
242
TestNextPrevChar()243 static void TestNextPrevChar() {
244 static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
245 static const UChar32 result[]={
246 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
247 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000,
248 0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
249 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
250 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
251 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
252 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61,
253 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
254 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
255 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
256 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
257 0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
258 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401,
259 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
260 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
261 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
262 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061
263 };
264 static const int32_t movedOffset[]={
265 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
266 1, 1, 1, 15, 15, 15,
267 5, 5, 5, 14, 14 , 14,
268 3, 3, 3, 9, 13, 13,
269 4, 4, 4, 9, 12, 12,
270 5, 5, 5, 9, 11, 11,
271 7, 7, 7, 10, 10, 10,
272 7, 7, 7, 9, 9, 9,
273 8, 9, 9, 7, 7, 7,
274 9, 9, 9, 7, 7, 7,
275 11, 10, 10, 5, 5, 5,
276 11, 11, 11, 5, 5, 5,
277 12, 12, 12, 1, 1, 1,
278 13, 13, 13, 1, 1, 1,
279 14, 14, 14, 1, 1, 1,
280 14, 15, 15, 1, 1, 1,
281 14, 16, 16, 0, 0, 0,
282 };
283 /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
284
285 UChar32 c, expected;
286 uint32_t i=0;
287 uint32_t offset=0;
288 int32_t setOffset=0;
289 for(offset=0; offset<sizeof(input); offset++){
290 setOffset=offset;
291 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
292 if(setOffset != movedOffset[i+1]){
293 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
294 offset, movedOffset[i+1], setOffset);
295 }
296 expected=result[i+1];
297 if(c != expected){
298 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
299 }
300
301 setOffset=offset;
302 U8_NEXT(input, setOffset, sizeof(input), c);
303 if(setOffset != movedOffset[i+1]){
304 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
305 offset, movedOffset[i+1], setOffset);
306 }
307 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
308 if(c != expected){
309 log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
310 }
311
312 setOffset=offset;
313 U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
314 if(setOffset != movedOffset[i+1]){
315 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
316 offset, movedOffset[i+1], setOffset);
317 }
318 if(expected<0) { expected=0xfffd; }
319 if(c != expected){
320 log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
321 }
322
323 setOffset=offset;
324 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
325 if(setOffset != movedOffset[i+1]){
326 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
327 offset, movedOffset[i+2], setOffset);
328 }
329 if(c != result[i+2]){
330 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
331 }
332
333 i=i+6;
334 }
335
336 i=0;
337 for(offset=sizeof(input); offset > 0; --offset){
338 setOffset=offset;
339 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
340 if(setOffset != movedOffset[i+4]){
341 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
342 offset, movedOffset[i+4], setOffset);
343 }
344 expected=result[i+4];
345 if(c != expected){
346 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
347 }
348
349 setOffset=offset;
350 U8_PREV(input, 0, setOffset, c);
351 if(setOffset != movedOffset[i+4]){
352 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
353 offset, movedOffset[i+4], setOffset);
354 }
355 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
356 if(c != expected){
357 log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
358 }
359
360 setOffset=offset;
361 U8_PREV_OR_FFFD(input, 0, setOffset, c);
362 if(setOffset != movedOffset[i+4]){
363 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
364 offset, movedOffset[i+4], setOffset);
365 }
366 if(expected<0) { expected=0xfffd; }
367 if(c != expected){
368 log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
369 }
370
371 setOffset=offset;
372 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
373 if(setOffset != movedOffset[i+5]){
374 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
375 offset, movedOffset[i+5], setOffset);
376 }
377 if(c != result[i+5]){
378 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
379 }
380
381 i=i+6;
382 }
383 }
384
385 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()386 static void TestNulTerminated() {
387 static const uint8_t input[]={
388 /* 0 */ 0x61,
389 /* 1 */ 0xf0, 0x90, 0x90, 0x81,
390 /* 5 */ 0xc0, 0x80,
391 /* 7 */ 0xdf, 0x80,
392 /* 9 */ 0xc2,
393 /* 10 */ 0x62,
394 /* 11 */ 0xfd, 0xbe,
395 /* 13 */ 0xe0, 0xa0, 0x80,
396 /* 16 */ 0xe2, 0x82, 0xac,
397 /* 19 */ 0xf0, 0x90, 0x90,
398 /* 22 */ 0x00
399 /* 23 */
400 };
401 static const UChar32 result[]={
402 0x61,
403 0x10401,
404 U_SENTINEL,
405 0x7c0,
406 U_SENTINEL,
407 0x62,
408 U_SENTINEL,
409 0x800,
410 0x20ac,
411 U_SENTINEL,
412 0
413 };
414
415 UChar32 c, c2, expected;
416 int32_t i0, i=0, j, k, expectedIndex;
417 int32_t cpIndex=0;
418 do {
419 i0=i;
420 U8_NEXT(input, i, -1, c);
421 expected=result[cpIndex];
422 if(c!=expected) {
423 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
424 }
425 j=i0;
426 U8_NEXT_OR_FFFD(input, j, -1, c);
427 if(expected<0) { expected=0xfffd; }
428 if(c!=expected) {
429 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
430 }
431 if(j!=i) {
432 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
433 }
434 j=i0;
435 U8_FWD_1(input, j, -1);
436 if(j!=i) {
437 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
438 }
439 ++cpIndex;
440 /*
441 * Move by this many code points from the start.
442 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
443 */
444 expectedIndex= (c==0) ? i-1 : i;
445 k=0;
446 U8_FWD_N(input, k, -1, cpIndex);
447 if(k!=expectedIndex) {
448 log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
449 }
450 } while(c!=0);
451
452 i=0;
453 do {
454 j=i0=i;
455 U8_NEXT(input, i, -1, c);
456 do {
457 U8_GET(input, 0, j, -1, c2);
458 if(c2!=c) {
459 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
460 }
461 U8_GET_OR_FFFD(input, 0, j, -1, c2);
462 expected= (c>=0) ? c : 0xfffd;
463 if(c2!=expected) {
464 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
465 }
466 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
467 k=j+1;
468 U8_SET_CP_LIMIT(input, 0, k, -1);
469 if(k!=i) {
470 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
471 }
472 } while(++j<i);
473 } while(c!=0);
474 }
475
TestNextPrevNonCharacters()476 static void TestNextPrevNonCharacters() {
477 /* test non-characters */
478 static const uint8_t nonChars[]={
479 0xef, 0xb7, 0x90, /* U+fdd0 */
480 0xef, 0xbf, 0xbf, /* U+feff */
481 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
482 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
483 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
484 };
485
486 UChar32 ch;
487 int32_t idx;
488
489 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
490 U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
491 if(!U_IS_UNICODE_NONCHAR(ch)) {
492 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
493 }
494 }
495 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
496 U8_PREV(nonChars, 0, idx, ch);
497 if(!U_IS_UNICODE_NONCHAR(ch)) {
498 log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
499 }
500 }
501 }
502
TestNextPrevCharUnsafe()503 static void TestNextPrevCharUnsafe() {
504 /*
505 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
506 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
507 */
508 static const uint8_t input[]={
509 0x61,
510 0xf0, 0x90, 0x90, 0x81,
511 0xc0, 0x80, /* non-shortest form */
512 0xe2, 0x82, 0xac,
513 0xc2, 0xa1,
514 0xf4, 0x8f, 0xbf, 0xbf,
515 0x00
516 };
517 static const UChar32 codePoints[]={
518 0x61,
519 0x10401,
520 0,
521 0x20ac,
522 0xa1,
523 0x10ffff,
524 0
525 };
526
527 UChar32 c;
528 int32_t i;
529 uint32_t offset;
530 for(i=0, offset=0; offset<sizeof(input); ++i) {
531 UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
532 if(c != codePoints[i]){
533 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
534 offset, codePoints[i], c);
535 }
536 }
537 for(i=0, offset=0; offset<sizeof(input); ++i) {
538 U8_NEXT_UNSAFE(input, offset, c);
539 if(c != codePoints[i]){
540 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
541 offset, codePoints[i], c);
542 }
543 }
544
545 for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
546 UTF8_PREV_CHAR_UNSAFE(input, offset, c);
547 if(c != codePoints[i]){
548 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
549 offset, codePoints[i], c);
550 }
551 }
552 for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
553 U8_PREV_UNSAFE(input, offset, c);
554 if(c != codePoints[i]){
555 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
556 offset, codePoints[i], c);
557 }
558 }
559 }
560
/*
 * Tests the safe forward/backward skipping macros on a byte string that
 * mixes well-formed and ill-formed sequences:
 * UTF8_FWD_1_SAFE / U8_FWD_1, UTF8_BACK_1_SAFE / U8_BACK_1,
 * UTF8_FWD_N_SAFE / U8_FWD_N and UTF8_BACK_N_SAFE / U8_BACK_N.
 * Each old/new macro pair is checked against the same table of expected offsets.
 */
static void TestFwdBack() {
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    /* Offsets reached by successive single steps forward from 0 / backward from the end. */
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    /* Step counts for the _N macros, applied cumulatively, and the offsets reached after each. */
    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    /*
     * Restart from the beginning of the string. Without this reset the
     * previous loop leaves offsafe at the end and U8_FWD_1 is never exercised.
     */
    i=0;
    offsafe=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    /* The _N loops do not reset the offset between steps: the moves accumulate. */
    offsafe=0;
    for(i=0; i<LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    offsafe=0;
    for(i=0; i<LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    offsafe=sizeof(input);
    for(i=0; i<LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
        }
    }
}
644
/*
 * Tests the unsafe forward/backward skipping macros (UTF8_ and U8_ variants
 * of FWD_1, BACK_1, FWD_N and BACK_N) on a (mostly) well-formed string,
 * comparing the offsets reached with the list of code point boundaries.
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    /* Start offset of every sequence above, plus the end of the string. */
    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t pos;
    int32_t step;

    /* Single steps forward: after step k the offset must be boundaries[k]. */
    step=1;
    pos=0;
    while(pos<LENGTHOF(input)) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
    step=1;
    pos=0;
    while(pos<LENGTHOF(input)) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* Single steps backward from the end, walking the boundary list in reverse. */
    step=LENGTHOF(boundaries)-2;
    pos=LENGTHOF(input);
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        --step;
    }
    step=LENGTHOF(boundaries)-2;
    pos=LENGTHOF(input);
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        --step;
    }

    /* Jump forward by step code points, always starting from offset 0. */
    for(step=0; step<LENGTHOF(boundaries); ++step) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }
    for(step=0; step<LENGTHOF(boundaries); ++step) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }

    /* Jump backward by step code points, always starting from the end. */
    for(step=0; step<LENGTHOF(boundaries); ++step) {
        const int32_t mirrored=LENGTHOF(boundaries)-1-step;
        pos=LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != boundaries[mirrored]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[mirrored], pos);
        }
    }
    for(step=0; step<LENGTHOF(boundaries); ++step) {
        const int32_t mirrored=LENGTHOF(boundaries)-1-step;
        pos=LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != boundaries[mirrored]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[mirrored], pos);
        }
    }
}
721
/*
 * For every offset 0..length of the input, checks where the safe macros
 * UTF8_SET_CHAR_START_SAFE / U8_SET_CP_START and
 * UTF8_SET_CHAR_LIMIT_SAFE / U8_SET_CP_LIMIT move that offset.
 * The start macros are not run for offset == length.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    /* Expected results, indexed by the original offset. */
    static const int16_t start_safe[]
        = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    static const int16_t limit_safe[]
        = {0, 1, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };

    int32_t offset, adjusted;
    for(offset=0; offset<=LENGTHOF(input); ++offset){
        if (offset<LENGTHOF(input)){
            adjusted=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, adjusted);
            if(adjusted != start_safe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[offset], adjusted);
            }

            adjusted=offset;
            U8_SET_CP_START(input, 0, adjusted);
            if(adjusted != start_safe[offset]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[offset], adjusted);
            }
        }

        adjusted=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, adjusted, sizeof(input));
        if(adjusted != limit_safe[offset]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[offset], adjusted);
        }

        adjusted=offset;
        U8_SET_CP_LIMIT(input,0, adjusted, sizeof(input));
        if(adjusted != limit_safe[offset]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[offset], adjusted);
        }
    }
}
762
/*
 * For every offset 0..length of the input, checks where the unsafe macros
 * UTF8_SET_CHAR_START_UNSAFE / U8_SET_CP_START_UNSAFE and
 * UTF8_SET_CHAR_LIMIT_UNSAFE / U8_SET_CP_LIMIT_UNSAFE move that offset.
 * The start macros are not run for offset == length, and the limit macros
 * are not run for offset == 0.
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    /* Expected results, indexed by the original offset. */
    static const int16_t start_unsafe[]
        = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 9, 9, 12, 12, 12, 15 };
    static const int16_t limit_unsafe[]
        = {0, 1, 4, 4, 4, 5, 6, 7, 9, 9, 10, 10, 10, 15, 15, 15, 16 };

    int32_t offset, adjusted;
    for(offset=0; offset<=LENGTHOF(input); ++offset){
        if (offset<LENGTHOF(input)){
            adjusted=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, adjusted);
            if(adjusted != start_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[offset], adjusted);
            }

            adjusted=offset;
            U8_SET_CP_START_UNSAFE(input, adjusted);
            if(adjusted != start_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[offset], adjusted);
            }
        }

        if (offset != 0) { /* Can't have it go off the end of the array */
            adjusted=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, adjusted);
            if(adjusted != limit_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[offset], adjusted);
            }

            adjusted=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, adjusted);
            if(adjusted != limit_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[offset], adjusted);
            }
        }
    }
}
805
/*
 * Appends single code points into a copy of a fixed ASCII string with
 * UTF8_APPEND_CHAR_UNSAFE (first 13 cases) and UTF8_APPEND_CHAR_SAFE
 * (remaining 13 cases). For every case it checks the offset after the
 * append and the first LENGTHOF(s) bytes of the buffer.
 */
static void TestAppendChar(){
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
    /*  append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    /*  append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    static const uint16_t movedOffset[]={
    /* offset-moved-to(unsafe) */
        4,              /*for append-pos: 0 , CHAR 0x10401*/
        3,
        3,
        6,
        5,
        12,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    /* offset-moved-to(safe) */
        4,              /*for append-pos: 0, CHAR 0x10401*/
        3,
        4,
        6,
        5,
        11,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes the 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    uint16_t pairIndex, caseNo=0;
    uint8_t work[12];
    uint32_t offset;
    uint16_t size=LENGTHOF(s);

    /* test[] holds (position, code point) pairs; caseNo indexes movedOffset[] and result[]. */
    for(pairIndex=0; pairIndex<LENGTHOF(test); pairIndex=(uint16_t)(pairIndex+2), caseNo++){
        /* Every case starts from a fresh copy of the ASCII string. */
        uprv_memcpy(work, s, size);
        offset=test[pairIndex];

        if(caseNo>=13){
            /* Second half of the tables: bounds-checked append. */
            UTF8_APPEND_CHAR_SAFE(work, offset, size, test[pairIndex+1]);
            if(offset != movedOffset[caseNo]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                    caseNo, movedOffset[caseNo], offset);

            }
            if(uprv_memcmp(work, result[caseNo], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", caseNo);
                printUChars(result[caseNo], size);
                log_err("\nGot:     ");
                printUChars(work, size);
                log_err("\n");
            }
            /* A disabled variant that called utf8_appendCharSafeBody() directly used to follow here. */
        }else{
            /* First half of the tables: unchecked append. */
            UTF8_APPEND_CHAR_UNSAFE(work, offset, test[pairIndex+1]);
            if(offset != movedOffset[caseNo]){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                    caseNo, movedOffset[caseNo], offset);

            }
            if(uprv_memcmp(work, result[caseNo], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", caseNo);
                printUChars(result[caseNo], size);
                log_err("\nGot:      ");
                printUChars(work, size);
                log_err("\n");
            }
        }
    }
}
974
TestAppend()975 static void TestAppend() {
976 static const UChar32 codePoints[]={
977 0x61, 0xdf, 0x901, 0x3040,
978 0xac00, 0xd800, 0xdbff, 0xdcde,
979 0xdffd, 0xe000, 0xffff, 0x10000,
980 0x12345, 0xe0021, 0x10ffff, 0x110000,
981 0x234567, 0x7fffffff, -1, -1000,
982 0, 0x400
983 };
984 static const uint8_t expectUnsafe[]={
985 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
986 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
987 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
988 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
989 /* none from this line */
990 0, 0xd0, 0x80
991 }, expectSafe[]={
992 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
993 0xea, 0xb0, 0x80, /* no surrogates */
994 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
995 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
996 /* none from this line */
997 0, 0xd0, 0x80
998 };
999
1000 uint8_t buffer[100];
1001 UChar32 c;
1002 int32_t i, length;
1003 UBool isError, expectIsError, wrongIsError;
1004
1005 length=0;
1006 for(i=0; i<LENGTHOF(codePoints); ++i) {
1007 c=codePoints[i];
1008 if(c<0 || 0x10ffff<c) {
1009 continue; /* skip non-code points for U8_APPEND_UNSAFE */
1010 }
1011
1012 U8_APPEND_UNSAFE(buffer, length, c);
1013 }
1014 if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1015 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1016 }
1017
1018 length=0;
1019 wrongIsError=FALSE;
1020 for(i=0; i<LENGTHOF(codePoints); ++i) {
1021 c=codePoints[i];
1022 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1023 isError=FALSE;
1024
1025 U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
1026 wrongIsError|= isError!=expectIsError;
1027 }
1028 if(wrongIsError) {
1029 log_err("U8_APPEND did not set isError correctly\n");
1030 }
1031 if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1032 log_err("U8_APPEND did not generate the expected output\n");
1033 }
1034 }
1035
1036 static void
TestSurrogates()1037 TestSurrogates() {
1038 static const uint8_t b[]={
1039 0xc3, 0x9f, /* 00DF */
1040 0xed, 0x9f, 0xbf, /* D7FF */
1041 0xed, 0xa0, 0x81, /* D801 */
1042 0xed, 0xbf, 0xbe, /* DFFE */
1043 0xee, 0x80, 0x80, /* E000 */
1044 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
1045 };
1046 static const UChar32 cp[]={
1047 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1048 };
1049
1050 UChar32 cu, cs, cl;
1051 int32_t i, j, k, iu, is, il, length;
1052
1053 k=0; /* index into cp[] */
1054 length=LENGTHOF(b);
1055 for(i=0; i<length;) {
1056 j=i;
1057 U8_NEXT_UNSAFE(b, j, cu);
1058 iu=j;
1059
1060 j=i;
1061 U8_NEXT(b, j, length, cs);
1062 is=j;
1063
1064 j=i;
1065 L8_NEXT(b, j, length, cl);
1066 il=j;
1067
1068 if(cu!=cp[k]) {
1069 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1070 }
1071
1072 /* U8_NEXT() returns <0 for surrogate code points */
1073 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1074 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1075 }
1076
1077 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1078 if(cl!=cu) {
1079 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1080 }
1081
1082 if(is!=iu || il!=iu) {
1083 log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1084 }
1085
1086 ++k; /* next code point */
1087 i=iu; /* advance by one UTF-8 sequence */
1088 }
1089
1090 while(i>0) {
1091 --k; /* previous code point */
1092
1093 j=i;
1094 U8_PREV_UNSAFE(b, j, cu);
1095 iu=j;
1096
1097 j=i;
1098 U8_PREV(b, 0, j, cs);
1099 is=j;
1100
1101 j=i;
1102 L8_PREV(b, 0, j, cl);
1103 il=j;
1104
1105 if(cu!=cp[k]) {
1106 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1107 }
1108
1109 /* U8_PREV() returns <0 for surrogate code points */
1110 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1111 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1112 }
1113
1114 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1115 if(cl!=cu) {
1116 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1117 }
1118
1119 if(is!=iu || il !=iu) {
1120 log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1121 }
1122
1123 i=iu; /* go back by one UTF-8 sequence */
1124 }
1125 }
1126
/*
 * Emits the first len bytes of uchars through log_err, each formatted as
 * "0x%02x " (two hex digits followed by a space). Nothing is emitted when
 * len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    const uint8_t *cursor=uchars;
    int16_t remaining;

    for(remaining=len; remaining>0; --remaining){
        log_err("0x%02x ", *cursor++);
    }
}
1133