• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1998-2012, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /*
7 * File utf8tst.c
8 *
9 * Modification History:
10 *
11 *   Date          Name        Description
12 *   07/24/2000    Madhu       Creation
13 *******************************************************************************
14 */
15 
16 #include "unicode/utypes.h"
17 #include "unicode/utf8.h"
18 #include "cmemory.h"
19 #include "cintltst.h"
20 
21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
22 
23 /* lenient UTF-8 ------------------------------------------------------------ */
24 
25 /*
26  * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
27  * code points with their "natural" encoding.
28  * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
29  * single surrogates.
30  *
31  * This is not conformant with UTF-8.
32  *
33  * Supplementary code points may be encoded as pairs of 3-byte sequences, but
34  * the macros below do not attempt to assemble such pairs.
35  */
36 
/*
 * L8_NEXT(s, i, length, c): read one code point from lenient UTF-8 text,
 * moving forward.
 *
 *   s       byte string (any pointer expression; it is cast to const uint8_t *)
 *   i       offset lvalue; incremented past the bytes that were consumed
 *   length  limit passed to utf8_nextCharSafeBody()
 *   c       lvalue receiving the code point, or U_SENTINEL when the byte at
 *           the starting offset is >=0x80 but is not a lead byte
 *
 * Bytes below 0x80 are returned unchanged. Lead bytes are decoded by
 * utf8_nextCharSafeBody() with a "strict" argument of -2 (presumably the
 * lenient mode described in the comment above -- confirm against utf8.h).
 * Every macro argument is parenthesized so that expressions such as
 * "p+offset" can be passed safely.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
47 
/*
 * L8_PREV(s, start, i, c): read one code point from lenient UTF-8 text,
 * moving backward.
 *
 *   s      byte string (any pointer expression; it is cast to const uint8_t *)
 *   start  lower bound passed to utf8_prevCharSafeBody()
 *   i      offset lvalue; decremented before the first byte is read
 *   c      lvalue receiving the code point, or U_SENTINEL when the byte just
 *          before the starting offset is above 0xbf
 *
 * Bytes below 0x80 are returned unchanged. Bytes in 0x80..0xbf are handed to
 * utf8_prevCharSafeBody() with a "strict" argument of -2 (presumably the
 * lenient mode described in the comment above -- confirm against utf8.h).
 * Every macro argument is parenthesized so that arbitrary expressions can be
 * passed safely.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
58 
59 /* -------------------------------------------------------------------------- */
60 
61 static void printUChars(const uint8_t *uchars, int16_t len);
62 
63 static void TestCodeUnitValues(void);
64 static void TestCharLength(void);
65 static void TestGetChar(void);
66 static void TestNextPrevChar(void);
67 static void TestNulTerminated(void);
68 static void TestNextPrevNonCharacters(void);
69 static void TestNextPrevCharUnsafe(void);
70 static void TestFwdBack(void);
71 static void TestFwdBackUnsafe(void);
72 static void TestSetChar(void);
73 static void TestSetCharUnsafe(void);
74 static void TestAppendChar(void);
75 static void TestAppend(void);
76 static void TestSurrogates(void);
77 
78 void addUTF8Test(TestNode** root);
79 
80 void
addUTF8Test(TestNode ** root)81 addUTF8Test(TestNode** root)
82 {
83     addTest(root, &TestCodeUnitValues,          "utf8tst/TestCodeUnitValues");
84     addTest(root, &TestCharLength,              "utf8tst/TestCharLength");
85     addTest(root, &TestGetChar,                 "utf8tst/TestGetChar");
86     addTest(root, &TestNextPrevChar,            "utf8tst/TestNextPrevChar");
87     addTest(root, &TestNulTerminated,           "utf8tst/TestNulTerminated");
88     addTest(root, &TestNextPrevNonCharacters,   "utf8tst/TestNextPrevNonCharacters");
89     addTest(root, &TestNextPrevCharUnsafe,      "utf8tst/TestNextPrevCharUnsafe");
90     addTest(root, &TestFwdBack,                 "utf8tst/TestFwdBack");
91     addTest(root, &TestFwdBackUnsafe,           "utf8tst/TestFwdBackUnsafe");
92     addTest(root, &TestSetChar,                 "utf8tst/TestSetChar");
93     addTest(root, &TestSetCharUnsafe,           "utf8tst/TestSetCharUnsafe");
94     addTest(root, &TestAppendChar,              "utf8tst/TestAppendChar");
95     addTest(root, &TestAppend,                  "utf8tst/TestAppend");
96     addTest(root, &TestSurrogates,              "utf8tst/TestSurrogates");
97 }
98 
TestCodeUnitValues()99 static void TestCodeUnitValues()
100 {
101     static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
102 
103     int16_t i;
104     for(i=0; i<LENGTHOF(codeunit); i++){
105         uint8_t c=codeunit[i];
106         log_verbose("Testing code unit value of %x\n", c);
107         if(i<4){
108             if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){
109                 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
110                     c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
111             }
112         } else if(i< 8){
113             if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){
114                 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
115                     c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
116             }
117         } else if(i< 12){
118             if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
119                 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
120                     c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
121             }
122         }
123     }
124 }
125 
TestCharLength()126 static void TestCharLength()
127 {
128     static const uint32_t codepoint[]={
129         1, 0x0061,
130         1, 0x007f,
131         2, 0x016f,
132         2, 0x07ff,
133         3, 0x0865,
134         3, 0x20ac,
135         4, 0x20402,
136         4, 0x23456,
137         4, 0x24506,
138         4, 0x20402,
139         4, 0x10402,
140         3, 0xd7ff,
141         3, 0xe000,
142 
143     };
144 
145     int16_t i;
146     UBool multiple;
147     for(i=0; i<LENGTHOF(codepoint); i=(int16_t)(i+2)){
148         UChar32 c=codepoint[i+1];
149         if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
150               log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
151         }else{
152               log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c));
153         }
154         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
155         if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
156               log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
157         }
158     }
159 }
160 
TestGetChar()161 static void TestGetChar()
162 {
163     static const uint8_t input[]={
164     /*  code unit,*/
165         0x61,
166         0x7f,
167         0xe4,
168         0xba,
169         0x8c,
170         0xF0,
171         0x90,
172         0x90,
173         0x81,
174         0xc0,
175         0x65,
176         0x31,
177         0x9a,
178         0xc9
179     };
180     static const UChar32 result[]={
181     /*  codepoint-unsafe, codepoint-safe(not strict)  codepoint-safe(strict) */
182         0x61,             0x61,                       0x61,
183         0x7f,             0x7f,                       0x7f,
184         0x4e8c,           0x4e8c,                     0x4e8c,
185         0x4e8c,           0x4e8c,                     0x4e8c ,
186         0x4e8c,           0x4e8c,                     0x4e8c,
187         0x10401,          0x10401,                    0x10401 ,
188         0x10401,          0x10401,                    0x10401 ,
189         0x10401,          0x10401,                    0x10401 ,
190         0x10401,          0x10401,                    0x10401,
191         0x25,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
192         0x65,             0x65,                       0x65,
193         0x31,             0x31,                       0x31,
194         0x31,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
195         0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
196     };
197     uint16_t i=0;
198     UChar32 c, expected;
199     uint32_t offset=0;
200 
201     for(offset=0; offset<sizeof(input); offset++) {
202         if (offset < sizeof(input) - 1) {
203             UTF8_GET_CHAR_UNSAFE(input, offset, c);
204             if(c != result[i]){
205                 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
206 
207             }
208 
209             U8_GET_UNSAFE(input, offset, c);
210             if(c != result[i]){
211                 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
212 
213             }
214         }
215 
216         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
217         expected=result[i+1];
218         if(c != expected){
219             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
220         }
221 
222         U8_GET(input, 0, offset, sizeof(input), c);
223         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
224         if(c != expected){
225             log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
226         }
227 
228         U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
229         if(expected<0) { expected=0xfffd; }
230         if(c != expected){
231             log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
232         }
233 
234         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
235         if(c != result[i+2]){
236             log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
237         }
238 
239         i=(uint16_t)(i+3);
240     }
241 }
242 
TestNextPrevChar()243 static void TestNextPrevChar() {
244     static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
245     static const UChar32 result[]={
246     /*  next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns        prev_safe_s */
247         0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
248         0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
249         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
250         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
251         0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
252         0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
253         0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
254         0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
255         0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
256         0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
257         0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
258         0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
259         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
260         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
261         0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
262         0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
263     };
264     static const int32_t movedOffset[]={
265     /*  next_unsafe   next_safe_ns next_safe_s       prev_unsafe   prev_safe_ns      prev_safe_s */
266         1,            1,           1,                15,           15,               15,
267         5,            5,           5,                14,           14 ,              14,
268         3,            3,           3,                9,            13,               13,
269         4,            4,           4,                9,            12,               12,
270         5,            5,           5,                9,            11,               11,
271         7,            7,           7,                10,           10,               10,
272         7,            7,           7,                9,            9,                9,
273         8,            9,           9,                7,            7,                7,
274         9,            9,           9,                7,            7,                7,
275         11,           10,          10,               5,            5,                5,
276         11,           11,          11,               5,            5,                5,
277         12,           12,          12,               1,            1,                1,
278         13,           13,          13,               1,            1,                1,
279         14,           14,          14,               1,            1,                1,
280         14,           15,          15,               1,            1,                1,
281         14,           16,          16,               0,            0,                0,
282     };
283     /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
284 
285     UChar32 c, expected;
286     uint32_t i=0;
287     uint32_t offset=0;
288     int32_t setOffset=0;
289     for(offset=0; offset<sizeof(input); offset++){
290          setOffset=offset;
291          UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
292          if(setOffset != movedOffset[i+1]){
293              log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
294                  offset, movedOffset[i+1], setOffset);
295          }
296         expected=result[i+1];
297         if(c != expected){
298             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
299         }
300 
301          setOffset=offset;
302          U8_NEXT(input, setOffset, sizeof(input), c);
303          if(setOffset != movedOffset[i+1]){
304              log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
305                  offset, movedOffset[i+1], setOffset);
306          }
307         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
308         if(c != expected){
309             log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
310         }
311 
312         setOffset=offset;
313         U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
314         if(setOffset != movedOffset[i+1]){
315             log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
316                 offset, movedOffset[i+1], setOffset);
317         }
318         if(expected<0) { expected=0xfffd; }
319         if(c != expected){
320             log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
321         }
322 
323          setOffset=offset;
324          UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
325          if(setOffset != movedOffset[i+1]){
326              log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
327                  offset, movedOffset[i+2], setOffset);
328          }
329          if(c != result[i+2]){
330              log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
331          }
332 
333          i=i+6;
334     }
335 
336     i=0;
337     for(offset=sizeof(input); offset > 0; --offset){
338          setOffset=offset;
339          UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
340          if(setOffset != movedOffset[i+4]){
341              log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
342                  offset, movedOffset[i+4], setOffset);
343          }
344         expected=result[i+4];
345         if(c != expected){
346             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
347         }
348 
349          setOffset=offset;
350          U8_PREV(input, 0, setOffset, c);
351          if(setOffset != movedOffset[i+4]){
352              log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
353                  offset, movedOffset[i+4], setOffset);
354          }
355         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
356         if(c != expected){
357             log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
358         }
359 
360         setOffset=offset;
361         U8_PREV_OR_FFFD(input, 0, setOffset, c);
362         if(setOffset != movedOffset[i+4]){
363             log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
364                 offset, movedOffset[i+4], setOffset);
365         }
366         if(expected<0) { expected=0xfffd; }
367         if(c != expected){
368             log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
369         }
370 
371          setOffset=offset;
372          UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
373          if(setOffset != movedOffset[i+5]){
374              log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
375                  offset, movedOffset[i+5], setOffset);
376          }
377          if(c != result[i+5]){
378              log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
379          }
380 
381          i=i+6;
382     }
383 }
384 
385 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()386 static void TestNulTerminated() {
387     static const uint8_t input[]={
388         /*  0 */  0x61,
389         /*  1 */  0xf0, 0x90, 0x90, 0x81,
390         /*  5 */  0xc0, 0x80,
391         /*  7 */  0xdf, 0x80,
392         /*  9 */  0xc2,
393         /* 10 */  0x62,
394         /* 11 */  0xfd, 0xbe,
395         /* 13 */  0xe0, 0xa0, 0x80,
396         /* 16 */  0xe2, 0x82, 0xac,
397         /* 19 */  0xf0, 0x90, 0x90,
398         /* 22 */  0x00
399         /* 23 */
400     };
401     static const UChar32 result[]={
402         0x61,
403         0x10401,
404         U_SENTINEL,
405         0x7c0,
406         U_SENTINEL,
407         0x62,
408         U_SENTINEL,
409         0x800,
410         0x20ac,
411         U_SENTINEL,
412         0
413     };
414 
415     UChar32 c, c2, expected;
416     int32_t i0, i=0, j, k, expectedIndex;
417     int32_t cpIndex=0;
418     do {
419         i0=i;
420         U8_NEXT(input, i, -1, c);
421         expected=result[cpIndex];
422         if(c!=expected) {
423             log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
424         }
425         j=i0;
426         U8_NEXT_OR_FFFD(input, j, -1, c);
427         if(expected<0) { expected=0xfffd; }
428         if(c!=expected) {
429             log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
430         }
431         if(j!=i) {
432             log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
433         }
434         j=i0;
435         U8_FWD_1(input, j, -1);
436         if(j!=i) {
437             log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
438         }
439         ++cpIndex;
440         /*
441          * Move by this many code points from the start.
442          * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
443          */
444         expectedIndex= (c==0) ? i-1 : i;
445         k=0;
446         U8_FWD_N(input, k, -1, cpIndex);
447         if(k!=expectedIndex) {
448             log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
449         }
450     } while(c!=0);
451 
452     i=0;
453     do {
454         j=i0=i;
455         U8_NEXT(input, i, -1, c);
456         do {
457             U8_GET(input, 0, j, -1, c2);
458             if(c2!=c) {
459                 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
460             }
461             U8_GET_OR_FFFD(input, 0, j, -1, c2);
462             expected= (c>=0) ? c : 0xfffd;
463             if(c2!=expected) {
464                 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
465             }
466             /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
467             k=j+1;
468             U8_SET_CP_LIMIT(input, 0, k, -1);
469             if(k!=i) {
470                 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
471             }
472         } while(++j<i);
473     } while(c!=0);
474 }
475 
TestNextPrevNonCharacters()476 static void TestNextPrevNonCharacters() {
477     /* test non-characters */
478     static const uint8_t nonChars[]={
479         0xef, 0xb7, 0x90,       /* U+fdd0 */
480         0xef, 0xbf, 0xbf,       /* U+feff */
481         0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
482         0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
483         0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
484     };
485 
486     UChar32 ch;
487     int32_t idx;
488 
489     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
490         U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
491         if(!U_IS_UNICODE_NONCHAR(ch)) {
492             log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
493         }
494     }
495     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
496         U8_PREV(nonChars, 0, idx, ch);
497         if(!U_IS_UNICODE_NONCHAR(ch)) {
498             log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
499         }
500     }
501 }
502 
TestNextPrevCharUnsafe()503 static void TestNextPrevCharUnsafe() {
504     /*
505      * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
506      * The behavior of _UNSAFE macros for ill-formed strings is undefined.
507      */
508     static const uint8_t input[]={
509         0x61,
510         0xf0, 0x90, 0x90, 0x81,
511         0xc0, 0x80,  /* non-shortest form */
512         0xe2, 0x82, 0xac,
513         0xc2, 0xa1,
514         0xf4, 0x8f, 0xbf, 0xbf,
515         0x00
516     };
517     static const UChar32 codePoints[]={
518         0x61,
519         0x10401,
520         0,
521         0x20ac,
522         0xa1,
523         0x10ffff,
524         0
525     };
526 
527     UChar32 c;
528     int32_t i;
529     uint32_t offset;
530     for(i=0, offset=0; offset<sizeof(input); ++i) {
531         UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
532         if(c != codePoints[i]){
533             log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
534                     offset, codePoints[i], c);
535         }
536     }
537     for(i=0, offset=0; offset<sizeof(input); ++i) {
538         U8_NEXT_UNSAFE(input, offset, c);
539         if(c != codePoints[i]){
540             log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
541                     offset, codePoints[i], c);
542         }
543     }
544 
545     for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
546          UTF8_PREV_CHAR_UNSAFE(input, offset, c);
547          if(c != codePoints[i]){
548              log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
549                      offset, codePoints[i], c);
550          }
551     }
552     for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
553          U8_PREV_UNSAFE(input, offset, c);
554          if(c != codePoints[i]){
555              log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
556                      offset, codePoints[i], c);
557          }
558     }
559 }
560 
/*
 * Checks the bounds-checked "skip" macros: forward/backward by one code
 * point (UTF8_FWD_1_SAFE, U8_FWD_1, UTF8_BACK_1_SAFE, U8_BACK_1) and by N
 * code points (UTF8_FWD_N_SAFE, U8_FWD_N, UTF8_BACK_N_SAFE, U8_BACK_N). After
 * every step the offset is compared with the expected offset table.
 *
 * Fixes relative to the previous revision:
 *  - the offset is reset to 0 before the U8_FWD_1 loop; it used to be left
 *    at sizeof(input) by the preceding loop, so the U8_FWD_1 loop body never
 *    executed;
 *  - log arguments are cast to the types their conversions expect
 *    (%d -> int, %ld -> long).
 */
static void TestFwdBack() {
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    /* Expected offsets after each single step forward / backward. */
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    /* Step counts and the expected offsets after each cumulative N-step move. */
    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", (int)fwd_safe[i], (int)offsafe);
        }
        i++;
    }

    i=0;
    offsafe=0;  /* restart from the beginning so that U8_FWD_1 is really exercised */
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", (int)fwd_safe[i], (int)offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", (int)back_safe[i], (int)offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", (int)back_safe[i], (int)offsafe);
        }
        i++;
    }

    /* N-step moves are cumulative: the offset is not reset between steps. */
    offsafe=0;
    for(i=0; i<LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", (int)i, (int)fwd_N_safe[i], (int)offsafe);
        }

    }

    offsafe=0;
    for(i=0; i<LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", (int)i, (int)fwd_N_safe[i], (int)offsafe);
        }

    }

    offsafe=sizeof(input);
    for(i=0; i<LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", (int)i, (int)back_N_safe[i], (long)offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", (int)i, (int)back_N_safe[i], (long)offsafe);
        }
    }
}
644 
/*
 * Checks the unchecked "skip" macros (the _UNSAFE variants of FWD_1, BACK_1,
 * FWD_N and BACK_N, in both the UTF8_ and U8_ families). Every resulting
 * offset must equal the matching entry of boundaries[], the table of code
 * point boundaries of input[].
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    const int32_t inputLength=LENGTHOF(input);
    const int32_t boundaryCount=LENGTHOF(boundaries);
    int32_t pos;
    int32_t n;

    /* Single steps forward: after step n the offset must be boundaries[n]. */
    n=1;
    pos=0;
    while(pos<inputLength) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[n]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
        ++n;
    }
    n=1;
    pos=0;
    while(pos<inputLength) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[n]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
        ++n;
    }

    /* Single steps backward, starting at the end of input[]. */
    n=boundaryCount-2;
    pos=inputLength;
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[n]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
        --n;
    }
    n=boundaryCount-2;
    pos=inputLength;
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != boundaries[n]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
        --n;
    }

    /* n code points forward from the start must land on boundaries[n]. */
    for(n=0; n<boundaryCount; ++n) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, n);
        if(pos != boundaries[n]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
    }
    for(n=0; n<boundaryCount; ++n) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, n);
        if(pos != boundaries[n]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
    }

    /* n code points backward from the end must land on the n-th boundary from the end. */
    for(n=0; n<boundaryCount; ++n) {
        const int32_t fromEnd=boundaryCount-1-n;
        pos=inputLength;
        UTF8_BACK_N_UNSAFE(input, pos, n);
        if(pos != boundaries[fromEnd]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[fromEnd], pos);
        }
    }
    for(n=0; n<boundaryCount; ++n) {
        const int32_t fromEnd=boundaryCount-1-n;
        pos=inputLength;
        U8_BACK_N_UNSAFE(input, pos, n);
        if(pos != boundaries[fromEnd]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[fromEnd], pos);
        }
    }
}
721 
/*
 * Checks the bounds-checked boundary-adjustment macros at every offset of
 * input[]: UTF8_SET_CHAR_START_SAFE / U8_SET_CP_START against start_safe[]
 * and UTF8_SET_CHAR_LIMIT_SAFE / U8_SET_CP_LIMIT against limit_safe[].
 * The "start" macros are skipped for offset == length; the "limit" macros
 * are also tested there.
 *
 * The log arguments are cast to long to match the %ld conversions; the
 * underlying values are int32_t and int16_t.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    uint32_t i=0;   /* index into the expectation tables; tracks offset */
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=LENGTHOF(input); offset++){
        if (offset<LENGTHOF(input)){
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
        }

        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        i++;
    }
}
762 
/*
 * Checks the "unsafe" code point boundary adjustment macros.
 *
 * For every offset into input[], the start macros
 * UTF8_SET_CHAR_START_UNSAFE and U8_SET_CP_START_UNSAFE must leave the
 * offset at start_unsafe[offset]. For every offset from 1 up to and
 * including LENGTHOF(input), the limit macros UTF8_SET_CHAR_LIMIT_UNSAFE and
 * U8_SET_CP_LIMIT_UNSAFE must leave it at limit_unsafe[offset].
 *
 * Mismatches are reported through log_err().
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    int32_t offset=0, setOffset=0;

    for(offset=0; offset<=LENGTHOF(input); offset++){
        /* The start macros read input[offset], so skip the one-past-the-end offset. */
        if (offset<LENGTHOF(input)){
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                /* %ld requires long arguments, hence the explicit casts. */
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
        }

        /*
         * Offset 0 is skipped for the limit macros: with no bounds checking
         * they must not be allowed to step outside the array.
         */
        if (offset != 0) {
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }

            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
        }
    }
}
805 
/*
 * Checks UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE.
 *
 * test[] holds (append position, code point) pairs: the first
 * unsafeCaseCount pairs are run through the unsafe macro, the remaining
 * pairs through the safe macro. For each pair a fresh copy of s[] is made,
 * the code point is written at the given position, and then
 *   - the updated offset is compared with movedOffset[count], and
 *   - the first LENGTHOF(s) bytes of the buffer are compared with
 *     result[count].
 *
 * Mismatches are reported through log_err().
 */
static void TestAppendChar(){
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
    /*  append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    /*  append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    static const uint16_t movedOffset[]={
    /* offset-moved-to(unsafe) */
          4,              /*for append-pos: 0 , CHAR 0x10401*/
          3,
          3,
          6,
          5,
          12,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    /* offset-moved-to(safe) */
          4,              /*for append-pos: 0, CHAR  0x10401*/
          3,
          4,
          6,
          5,
          11,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /* Number of leading (position, code point) pairs in test[] that use the unsafe macro. */
    static const uint16_t unsafeCaseCount=13;

    uint16_t i, count=0;
    /*
     * One byte larger than s[]: the unsafe case that appends at position 8
     * moves the offset to 12, i.e. it writes past the 11 bytes of s[].
     */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=LENGTHOF(s);

    /* i walks test[] pair by pair; count is the matching row of movedOffset[] and result[]. */
    for(i=0; i<LENGTHOF(test); i=(uint16_t)(i+2)){
        uprv_memcpy(str, s, size);
        offset=test[i];
        if(count<unsafeCaseCount){
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
            if(offset != movedOffset[count]){
                /* offset is unsigned; cast so the argument matches %d. */
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:      ");
                printUChars(str, size);
                log_err("\n");
            }
        }else{
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    count, movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot:     ");
                printUChars(str, size);
                log_err("\n");
            }
        }
        count++;
    }
}
974 
TestAppend()975 static void TestAppend() {
976     static const UChar32 codePoints[]={
977         0x61, 0xdf, 0x901, 0x3040,
978         0xac00, 0xd800, 0xdbff, 0xdcde,
979         0xdffd, 0xe000, 0xffff, 0x10000,
980         0x12345, 0xe0021, 0x10ffff, 0x110000,
981         0x234567, 0x7fffffff, -1, -1000,
982         0, 0x400
983     };
984     static const uint8_t expectUnsafe[]={
985         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
986         0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
987         0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
988         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
989         /* none from this line */
990         0,  0xd0, 0x80
991     }, expectSafe[]={
992         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
993         0xea, 0xb0, 0x80,  /* no surrogates */
994         /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
995         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
996         /* none from this line */
997         0,  0xd0, 0x80
998     };
999 
1000     uint8_t buffer[100];
1001     UChar32 c;
1002     int32_t i, length;
1003     UBool isError, expectIsError, wrongIsError;
1004 
1005     length=0;
1006     for(i=0; i<LENGTHOF(codePoints); ++i) {
1007         c=codePoints[i];
1008         if(c<0 || 0x10ffff<c) {
1009             continue; /* skip non-code points for U8_APPEND_UNSAFE */
1010         }
1011 
1012         U8_APPEND_UNSAFE(buffer, length, c);
1013     }
1014     if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1015         log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1016     }
1017 
1018     length=0;
1019     wrongIsError=FALSE;
1020     for(i=0; i<LENGTHOF(codePoints); ++i) {
1021         c=codePoints[i];
1022         expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1023         isError=FALSE;
1024 
1025         U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
1026         wrongIsError|= isError!=expectIsError;
1027     }
1028     if(wrongIsError) {
1029         log_err("U8_APPEND did not set isError correctly\n");
1030     }
1031     if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1032         log_err("U8_APPEND did not generate the expected output\n");
1033     }
1034 }
1035 
1036 static void
TestSurrogates()1037 TestSurrogates() {
1038     static const uint8_t b[]={
1039         0xc3, 0x9f,             /*  00DF */
1040         0xed, 0x9f, 0xbf,       /*  D7FF */
1041         0xed, 0xa0, 0x81,       /*  D801 */
1042         0xed, 0xbf, 0xbe,       /*  DFFE */
1043         0xee, 0x80, 0x80,       /*  E000 */
1044         0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
1045     };
1046     static const UChar32 cp[]={
1047         0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1048     };
1049 
1050     UChar32 cu, cs, cl;
1051     int32_t i, j, k, iu, is, il, length;
1052 
1053     k=0; /* index into cp[] */
1054     length=LENGTHOF(b);
1055     for(i=0; i<length;) {
1056         j=i;
1057         U8_NEXT_UNSAFE(b, j, cu);
1058         iu=j;
1059 
1060         j=i;
1061         U8_NEXT(b, j, length, cs);
1062         is=j;
1063 
1064         j=i;
1065         L8_NEXT(b, j, length, cl);
1066         il=j;
1067 
1068         if(cu!=cp[k]) {
1069             log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1070         }
1071 
1072         /* U8_NEXT() returns <0 for surrogate code points */
1073         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1074             log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1075         }
1076 
1077         /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1078         if(cl!=cu) {
1079             log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1080         }
1081 
1082         if(is!=iu || il!=iu) {
1083             log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1084         }
1085 
1086         ++k;    /* next code point */
1087         i=iu;   /* advance by one UTF-8 sequence */
1088     }
1089 
1090     while(i>0) {
1091         --k; /* previous code point */
1092 
1093         j=i;
1094         U8_PREV_UNSAFE(b, j, cu);
1095         iu=j;
1096 
1097         j=i;
1098         U8_PREV(b, 0, j, cs);
1099         is=j;
1100 
1101         j=i;
1102         L8_PREV(b, 0, j, cl);
1103         il=j;
1104 
1105         if(cu!=cp[k]) {
1106             log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1107         }
1108 
1109         /* U8_PREV() returns <0 for surrogate code points */
1110         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1111             log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1112         }
1113 
1114         /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1115         if(cl!=cu) {
1116             log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1117         }
1118 
1119         if(is!=iu || il !=iu) {
1120             log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1121         }
1122 
1123         i=iu;   /* go back by one UTF-8 sequence */
1124     }
1125 }
1126 
/*
 * Writes len bytes starting at uchars through log_err(), each formatted as
 * "0x%02x " (two hex digits followed by a space). Nothing is written when
 * len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    const uint8_t *next=uchars;
    int16_t remaining=len;

    while(remaining>0){
        log_err("0x%02x ", *next);
        ++next;
        --remaining;
    }
}
1133