• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1998-2006, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /*
7 * File test.c
8 *
9 * Modification History:
10 *
11 *   Date          Name        Description
12 *   07/24/2000    Madhu       Creation
13 *******************************************************************************
14 */
15 
16 #include "unicode/utypes.h"
17 #include "unicode/utf8.h"
18 #include "cmemory.h"
19 #include "cintltst.h"
20 
/* Element count of a real array object: total size divided by the size of one element. */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))
22 
23 /* lenient UTF-8 ------------------------------------------------------------ */
24 
25 /*
26  * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
27  * code points with their "natural" encoding.
28  * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
29  * single surrogates.
30  *
31  * This is not conformant with UTF-8.
32  *
33  * Supplementary code points may be encoded as pairs of 3-byte sequences, but
34  * the macros below do not attempt to assemble such pairs.
35  */
36 
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 *
 * Reads the byte at s[i] into c and post-increments i.
 *  - A byte below 0x80 is returned in c unchanged.
 *  - A byte that U8_IS_LEAD() accepts is handed to utf8_nextCharSafeBody()
 *    together with &i and length; c receives that function's result and i
 *    is left wherever the function puts it. The final argument -2 is the
 *    mode value this file passes for lenient decoding.
 *  - Any other byte (0x80 or above but not a lead byte) sets c to U_SENTINEL.
 *
 * s, i and c are evaluated more than once; i and c must be lvalues.
 * Every macro argument is parenthesized so that an expression argument such
 * as "p+1" is cast and indexed as a whole.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
47 
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 *
 * Pre-decrements i and reads the byte at s[i] into c.
 *  - A byte below 0x80 is returned in c unchanged.
 *  - A byte in 0x80..0xbf is handed to utf8_prevCharSafeBody() together with
 *    start and &i; c receives that function's result and i is left wherever
 *    the function puts it. The final argument -2 is the mode value this file
 *    passes for lenient decoding.
 *  - A byte above 0xbf sets c to U_SENTINEL.
 *
 * s, i and c are evaluated more than once; i and c must be lvalues.
 * Every macro argument is parenthesized so that an expression argument is
 * cast and indexed as a whole.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
58 
59 /* -------------------------------------------------------------------------- */
60 
61 static void printUChars(const uint8_t *uchars, int16_t len);
62 
63 static void TestCodeUnitValues(void);
64 static void TestCharLength(void);
65 static void TestGetChar(void);
66 static void TestNextPrevChar(void);
67 static void TestFwdBack(void);
68 static void TestSetChar(void);
69 static void TestAppendChar(void);
70 static void TestAppend(void);
71 static void TestSurrogates(void);
72 
73 void addUTF8Test(TestNode** root);
74 
75 void
addUTF8Test(TestNode ** root)76 addUTF8Test(TestNode** root)
77 {
78   addTest(root, &TestCodeUnitValues,    "utf8tst/TestCodeUnitValues");
79   addTest(root, &TestCharLength,        "utf8tst/TestCharLength"    );
80   addTest(root, &TestGetChar,           "utf8tst/TestGetChar"       );
81   addTest(root, &TestNextPrevChar,      "utf8tst/TestNextPrevChar"  );
82   addTest(root, &TestFwdBack,           "utf8tst/TestFwdBack"       );
83   addTest(root, &TestSetChar,           "utf8tst/TestSetChar"       );
84   addTest(root, &TestAppendChar,        "utf8tst/TestAppendChar"    );
85   addTest(root, &TestAppend,            "utf8tst/TestAppend"        );
86   addTest(root, &TestSurrogates,        "utf8tst/TestSurrogates"    );
87 }
88 
TestCodeUnitValues()89 static void TestCodeUnitValues()
90 {
91     static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
92 
93     int16_t i;
94     for(i=0; i<sizeof(codeunit)/sizeof(codeunit[0]); i++){
95         uint8_t c=codeunit[i];
96         log_verbose("Testing code unit value of %x\n", c);
97         if(i<4){
98             if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){
99                 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
100                     c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
101             }
102         } else if(i< 8){
103             if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){
104                 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
105                     c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
106             }
107         } else if(i< 12){
108             if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
109                 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
110                     c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
111             }
112         }
113     }
114 }
115 
TestCharLength()116 static void TestCharLength()
117 {
118     static const uint32_t codepoint[]={
119         1, 0x0061,
120         1, 0x007f,
121         2, 0x016f,
122         2, 0x07ff,
123         3, 0x0865,
124         3, 0x20ac,
125         4, 0x20402,
126         4, 0x23456,
127         4, 0x24506,
128         4, 0x20402,
129         4, 0x10402,
130         3, 0xd7ff,
131         3, 0xe000,
132 
133     };
134 
135     int16_t i;
136     UBool multiple;
137     for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
138         UChar32 c=codepoint[i+1];
139         if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
140               log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
141         }else{
142               log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c) );
143         }
144         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
145         if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
146               log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
147         }
148     }
149 }
150 
TestGetChar()151 static void TestGetChar()
152 {
153     static const uint8_t input[]={
154     /*  code unit,*/
155         0x61,
156         0x7f,
157         0xe4,
158         0xba,
159         0x8c,
160         0xF0,
161         0x90,
162         0x90,
163         0x81,
164         0xc0,
165         0x65,
166         0x31,
167         0x9a,
168         0xc9
169     };
170     static const UChar32 result[]={
171      /*codepoint-unsafe,  codepoint-safe(not strict)  codepoint-safe(strict)*/
172         0x61,             0x61,                       0x61,
173         0x7f,             0x7f,                       0x7f,
174         0x4e8c,           0x4e8c,                     0x4e8c,
175         0x4e8c,           0x4e8c,                     0x4e8c ,
176         0x4e8c,           0x4e8c,                     0x4e8c,
177         0x10401,          0x10401,                    0x10401 ,
178         0x10401,          0x10401,                    0x10401 ,
179         0x10401,          0x10401,                    0x10401 ,
180         0x10401,          0x10401,                    0x10401,
181         0x25,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
182         0x65,             0x65,                       0x65,
183         0x31,             0x31,                       0x31,
184         0x31,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
185         0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
186     };
187     uint16_t i=0;
188     UChar32 c;
189     uint32_t offset=0;
190 
191     for(offset=0; offset<sizeof(input); offset++) {
192         if (offset < sizeof(input) - 1) {
193             UTF8_GET_CHAR_UNSAFE(input, offset, c);
194             if(c != result[i]){
195                 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
196 
197             }
198 
199             U8_GET_UNSAFE(input, offset, c);
200             if(c != result[i]){
201                 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
202 
203             }
204         }
205 
206         U8_GET(input, 0, offset, sizeof(input), c);
207         if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
208             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
209         }
210 
211         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
212         if(c != result[i+1]){
213             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
214         }
215 
216         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
217         if(c != result[i+2]){
218             log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
219         }
220 
221          i=(uint16_t)(i+3);
222     }
223 }
224 
TestNextPrevChar()225 static void TestNextPrevChar(){
226     static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
227     static const UChar32 result[]={
228     /*next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns         prev_safe_s*/
229         0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
230         0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
231         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
232         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
233         0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
234         0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
235         0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
236         0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
237         0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
238         0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
239         0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
240         0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
241         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
242         0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
243         0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
244         0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
245     };
246     static const int32_t movedOffset[]={
247    /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
248         1,            1,           1,                15,           15,               15,
249         5,            5,           5,                14,           14 ,              14,
250         3,            3,           3,                9,            13,               13,
251         4,            4,           4,                9,            12,               12,
252         5,            5,           5,                9,            11,               11,
253         7,            7,           7,                10,           10,               10,
254         7,            7,           7,                9,            9,                9,
255         8,            9,           9,                7,            7,                7,
256         9,            9,           9,                7,            7,                7,
257         11,           10,          10,               5,            5,                5,
258         11,           11,          11,               5,            5,                5,
259         12,           12,          12,               1,            1,                1,
260         13,           13,          13,               1,            1,                1,
261         14,           14,          14,               1,            1,                1,
262         14,           15,          15,               1,            1,                1,
263         14,           16,          16,               0,            0,                0,
264 
265 
266     };
267 
268 
269     UChar32 c=0x0000;
270     uint32_t i=0;
271     uint32_t offset=0;
272     int32_t setOffset=0;
273     for(offset=0; offset<sizeof(input); offset++){
274          if (offset < sizeof(input) - 2) { /* Can't have it go off the end of the array based on input */
275              setOffset=offset;
276              UTF8_NEXT_CHAR_UNSAFE(input, setOffset, c);
277              if(setOffset != movedOffset[i]){
278                  log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
279                      offset, movedOffset[i], setOffset);
280              }
281              if(c != result[i]){
282                  log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
283              }
284 
285              setOffset=offset;
286              U8_NEXT_UNSAFE(input, setOffset, c);
287              if(setOffset != movedOffset[i]){
288                  log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
289                      offset, movedOffset[i], setOffset);
290              }
291              if(c != result[i]){
292                  log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
293              }
294          }
295 
296          setOffset=offset;
297          UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
298          if(setOffset != movedOffset[i+1]){
299              log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
300                  offset, movedOffset[i+1], setOffset);
301          }
302          if(c != result[i+1]){
303              log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
304          }
305 
306          setOffset=offset;
307          U8_NEXT(input, setOffset, sizeof(input), c);
308          if(setOffset != movedOffset[i+1]){
309              log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
310                  offset, movedOffset[i+1], setOffset);
311          }
312          if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
313              log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
314          }
315 
316          setOffset=offset;
317          UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
318          if(setOffset != movedOffset[i+1]){
319              log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
320                  offset, movedOffset[i+2], setOffset);
321          }
322          if(c != result[i+2]){
323              log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
324          }
325 
326          i=i+6;
327     }
328 
329     i=0;
330     for(offset=sizeof(input); offset > 0; --offset){
331          setOffset=offset;
332          UTF8_PREV_CHAR_UNSAFE(input, setOffset, c);
333          if(setOffset != movedOffset[i+3]){
334              log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
335                  offset, movedOffset[i+3], setOffset);
336          }
337          if(c != result[i+3]){
338              log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
339          }
340 
341          setOffset=offset;
342          UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
343          if(setOffset != movedOffset[i+4]){
344              log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
345                  offset, movedOffset[i+4], setOffset);
346          }
347          if(c != result[i+4]){
348              log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
349          }
350 
351          setOffset=offset;
352          U8_PREV(input, 0, setOffset, c);
353          if(setOffset != movedOffset[i+4]){
354              log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
355                  offset, movedOffset[i+4], setOffset);
356          }
357          if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
358              log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
359          }
360 
361          setOffset=offset;
362          UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
363          if(setOffset != movedOffset[i+5]){
364              log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
365                  offset, movedOffset[i+5], setOffset);
366          }
367          if(c != result[i+5]){
368              log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
369          }
370 
371          i=i+6;
372     }
373 
374     {
375         /* test non-characters */
376         static const uint8_t nonChars[]={
377             0xef, 0xb7, 0x90,       /* U+fdd0 */
378             0xef, 0xbf, 0xbf,       /* U+feff */
379             0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
380             0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
381             0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
382         };
383 
384         UChar32 ch;
385         int32_t idx;
386 
387         for(idx=0; idx<(int32_t)sizeof(nonChars);) {
388             U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
389             if(!U_IS_UNICODE_NONCHAR(ch)) {
390                 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
391             }
392         }
393         for(idx=(int32_t)sizeof(nonChars); idx>0;) {
394             U8_PREV(nonChars, 0, idx, ch);
395             if(!U_IS_UNICODE_NONCHAR(ch)) {
396                 log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
397             }
398         }
399     }
400 }
401 
TestFwdBack()402 static void TestFwdBack(){
403     static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
404     static const uint16_t fwd_unsafe[] ={1, 5, 6, 7,  9, 10, 11, 13, 14, 15, 16,  20, };
405     static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
406     static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
407     static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
408 
409     static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
410     static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
411     static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
412     static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
413     static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};
414 
415 
416     uint32_t offunsafe=0, offsafe=0;
417 
418     uint32_t i=0;
419     while(offunsafe < sizeof(input)){
420         UTF8_FWD_1_UNSAFE(input, offunsafe);
421         if(offunsafe != fwd_unsafe[i]){
422             log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
423         }
424         i++;
425     }
426 
427     i=0;
428     while(offunsafe < sizeof(input)){
429         U8_FWD_1_UNSAFE(input, offunsafe);
430         if(offunsafe != fwd_unsafe[i]){
431             log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
432         }
433         i++;
434     }
435 
436     i=0;
437     while(offsafe < sizeof(input)){
438         UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
439         if(offsafe != fwd_safe[i]){
440             log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
441         }
442         i++;
443     }
444 
445     i=0;
446     while(offsafe < sizeof(input)){
447         U8_FWD_1(input, offsafe, sizeof(input));
448         if(offsafe != fwd_safe[i]){
449             log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
450         }
451         i++;
452     }
453 
454     offunsafe=sizeof(input);
455     i=0;
456     while(offunsafe > 0){
457         UTF8_BACK_1_UNSAFE(input, offunsafe);
458         if(offunsafe != back_unsafe[i]){
459             log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
460         }
461         i++;
462     }
463 
464     offunsafe=sizeof(input);
465     i=0;
466     while(offunsafe > 0){
467         U8_BACK_1_UNSAFE(input, offunsafe);
468         if(offunsafe != back_unsafe[i]){
469             log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
470         }
471         i++;
472     }
473 
474     i=0;
475     offsafe=sizeof(input);
476     while(offsafe > 0){
477         UTF8_BACK_1_SAFE(input, 0,  offsafe);
478         if(offsafe != back_safe[i]){
479             log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
480         }
481         i++;
482     }
483 
484     i=0;
485     offsafe=sizeof(input);
486     while(offsafe > 0){
487         U8_BACK_1(input, 0,  offsafe);
488         if(offsafe != back_safe[i]){
489             log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
490         }
491         i++;
492     }
493 
494     offunsafe=0;
495     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
496         UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
497         if(offunsafe != fwd_N_unsafe[i]){
498             log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
499         }
500     }
501 
502     offunsafe=0;
503     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
504         U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
505         if(offunsafe != fwd_N_unsafe[i]){
506             log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
507         }
508     }
509 
510     offsafe=0;
511     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
512         UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
513         if(offsafe != fwd_N_safe[i]){
514             log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
515         }
516 
517     }
518 
519     offsafe=0;
520     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
521         U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
522         if(offsafe != fwd_N_safe[i]){
523             log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
524         }
525 
526     }
527 
528     offunsafe=sizeof(input);
529     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
530         UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
531         if(offunsafe != back_N_unsafe[i]){
532             log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
533         }
534     }
535 
536     offunsafe=sizeof(input);
537     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
538         U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
539         if(offunsafe != back_N_unsafe[i]){
540             log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
541         }
542     }
543 
544     offsafe=sizeof(input);
545     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
546         UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
547         if(offsafe != back_N_safe[i]){
548             log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
549         }
550     }
551 
552     offsafe=sizeof(input);
553     for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
554         U8_BACK_N(input, 0, offsafe, Nvalue[i]);
555         if(offsafe != back_N_safe[i]){
556             log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
557         }
558     }
559 }
560 
TestSetChar()561 static void TestSetChar(){
562     static const uint8_t input[]
563         = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
564     static const int16_t start_unsafe[]
565         = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   13 };
566     static const int16_t start_safe[]
567         = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13 };
568     static const int16_t limit_unsafe[]
569         = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15 };
570     static const int16_t limit_safe[]
571         = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13 };
572 
573     uint32_t i=0;
574     int32_t offset=0, setOffset=0;
575     for(offset=0; offset<(int32_t)sizeof(input); offset++){
576          setOffset=offset;
577          UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
578          if(setOffset != start_unsafe[i]){
579              log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
580          }
581 
582          setOffset=offset;
583          U8_SET_CP_START_UNSAFE(input, setOffset);
584          if(setOffset != start_unsafe[i]){
585              log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
586          }
587 
588          setOffset=offset;
589          UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
590          if(setOffset != start_safe[i]){
591              log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
592          }
593 
594          setOffset=offset;
595          U8_SET_CP_START(input, 0, setOffset);
596          if(setOffset != start_safe[i]){
597              log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
598          }
599 
600          if (offset != 0) { /* Can't have it go off the end of the array */
601              setOffset=offset;
602              UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
603              if(setOffset != limit_unsafe[i]){
604                  log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
605              }
606 
607              setOffset=offset;
608              U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
609              if(setOffset != limit_unsafe[i]){
610                  log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
611              }
612          }
613 
614          setOffset=offset;
615          UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
616          if(setOffset != limit_safe[i]){
617              log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
618          }
619 
620          setOffset=offset;
621          U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
622          if(setOffset != limit_safe[i]){
623              log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
624          }
625 
626          i++;
627     }
628 }
629 
TestAppendChar()630 static void TestAppendChar(){
631     static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
632     static const uint32_t test[]={
633      /*append-position(unsafe),  CHAR to be appended  */
634         0,                        0x10401,
635         2,                        0x0028,
636         2,                        0x007f,
637         3,                        0xd801,
638         1,                        0x20402,
639         8,                        0x10401,
640         5,                        0xc0,
641         5,                        0xc1,
642         5,                        0xfd,
643         6,                        0x80,
644         6,                        0x81,
645         6,                        0xbf,
646         7,                        0xfe,
647 
648     /*append-position(safe),     CHAR to be appended */
649         0,                        0x10401,
650         2,                        0x0028,
651         3,                        0x7f,
652         3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
653         1,                        0x20402,
654         9,                        0x10401,
655         5,                        0xc0,
656         5,                        0xc1,
657         5,                        0xfd,
658         6,                        0x80,
659         6,                        0x81,
660         6,                        0xbf,
661         7,                        0xfe,
662 
663     };
664     static const uint16_t movedOffset[]={
665         /*offset-moved-to(unsafe)*/
666           4,              /*for append-pos: 0 , CHAR 0x10401*/
667           3,
668           3,
669           6,
670           5,
671           12,
672           7,
673           7,
674           7,
675           8,
676           8,
677           8,
678           9,
679 
680           /*offset-moved-to(safe)*/
681           4,              /*for append-pos: 0, CHAR  0x10401*/
682           3,
683           4,
684           6,
685           5,
686           11,
687           7,
688           7,
689           7,
690           8,
691           8,
692           8,
693           9,
694 
695     };
696 
697     static const uint8_t result[][11]={
698         /*unsafe*/
699         {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
700         {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
701         {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
702         {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
703         {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
704         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},
705 
706         {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
707         {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
708         {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
709 
710         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
711         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
712         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
713 
714         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
715         /*safe*/
716         {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
717         {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
718         {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
719         {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
720         {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
721         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc0, 0x9f*/
722 
723         {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
724         {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
725         {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
726 
727         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
728         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
729         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
730 
731         {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
732 
733     };
734     uint16_t i, count=0;
735     uint8_t str[12];
736     uint32_t offset;
737 /*    UChar32 c=0;*/
738     uint16_t size=sizeof(s)/sizeof(s[0]);
739     for(i=0; i<sizeof(test)/sizeof(test[0]); i=(uint16_t)(i+2)){
740         uprv_memcpy(str, s, size);
741         offset=test[i];
742         if(count<13){
743             UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
744             if(offset != movedOffset[count]){
745                 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
746                     count, movedOffset[count], offset);
747 
748             }
749             if(uprv_memcmp(str, result[count], size) !=0){
750                 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
751                 printUChars(result[count], size);
752                 log_err("\nGot:      ");
753                 printUChars(str, size);
754                 log_err("\n");
755             }
756         }else{
757             UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
758             if(offset != movedOffset[count]){
759                 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
760                     count, movedOffset[count], offset);
761 
762             }
763             if(uprv_memcmp(str, result[count], size) !=0){
764                 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
765                 printUChars(result[count], size);
766                 log_err("\nGot:     ");
767                 printUChars(str, size);
768                 log_err("\n");
769             }
770             /*call the API instead of MACRO
771             uprv_memcpy(str, s, size);
772             offset=test[i];
773             c=test[i+1];
774             if((uint32_t)(c)<=0x7f) {
775                   (str)[(offset)++]=(uint8_t)(c);
776             } else {
777                  (offset)=utf8_appendCharSafeBody(str, (int32_t)(offset), (int32_t)(size), c);
778             }
779             if(offset != movedOffset[count]){
780                 log_err("ERROR: utf8_appendCharSafeBody() failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
781                     count, movedOffset[count], offset);
782 
783             }
784             if(uprv_memcmp(str, result[count], size) !=0){
785                 log_err("ERROR: utf8_appendCharSafeBody() failed for count=%d. \nExpected:", count);
786                 printUChars(result[count], size);
787                 printf("\nGot:     ");
788                 printUChars(str, size);
789                 printf("\n");
790             }
791             */
792         }
793         count++;
794     }
795 
796 
797 }
798 
TestAppend()799 static void TestAppend() {
800     static const UChar32 codePoints[]={
801         0x61, 0xdf, 0x901, 0x3040,
802         0xac00, 0xd800, 0xdbff, 0xdcde,
803         0xdffd, 0xe000, 0xffff, 0x10000,
804         0x12345, 0xe0021, 0x10ffff, 0x110000,
805         0x234567, 0x7fffffff, -1, -1000,
806         0, 0x400
807     };
808     static const uint8_t expectUnsafe[]={
809         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
810         0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
811         0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
812         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
813         /* none from this line */
814         0,  0xd0, 0x80
815     }, expectSafe[]={
816         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
817         0xea, 0xb0, 0x80,  /* no surrogates */
818         /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
819         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
820         /* none from this line */
821         0,  0xd0, 0x80
822     };
823 
824     uint8_t buffer[100];
825     UChar32 c;
826     int32_t i, length;
827     UBool isError, expectIsError, wrongIsError;
828 
829     length=0;
830     for(i=0; i<LENGTHOF(codePoints); ++i) {
831         c=codePoints[i];
832         if(c<0 || 0x10ffff<c) {
833             continue; /* skip non-code points for U8_APPEND_UNSAFE */
834         }
835 
836         U8_APPEND_UNSAFE(buffer, length, c);
837     }
838     if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
839         log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
840     }
841 
842     length=0;
843     wrongIsError=FALSE;
844     for(i=0; i<LENGTHOF(codePoints); ++i) {
845         c=codePoints[i];
846         expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
847         isError=FALSE;
848 
849         U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
850         wrongIsError|= isError!=expectIsError;
851     }
852     if(wrongIsError) {
853         log_err("U8_APPEND did not set isError correctly\n");
854     }
855     if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
856         log_err("U8_APPEND did not generate the expected output\n");
857     }
858 }
859 
860 static void
TestSurrogates()861 TestSurrogates() {
862     static const uint8_t b[]={
863         0xc3, 0x9f,             /*  00DF */
864         0xed, 0x9f, 0xbf,       /*  D7FF */
865         0xed, 0xa0, 0x81,       /*  D801 */
866         0xed, 0xbf, 0xbe,       /*  DFFE */
867         0xee, 0x80, 0x80,       /*  E000 */
868         0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
869     };
870     static const UChar32 cp[]={
871         0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
872     };
873 
874     UChar32 cu, cs, cl;
875     int32_t i, j, k, iu, is, il, length;
876 
877     k=0; /* index into cp[] */
878     length=LENGTHOF(b);
879     for(i=0; i<length;) {
880         j=i;
881         U8_NEXT_UNSAFE(b, j, cu);
882         iu=j;
883 
884         j=i;
885         U8_NEXT(b, j, length, cs);
886         is=j;
887 
888         j=i;
889         L8_NEXT(b, j, length, cl);
890         il=j;
891 
892         if(cu!=cp[k]) {
893             log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
894         }
895 
896         /* U8_NEXT() returns <0 for surrogate code points */
897         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
898             log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
899         }
900 
901         /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
902         if(cl!=cu) {
903             log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
904         }
905 
906         if(is!=iu || il!=iu) {
907             log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
908         }
909 
910         ++k;    /* next code point */
911         i=iu;   /* advance by one UTF-8 sequence */
912     }
913 
914     while(i>0) {
915         --k; /* previous code point */
916 
917         j=i;
918         U8_PREV_UNSAFE(b, j, cu);
919         iu=j;
920 
921         j=i;
922         U8_PREV(b, 0, j, cs);
923         is=j;
924 
925         j=i;
926         L8_PREV(b, 0, j, cl);
927         il=j;
928 
929         if(cu!=cp[k]) {
930             log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
931         }
932 
933         /* U8_PREV() returns <0 for surrogate code points */
934         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
935             log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
936         }
937 
938         /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
939         if(cl!=cu) {
940             log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
941         }
942 
943         if(is!=iu || il !=iu) {
944             log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
945         }
946 
947         i=iu;   /* go back by one UTF-8 sequence */
948     }
949 }
950 
/*
 * Writes the first len bytes at uchars through log_err(), each formatted as
 * "0x" followed by at least two lowercase hex digits and a space.
 * Nothing is written when len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    const uint8_t *cursor=uchars;
    int16_t remaining;

    for(remaining=len; remaining>0; --remaining){
        log_err("0x%02x ", *cursor++);
    }
}
957