// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1998-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/*
* File utf8tst.c
*
* Modification History:
*
*   Date          Name        Description
*   07/24/2000    Madhu       Creation
*******************************************************************************
*/
17 
18 #include <stdbool.h>
19 
20 #include "unicode/utypes.h"
21 #include "unicode/utf8.h"
22 #include "unicode/utf_old.h"
23 #include "cmemory.h"
24 #include "cintltst.h"
25 
/* lenient UTF-8 ------------------------------------------------------------ */

/*
 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
 * code points with their "natural" encoding.
 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
 * single surrogates.
 *
 * This is not conformant with UTF-8.
 *
 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
 * the macros below do not attempt to assemble such pairs.
 */
39 
/*
 * L8_NEXT(s, i, length, c): read one code point forward.
 *
 * Loads the byte at s[i] into c and post-increments i. A byte below 0x80 is
 * returned as is. A byte for which U8_IS_LEAD() is true is handed to
 * utf8_nextCharSafeBody(), which receives &i and the length; -2 is passed as
 * its last argument (presumably selecting the lenient behavior this file is
 * testing -- confirm against utf8_nextCharSafeBody()). Any other byte
 * >= 0x80 yields U_SENTINEL.
 *
 * i and c must be modifiable lvalues; c is evaluated several times.
 */
#define L8_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
50 
/*
 * L8_PREV(s, start, i, c): read one code point backward.
 *
 * Pre-decrements i and loads the byte at s[i] into c. A byte below 0x80 is
 * returned as is. A byte in 0x80..0xbf is handed to utf8_prevCharSafeBody()
 * together with start and &i; -2 is passed as its last argument (presumably
 * selecting the lenient behavior this file is testing -- confirm against
 * utf8_prevCharSafeBody()). Any byte above 0xbf yields U_SENTINEL.
 *
 * i and c must be modifiable lvalues; c is evaluated several times.
 */
#define L8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
61 
62 /* -------------------------------------------------------------------------- */
63 
// Obsolete macros from obsolete unicode/utf_old.h, for some old test data.
// Each one is defined here only if an included header has not already
// provided it, so the expected-value tables below can use the names either way.
#ifndef UTF8_ERROR_VALUE_1
#   define UTF8_ERROR_VALUE_1 0x15
#endif
#ifndef UTF8_ERROR_VALUE_2
#   define UTF8_ERROR_VALUE_2 0x9f
#endif
#ifndef UTF_ERROR_VALUE
#   define UTF_ERROR_VALUE 0xffff
#endif
// True if the low 16 bits of c are 0xfffe or 0xffff, or if c equals one of
// the two UTF-8 error values above. Note that c is evaluated up to three times.
#ifndef UTF_IS_ERROR
#   define UTF_IS_ERROR(c) \
        (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
#endif
78 
#if !U_HIDE_OBSOLETE_UTF_OLD_H
/*
 * Writes the first len bytes of uchars through log_err(), each formatted as
 * "0x%02x " (two hex digits followed by a space). No newline is appended.
 * Only compiled when the obsolete utf_old.h API is not hidden.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    for(int16_t i=0; i<len; i++){
        log_err("0x%02x ", uchars[i]);
    }
}
#endif
87 
88 static void TestCodeUnitValues(void);
89 static void TestCharLength(void);
90 static void TestGetChar(void);
91 static void TestNextPrevChar(void);
92 static void TestNulTerminated(void);
93 static void TestNextPrevNonCharacters(void);
94 static void TestNextPrevCharUnsafe(void);
95 static void TestFwdBack(void);
96 static void TestFwdBackUnsafe(void);
97 static void TestSetChar(void);
98 static void TestSetCharUnsafe(void);
99 static void TestTruncateIfIncomplete(void);
100 static void TestAppendChar(void);
101 static void TestAppend(void);
102 static void TestSurrogates(void);
103 
104 void addUTF8Test(TestNode** root);
105 
106 void
addUTF8Test(TestNode ** root)107 addUTF8Test(TestNode** root)
108 {
109     addTest(root, &TestCodeUnitValues,          "utf8tst/TestCodeUnitValues");
110     addTest(root, &TestCharLength,              "utf8tst/TestCharLength");
111     addTest(root, &TestGetChar,                 "utf8tst/TestGetChar");
112     addTest(root, &TestNextPrevChar,            "utf8tst/TestNextPrevChar");
113     addTest(root, &TestNulTerminated,           "utf8tst/TestNulTerminated");
114     addTest(root, &TestNextPrevNonCharacters,   "utf8tst/TestNextPrevNonCharacters");
115     addTest(root, &TestNextPrevCharUnsafe,      "utf8tst/TestNextPrevCharUnsafe");
116     addTest(root, &TestFwdBack,                 "utf8tst/TestFwdBack");
117     addTest(root, &TestFwdBackUnsafe,           "utf8tst/TestFwdBackUnsafe");
118     addTest(root, &TestSetChar,                 "utf8tst/TestSetChar");
119     addTest(root, &TestSetCharUnsafe,           "utf8tst/TestSetCharUnsafe");
120     addTest(root, &TestTruncateIfIncomplete,    "utf8tst/TestTruncateIfIncomplete");
121     addTest(root, &TestAppendChar,              "utf8tst/TestAppendChar");
122     addTest(root, &TestAppend,                  "utf8tst/TestAppend");
123     addTest(root, &TestSurrogates,              "utf8tst/TestSurrogates");
124 }
125 
/*
 * Checks the byte-classification macros U8_IS_SINGLE/U8_IS_LEAD/U8_IS_TRAIL
 * (and their obsolete UTF8_IS_* counterparts when utf_old.h is not hidden).
 *
 * codeunit[] is laid out in three groups of four bytes: entries 0..3 must
 * classify as single bytes only, 4..7 as lead bytes only, and 8..11 as trail
 * bytes only. Any other classification is reported through log_err().
 */
static void TestCodeUnitValues(void)
{
    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};

    int16_t i;
    for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
        uint8_t c=codeunit[i];
        log_verbose("Testing code unit value of %x\n", c);
        if(i<4){
            /* single-byte group */
            if(
#if !U_HIDE_OBSOLETE_UTF_OLD_H
                    !UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) ||
#endif
                    !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)) {
                log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
                    c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
            }
        } else if(i< 8){
            /* lead-byte group */
            if(
#if !U_HIDE_OBSOLETE_UTF_OLD_H
                    !UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) ||
#endif
                    !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)) {
                log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
                    c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
            }
        } else if(i< 12){
            /* trail-byte group */
            if(
#if !U_HIDE_OBSOLETE_UTF_OLD_H
                    !UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) ||
#endif
                    !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
                log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
                    c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
            }
        }
    }
}
164 
TestCharLength()165 static void TestCharLength()
166 {
167     static const uint32_t codepoint[]={
168         1, 0x0061,
169         1, 0x007f,
170         2, 0x016f,
171         2, 0x07ff,
172         3, 0x0865,
173         3, 0x20ac,
174         4, 0x20402,
175         4, 0x23456,
176         4, 0x24506,
177         4, 0x20402,
178         4, 0x10402,
179         3, 0xd7ff,
180         3, 0xe000,
181 
182     };
183 
184     int16_t i;
185 #if !U_HIDE_OBSOLETE_UTF_OLD_H
186     UBool multiple;
187 #endif
188     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
189         UChar32 c=codepoint[i+1];
190         if(
191 #if !U_HIDE_OBSOLETE_UTF_OLD_H
192                 UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] ||
193 #endif
194                 U8_LENGTH(c) != (uint16_t)codepoint[i]) {
195             log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c));
196         }else{
197               log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c));
198         }
199 #if !U_HIDE_OBSOLETE_UTF_OLD_H
200         multiple=(UBool)(codepoint[i] == 1 ? false : true);
201         if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
202               log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
203         }
204 #endif
205     }
206 }
207 
TestGetChar()208 static void TestGetChar()
209 {
210     static const uint8_t input[]={
211     /*  code unit,*/
212         0x61,
213         0x7f,
214         0xe4,
215         0xba,
216         0x8c,
217         0xF0,
218         0x90,
219         0x90,
220         0x81,
221         0xc0,
222         0x65,
223         0x31,
224         0x9a,
225         0xc9
226     };
227     static const UChar32 result[]={
228     /*  codepoint-unsafe, codepoint-safe(not strict)  codepoint-safe(strict) */
229         0x61,             0x61,                       0x61,
230         0x7f,             0x7f,                       0x7f,
231         0x4e8c,           0x4e8c,                     0x4e8c,
232         0x4e8c,           0x4e8c,                     0x4e8c ,
233         0x4e8c,           0x4e8c,                     0x4e8c,
234         0x10401,          0x10401,                    0x10401 ,
235         0x10401,          0x10401,                    0x10401 ,
236         0x10401,          0x10401,                    0x10401 ,
237         0x10401,          0x10401,                    0x10401,
238         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
239         0x65,             0x65,                       0x65,
240         0x31,             0x31,                       0x31,
241         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
242         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
243     };
244     uint16_t i=0;
245     UChar32 c, expected;
246     uint32_t offset=0;
247 
248     for(offset=0; offset<sizeof(input); offset++) {
249         expected = result[i];
250         if (expected >= 0 && offset < sizeof(input) - 1) {
251 #if !U_HIDE_OBSOLETE_UTF_OLD_H
252             UTF8_GET_CHAR_UNSAFE(input, offset, c);
253             if(c != expected) {
254                 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
255                         offset, expected, c);
256 
257             }
258 #endif
259             U8_GET_UNSAFE(input, offset, c);
260             if(c != expected) {
261                 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
262                         offset, expected, c);
263 
264             }
265         }
266         expected=result[i+1];
267 #if !U_HIDE_OBSOLETE_UTF_OLD_H
268         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, false);
269         if(c != expected){
270             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
271         }
272 #endif
273         U8_GET(input, 0, offset, sizeof(input), c);
274         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
275         if(c != expected){
276             log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
277         }
278 
279         U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
280         if(expected<0) { expected=0xfffd; }
281         if(c != expected){
282             log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
283         }
284 #if !U_HIDE_OBSOLETE_UTF_OLD_H
285         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, true);
286         if(c != result[i+2]){
287             log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
288         }
289 #endif
290         i=(uint16_t)(i+3);
291     }
292 }
293 
TestNextPrevChar()294 static void TestNextPrevChar() {
295     static const uint8_t input[]={
296         0x61,
297         0xf0, 0x90, 0x90, 0x81,
298         0xc0, 0x80,  // non-shortest form
299         0xf3, 0xbe,  // truncated
300         0xc2,  // truncated
301         0x61,
302         0x81, 0x90, 0x90, 0xf0,  // "backwards" sequence
303         0x00
304     };
305     static const UChar32 result[]={
306     /*  next_safe_ns        next_safe_s          prev_safe_ns        prev_safe_s */
307         0x0061,             0x0061,              0x0000,             0x0000,
308         0x10401,            0x10401,             UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
309         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
310         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
311         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
312         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x61,               0x61,
313         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
314         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
315         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
316         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
317         0x61,               0x61,                UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
318         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,            0x10401,
319         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
320         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
321         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
322         0x0000,             0x0000,              0x0061,             0x0061
323     };
324     static const int32_t movedOffset[]={
325     /*  next_safe    prev_safe_s */
326         1,           15,
327         5,           14,
328         3,           13,
329         4,           12,
330         5,           11,
331         6,           10,
332         7,           9,
333         9,           7,
334         9,           7,
335         10,          6,
336         11,          5,
337         12,          1,
338         13,          1,
339         14,          1,
340         15,          1,
341         16,          0,
342     };
343 
344     UChar32 c, expected;
345     uint32_t i=0, j=0;
346     uint32_t offset=0;
347     int32_t setOffset=0;
348     for(offset=0; offset<sizeof(input); offset++){
349         expected=result[i];  // next_safe_ns
350 #if !U_HIDE_OBSOLETE_UTF_OLD_H
351         setOffset=offset;
352         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, false);
353         if(setOffset != movedOffset[j]) {
354             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
355                 offset, movedOffset[j], setOffset);
356         }
357         if(c != expected) {
358             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
359         }
360 #endif
361         setOffset=offset;
362         U8_NEXT(input, setOffset, sizeof(input), c);
363         if(setOffset != movedOffset[j]) {
364             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
365                 offset, movedOffset[j], setOffset);
366         }
367         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
368         if(c != expected) {
369             log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
370         }
371 
372         setOffset=offset;
373         U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
374         if(setOffset != movedOffset[j]) {
375             log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
376                 offset, movedOffset[j], setOffset);
377         }
378         if(expected<0) { expected=0xfffd; }
379         if(c != expected) {
380             log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
381         }
382 #if !U_HIDE_OBSOLETE_UTF_OLD_H
383         setOffset=offset;
384         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, true);
385         if(setOffset != movedOffset[j]) {
386             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
387                 offset, movedOffset[j], setOffset);
388         }
389         expected=result[i+1];  // next_safe_s
390         if(c != expected) {
391             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
392                     offset, expected, c);
393         }
394 #endif
395         i=i+4;
396         j=j+2;
397     }
398 
399     i=j=0;
400     for(offset=sizeof(input); offset > 0; --offset){
401         expected=result[i+2];  // prev_safe_ns
402 #if !U_HIDE_OBSOLETE_UTF_OLD_H
403         setOffset=offset;
404         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, false);
405         if(setOffset != movedOffset[j+1]) {
406             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
407                 offset, movedOffset[j+1], setOffset);
408         }
409         if(c != expected) {
410             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
411         }
412 #endif
413         setOffset=offset;
414         U8_PREV(input, 0, setOffset, c);
415         if(setOffset != movedOffset[j+1]) {
416             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
417                 offset, movedOffset[j+1], setOffset);
418         }
419         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
420         if(c != expected) {
421             log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
422         }
423 
424         setOffset=offset;
425         U8_PREV_OR_FFFD(input, 0, setOffset, c);
426         if(setOffset != movedOffset[j+1]) {
427             log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
428                 offset, movedOffset[j+1], setOffset);
429         }
430         if(expected<0) { expected=0xfffd; }
431         if(c != expected) {
432             log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
433         }
434 #if !U_HIDE_OBSOLETE_UTF_OLD_H
435         setOffset=offset;
436         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, true);
437         if(setOffset != movedOffset[j+1]) {
438             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
439                 offset, movedOffset[j+1], setOffset);
440         }
441         expected=result[i+3];  // prev_safe_s
442         if(c != expected) {
443             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
444                     offset, expected, c);
445         }
446 #endif
447         i=i+4;
448         j=j+2;
449     }
450 }
451 
452 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()453 static void TestNulTerminated() {
454     static const uint8_t input[]={
455         /*  0 */  0x61,
456         /*  1 */  0xf0, 0x90, 0x90, 0x81,
457         /*  5 */  0xc0,
458         /*  6 */  0x80,
459         /*  7 */  0xdf, 0x80,
460         /*  9 */  0xc2,
461         /* 10 */  0x62,
462         /* 11 */  0xfd,
463         /* 12 */  0xbe,
464         /* 13 */  0xe0, 0xa0, 0x80,
465         /* 16 */  0xe2, 0x82, 0xac,
466         /* 19 */  0xf0, 0x90, 0x90,
467         /* 22 */  0x00
468         /* 23 */
469     };
470     static const UChar32 result[]={
471         0x61,
472         0x10401,
473         U_SENTINEL,  // C0 not a lead byte
474         U_SENTINEL,  // 80
475         0x7c0,
476         U_SENTINEL,  // C2
477         0x62,
478         U_SENTINEL,  // FD not a lead byte
479         U_SENTINEL,  // BE
480         0x800,
481         0x20ac,
482         U_SENTINEL,  // truncated F0 90 90
483         0
484     };
485 
486     UChar32 c, c2, expected;
487     int32_t i0, i=0, j, k, expectedIndex;
488     int32_t cpIndex=0;
489     do {
490         i0=i;
491         U8_NEXT(input, i, -1, c);
492         expected=result[cpIndex];
493         if(c!=expected) {
494             log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
495         }
496         j=i0;
497         U8_NEXT_OR_FFFD(input, j, -1, c);
498         if(expected<0) { expected=0xfffd; }
499         if(c!=expected) {
500             log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
501         }
502         if(j!=i) {
503             log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
504         }
505         j=i0;
506         U8_FWD_1(input, j, -1);
507         if(j!=i) {
508             log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
509         }
510         ++cpIndex;
511         /*
512          * Move by this many code points from the start.
513          * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
514          */
515         expectedIndex= (c==0) ? i-1 : i;
516         k=0;
517         U8_FWD_N(input, k, -1, cpIndex);
518         if(k!=expectedIndex) {
519             log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
520         }
521     } while(c!=0);
522 
523     i=0;
524     do {
525         j=i0=i;
526         U8_NEXT(input, i, -1, c);
527         do {
528             U8_GET(input, 0, j, -1, c2);
529             if(c2!=c) {
530                 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
531             }
532             U8_GET_OR_FFFD(input, 0, j, -1, c2);
533             expected= (c>=0) ? c : 0xfffd;
534             if(c2!=expected) {
535                 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
536             }
537             /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
538             k=j+1;
539             U8_SET_CP_LIMIT(input, 0, k, -1);
540             if(k!=i) {
541                 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
542             }
543         } while(++j<i);
544     } while(c!=0);
545 }
546 
TestNextPrevNonCharacters()547 static void TestNextPrevNonCharacters() {
548     /* test non-characters */
549     static const uint8_t nonChars[]={
550         0xef, 0xb7, 0x90,       /* U+fdd0 */
551         0xef, 0xbf, 0xbf,       /* U+feff */
552         0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
553         0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
554         0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
555     };
556 
557     UChar32 ch;
558     int32_t idx;
559 
560     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
561         U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
562         if(!U_IS_UNICODE_NONCHAR(ch)) {
563             log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
564         }
565     }
566     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
567         U8_PREV(nonChars, 0, idx, ch);
568         if(!U_IS_UNICODE_NONCHAR(ch)) {
569             log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
570         }
571     }
572 #if !U_HIDE_OBSOLETE_UTF_OLD_H
573     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
574         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
575         UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, true);
576         if(ch!=expected) {
577             log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
578         }
579     }
580     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
581         UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, true);
582         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
583         if(ch!=expected) {
584             log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
585         }
586     }
587 #endif
588 }
589 
TestNextPrevCharUnsafe()590 static void TestNextPrevCharUnsafe() {
591     /*
592      * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
593      * The behavior of _UNSAFE macros for ill-formed strings is undefined.
594      */
595     static const uint8_t input[]={
596         0x61,
597         0xf0, 0x90, 0x90, 0x81,
598         0xc0, 0x80,  /* non-shortest form */
599         0xe2, 0x82, 0xac,
600         0xc2, 0xa1,
601         0xf4, 0x8f, 0xbf, 0xbf,
602         0x00
603     };
604     static const UChar32 codePoints[]={
605         0x61,
606         0x10401,
607         -1,
608         0x20ac,
609         0xa1,
610         0x10ffff,
611         0
612     };
613 
614     UChar32 c, expected;
615     int32_t i;
616     uint32_t offset;
617 #if !U_HIDE_OBSOLETE_UTF_OLD_H
618     for(i=0, offset=0; offset<sizeof(input); ++i) {
619         UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
620         expected = codePoints[i];
621         if(expected >= 0 && c != expected) {
622             log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
623                     offset, expected, c);
624         }
625         if(offset==6) {
626             // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
627             // while the new one skips C0 80 together.
628             ++offset;
629         }
630     }
631 #endif
632     for(i=0, offset=0; offset<sizeof(input); ++i) {
633         U8_NEXT_UNSAFE(input, offset, c);
634         expected = codePoints[i];
635         if(expected >= 0 && c != expected) {
636             log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
637                     offset, expected, c);
638         }
639     }
640 #if !U_HIDE_OBSOLETE_UTF_OLD_H
641     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
642         UTF8_PREV_CHAR_UNSAFE(input, offset, c);
643         expected = codePoints[i];
644         if(expected >= 0 && c != expected) {
645             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
646                     offset, expected, c);
647         }
648     }
649 #endif
650     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
651         U8_PREV_UNSAFE(input, offset, c);
652         expected = codePoints[i];
653         if(expected >= 0 && c != expected) {
654             log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
655                     offset, expected, c);
656         }
657     }
658 }
659 
/*
 * Checks the safe skip macros (U8_FWD_1, U8_BACK_1, U8_FWD_N, U8_BACK_N and,
 * when utf_old.h is not hidden, their UTF8_*_SAFE counterparts) on input[],
 * which mixes well-formed and ill-formed byte sequences.
 *
 * fwd_safe[] / back_safe[] list the offset expected after each single step,
 * starting from 0 and from sizeof(input) respectively. Nvalue[] lists step
 * counts that are applied cumulatively (the offset is not reset between
 * entries); fwd_N_safe[] / back_N_safe[] give the offset expected after each.
 */
static void TestFwdBack(void) {
    static const uint8_t input[]={
        0x61,
        0xF0, 0x90, 0x90, 0x81,
        0xff,
        0x62,
        0xc0,
        0x80,
        0x7f,
        0x8f,
        0xc0,
        0x63,
        0x81,
        0x90,
        0x90,
        0xF0,
        0x00
    };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }
#endif
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }
#endif
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }
#endif
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* The cast makes the last argument match %ld. */
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }
#endif
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* The cast makes the last argument match %ld. */
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], (long)offsafe);
        }
    }
}
761 
762 /**
763  * Ticket #13636 - The optimizer in Visual Studio 2017 has problems optimizing this function.
764  * As a work-around, optimization is disabled for this function on VS2017.
765  * This work-around should be removed once the following versions of Visual Studio are no
766  * longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
767  */
768 #if defined(_MSC_VER) && (_MSC_VER > 1900) && (_MSC_VER < 1924)
769 #pragma optimize( "", off )
770 #endif
771 
/*
 * Walks a sample byte string with the unchecked single-step and N-step
 * forward/backward macros (the obsolete UTF8_* spellings when they are
 * available, and the current U8_* spellings) and compares every resulting
 * offset with tables of expected boundaries. Mismatches are reported via log_err().
 */
static void TestFwdBackUnsafe() {
    /*
     * The sample is (mostly) well-formed and is probed only at code point
     * boundaries, because the _UNSAFE macros have undefined behavior on
     * ill-formed strings.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    // Going forward, the C0 byte is stepped over on its own (boundary at 6).
    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
    // Going backward, C0 80 is stepped over as one unit (no boundary at 6).
    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    const int32_t inputLength=UPRV_LENGTHOF(input);
    const int32_t fwdCount=UPRV_LENGTHOF(boundaries);
    const int32_t backCount=UPRV_LENGTHOF(backBoundaries);
    int32_t pos;
    int32_t step;
    int32_t expectedIdx;

    /* Single steps forward from the start; boundaries[0] is the starting offset itself. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    pos=0;
    step=1;
    while(pos<inputLength) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
#endif
    pos=0;
    step=1;
    while(pos<inputLength) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* Single steps backward from the end; the last table entry is the starting offset itself. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    pos=inputLength;
    step=backCount-2;
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]) {
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }
#endif
    pos=inputLength;
    step=backCount-2;
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]) {
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }

    /* N steps forward from offset 0 must land on boundaries[N]. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0; step<fwdCount; ++step) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }
#endif
    for(step=0; step<fwdCount; ++step) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }

    /* N steps backward from the end must land on the N-th entry counted from the table's end. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0, expectedIdx=backCount-1; step<backCount; ++step, --expectedIdx) {
        pos=inputLength;
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[expectedIdx]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedIdx], pos);
        }
    }
#endif
    for(step=0, expectedIdx=backCount-1; step<backCount; ++step, --expectedIdx) {
        pos=inputLength;
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[expectedIdx]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedIdx], pos);
        }
    }
}
856 
857 // Ticket #13636 - Turn optimization back on.
858 #if defined(_MSC_VER) && (_MSC_VER > 1900) && (_MSC_VER < 1924)
859 #pragma optimize( "", on )
860 #endif
861 
/*
 * Checks the bounds-checked "set to code point start/limit" macros
 * (U8_SET_CP_START / U8_SET_CP_LIMIT and, when available, their obsolete
 * UTF8_SET_CHAR_*_SAFE spellings) for every offset into a sample string that
 * mixes well-formed and ill-formed byte sequences.
 *
 * start_safe[k] / limit_safe[k] hold the offset expected after adjusting from
 * offset k. Both tables have one more entry than input[] so that
 * offset == length can be checked as well.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        /* The start-adjusting macros are only exercised for offsets inside the array. */
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[offset]){
                /* Arguments are cast to long so that they match the %ld conversions. */
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[offset]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[offset], (long)setOffset);
            }
        }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[offset]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[offset], (long)setOffset);
        }
#endif
        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[offset]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[offset], (long)setOffset);
        }
    }
}
903 
/*
 * Checks the unchecked "set to code point start/limit" macros
 * (U8_SET_CP_START_UNSAFE / U8_SET_CP_LIMIT_UNSAFE and, when available, their
 * obsolete UTF8_SET_CHAR_*_UNSAFE spellings) for every offset into a sample
 * byte string.
 *
 * start_unsafe[k] / limit_unsafe[k] hold the offset expected after adjusting
 * from offset k. limit_unsafe has one more entry than input[] so that
 * offset == length can be checked as well.
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        /* The start-adjusting macros are only exercised for offsets inside the array. */
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                /* Arguments are cast to long so that they match the %ld conversions. */
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
        }

        /*
         * Offset 0 is skipped for the limit macros so that they cannot run
         * outside the array.
         * NOTE(review): presumably the unsafe limit macros first step backward
         * from the offset, which at 0 would read before the start of the
         * array - confirm against utf8.h.
         */
        if (offset != 0) {
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
        }
    }
}
948 
/*
 * Table-driven check of U8_TRUNCATE_IF_INCOMPLETE(): each case supplies a
 * NUL-terminated byte string and the length expected after the macro has
 * been applied to the string's full length.
 */
static void TestTruncateIfIncomplete() {
    // How this differs from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() never inspects s[length].
    // Consequently a final lead byte is truncated even when the byte at the
    // input index could not continue a valid sequence (for example because
    // it is not a trail byte at all).
    // When the final byte is a trail byte, both macros behave identically.
    static const struct {
        const char *s;
        int32_t expected;
    } cases[] = {
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    int32_t idx = 0;
    while (idx < UPRV_LENGTHOF(cases)) {
        const char *bytes = cases[idx].s;
        const int32_t fullLength = (int32_t)strlen(bytes);
        /* The macro updates its length argument in place, so work on a copy. */
        int32_t truncated = fullLength;
        U8_TRUNCATE_IF_INCOMPLETE(bytes, 0, truncated);
        if (truncated != cases[idx].expected) {
            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                    (int)idx, (int)fullLength, (int)cases[idx].expected, (int)truncated);
        }
        ++idx;
    }
}
1006 
/*
 * Exercises the obsolete UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE
 * macros: every case writes one code point into a fresh copy of a fixed ASCII
 * buffer at a given position, then checks the advanced offset and the
 * resulting bytes. The whole body is compiled only when utf_old.h is available.
 */
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* The first 13 cases in every table belong to the unsafe macro, the rest to the safe one. */
    enum { kUnsafeCaseCount=13 };

    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
    /*  append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    /*  append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    static const uint16_t movedOffset[]={
    /* offset-moved-to(unsafe) */
          4,              /*for append-pos: 0 , CHAR 0x10401*/
          3,
          3,
          6,
          5,
          12,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    /* offset-moved-to(safe) */
          4,              /*for append-pos: 0, CHAR  0x10401*/
          3,
          4,
          6,
          5,
          11,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };

    /* One byte larger than s[]: the unsafe case that ends at offset 12 writes past index 10. */
    uint8_t work[12];
    uint32_t pos;
    uint16_t caseIdx;
    const uint16_t size=UPRV_LENGTHOF(s);
    /* test[] stores (position, code point) pairs, so it holds half as many cases as elements. */
    const uint16_t caseCount=(uint16_t)(UPRV_LENGTHOF(test)/2);

    for(caseIdx=0; caseIdx<caseCount; ++caseIdx){
        const uint32_t cp=test[2*caseIdx+1];

        /* Every case starts from a pristine copy of the ASCII template. */
        uprv_memcpy(work, s, size);
        pos=test[2*caseIdx];

        if(caseIdx<kUnsafeCaseCount){
            UTF8_APPEND_CHAR_UNSAFE(work, pos, cp);
            if(pos != movedOffset[caseIdx]){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    caseIdx, movedOffset[caseIdx], pos);
            }
            if(uprv_memcmp(work, result[caseIdx], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", caseIdx);
                printUChars(result[caseIdx], size);
                log_err("\nGot:      ");
                printUChars(work, size);
                log_err("\n");
            }
        }else{
            UTF8_APPEND_CHAR_SAFE(work, pos, size, cp);
            if(pos != movedOffset[caseIdx]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    caseIdx, movedOffset[caseIdx], pos);
            }
            if(uprv_memcmp(work, result[caseIdx], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", caseIdx);
                printUChars(result[caseIdx], size);
                log_err("\nGot:     ");
                printUChars(work, size);
                log_err("\n");
            }
        }
    }
#endif
}
1175 
TestAppend()1176 static void TestAppend() {
1177     static const UChar32 codePoints[]={
1178         0x61, 0xdf, 0x901, 0x3040,
1179         0xac00, 0xd800, 0xdbff, 0xdcde,
1180         0xdffd, 0xe000, 0xffff, 0x10000,
1181         0x12345, 0xe0021, 0x10ffff, 0x110000,
1182         0x234567, 0x7fffffff, -1, -1000,
1183         0, 0x400
1184     };
1185     static const uint8_t expectUnsafe[]={
1186         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
1187         0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
1188         0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
1189         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
1190         /* none from this line */
1191         0,  0xd0, 0x80
1192     }, expectSafe[]={
1193         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
1194         0xea, 0xb0, 0x80,  /* no surrogates */
1195         /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
1196         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
1197         /* none from this line */
1198         0,  0xd0, 0x80
1199     };
1200 
1201     uint8_t buffer[100];
1202     UChar32 c;
1203     int32_t i, length;
1204     UBool isError, expectIsError, wrongIsError;
1205 
1206     length=0;
1207     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1208         c=codePoints[i];
1209         if(c<0 || 0x10ffff<c) {
1210             continue; /* skip non-code points for U8_APPEND_UNSAFE */
1211         }
1212 
1213         U8_APPEND_UNSAFE(buffer, length, c);
1214     }
1215     if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1216         log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1217     }
1218 
1219     length=0;
1220     wrongIsError=false;
1221     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1222         c=codePoints[i];
1223         expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1224         isError=false;
1225 
1226         U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
1227         wrongIsError|= isError!=expectIsError;
1228     }
1229     if(wrongIsError) {
1230         log_err("U8_APPEND did not set isError correctly\n");
1231     }
1232     if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1233         log_err("U8_APPEND did not generate the expected output\n");
1234     }
1235 }
1236 
1237 static void
TestSurrogates()1238 TestSurrogates() {
1239     static const uint8_t b[]={
1240         0xc3, 0x9f,             /*  00DF */
1241         0xed, 0x9f, 0xbf,       /*  D7FF */
1242         0xed, 0xa0, 0x81,       /*  D801 */
1243         0xed, 0xbf, 0xbe,       /*  DFFE */
1244         0xee, 0x80, 0x80,       /*  E000 */
1245         0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
1246     };
1247     static const UChar32 cp[]={
1248         0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1249     };
1250 
1251     UChar32 cu, cs, cl;
1252     int32_t i, j, k, iu, is, il, length;
1253 
1254     k=0; /* index into cp[] */
1255     length=UPRV_LENGTHOF(b);
1256     for(i=0; i<length;) {
1257         j=i;
1258         U8_NEXT_UNSAFE(b, j, cu);
1259         iu=j;
1260 
1261         j=i;
1262         U8_NEXT(b, j, length, cs);
1263         is=j;
1264 
1265         j=i;
1266         L8_NEXT(b, j, length, cl);
1267         il=j;
1268 
1269         if(cu!=cp[k]) {
1270             log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1271         }
1272 
1273         /* U8_NEXT() returns <0 for surrogate code points */
1274         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1275             log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1276         }
1277 
1278         /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1279         if(cl!=cu) {
1280             log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1281         }
1282 
1283         // U8_NEXT() skips only the first byte of a surrogate byte sequence.
1284         if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
1285             log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1286         }
1287         if(il!=iu) {
1288             log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1289         }
1290 
1291         ++k;    /* next code point */
1292         i=iu;   /* advance by one UTF-8 sequence */
1293     }
1294 
1295     while(i>0) {
1296         --k; /* previous code point */
1297 
1298         j=i;
1299         U8_PREV_UNSAFE(b, j, cu);
1300         iu=j;
1301 
1302         j=i;
1303         U8_PREV(b, 0, j, cs);
1304         is=j;
1305 
1306         j=i;
1307         L8_PREV(b, 0, j, cl);
1308         il=j;
1309 
1310         if(cu!=cp[k]) {
1311             log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1312         }
1313 
1314         /* U8_PREV() returns <0 for surrogate code points */
1315         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1316             log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1317         }
1318 
1319         /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1320         if(cl!=cu) {
1321             log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1322         }
1323 
1324         // U8_PREV() skips only the last byte of a surrogate byte sequence.
1325         if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
1326             log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1327         }
1328         if(il !=iu) {
1329             log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1330         }
1331 
1332         i=iu;   /* go back by one UTF-8 sequence */
1333     }
1334 }
1335