1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1998-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*
9 * File utf8tst.c
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 07/24/2000 Madhu Creation
15 *******************************************************************************
16 */
17
18 #include "unicode/utypes.h"
19 #include "unicode/utf8.h"
20 #include "unicode/utf_old.h"
21 #include "cmemory.h"
22 #include "cintltst.h"
23
24 /* lenient UTF-8 ------------------------------------------------------------ */
25
26 /*
27 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
28 * code points with their "natural" encoding.
29 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
30 * single surrogates.
31 *
32 * This is not conformant with UTF-8.
33 *
34 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
35 * the macros below do not attempt to assemble such pairs.
36 */
37
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 *
 * Reads the byte s[i] into c and post-increments i.
 *  - A byte below 0x80 is delivered unchanged.
 *  - For a byte accepted by U8_IS_LEAD() the remainder of the sequence is
 *    decoded by utf8_nextCharSafeBody(), which receives -2 as its last
 *    argument (presumably selecting the lenient behavior described in the
 *    comment above -- confirm against the helper's documentation).
 *  - Any other byte yields U_SENTINEL.
 *
 * i and c must be modifiable lvalues; s, i and c are evaluated more than once.
 * Every use of a macro argument is parenthesized so that expression arguments
 * bind as intended.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
48
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 *
 * Pre-decrements i and reads the byte s[i] into c.
 *  - A byte below 0x80 is delivered unchanged.
 *  - A byte in 0x80..0xbf is handed to utf8_prevCharSafeBody() together with
 *    start, which receives -2 as its last argument (presumably selecting the
 *    lenient behavior described in the comment above -- confirm against the
 *    helper's documentation).
 *  - A byte above 0xbf yields U_SENTINEL.
 *
 * i and c must be modifiable lvalues; s, i and c are evaluated more than once.
 * Every use of a macro argument is parenthesized so that expression arguments
 * bind as intended.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
59
60 /* -------------------------------------------------------------------------- */
61
/*
 * Error values and the error predicate of the obsolete unicode/utf_old.h API.
 * Some of the old test data below is still expressed in these terms, so each
 * name is supplied here whenever the included headers did not define it.
 */
#ifndef UTF8_ERROR_VALUE_1
#define UTF8_ERROR_VALUE_1 0x15
#endif

#ifndef UTF8_ERROR_VALUE_2
#define UTF8_ERROR_VALUE_2 0x9f
#endif

#ifndef UTF_ERROR_VALUE
#define UTF_ERROR_VALUE 0xffff
#endif

/*
 * True when c is one of the two UTF-8 error values or when its low 16 bits,
 * ignoring bit 0, are 0xfffe (that is, they end in fffe or ffff).
 * c is evaluated more than once.
 */
#ifndef UTF_IS_ERROR
#define UTF_IS_ERROR(c) \
    ((c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2 || ((c)&0xfffe)==0xfffe)
#endif
76
#if !U_HIDE_OBSOLETE_UTF_OLD_H
/*
 * Writes the first len bytes of uchars through log_err(), each formatted as
 * "0xNN " (two hex digits followed by a space). Nothing is written when len
 * is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    int16_t remaining;
    for(remaining=len; remaining>0; --remaining){
        log_err("0x%02x ", *uchars++);
    }
}
#endif
85
86 static void TestCodeUnitValues(void);
87 static void TestCharLength(void);
88 static void TestGetChar(void);
89 static void TestNextPrevChar(void);
90 static void TestNulTerminated(void);
91 static void TestNextPrevNonCharacters(void);
92 static void TestNextPrevCharUnsafe(void);
93 static void TestFwdBack(void);
94 static void TestFwdBackUnsafe(void);
95 static void TestSetChar(void);
96 static void TestSetCharUnsafe(void);
97 static void TestTruncateIfIncomplete(void);
98 static void TestAppendChar(void);
99 static void TestAppend(void);
100 static void TestSurrogates(void);
101
102 void addUTF8Test(TestNode** root);
103
104 void
addUTF8Test(TestNode ** root)105 addUTF8Test(TestNode** root)
106 {
107 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues");
108 addTest(root, &TestCharLength, "utf8tst/TestCharLength");
109 addTest(root, &TestGetChar, "utf8tst/TestGetChar");
110 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar");
111 addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated");
112 addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters");
113 addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe");
114 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack");
115 addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
116 addTest(root, &TestSetChar, "utf8tst/TestSetChar");
117 addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
118 addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete");
119 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
120 addTest(root, &TestAppend, "utf8tst/TestAppend");
121 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
122 }
123
/*
 * Checks the byte classification macros (single / lead / trail) against a
 * table whose first four entries are single bytes, next four are lead bytes
 * and last four are trail bytes. When the obsolete utf_old.h macros are
 * available, they must agree as well.
 */
static void TestCodeUnitValues()
{
    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};

    int16_t idx;
    for(idx=0; idx<UPRV_LENGTHOF(codeunit); idx++){
        const uint8_t unit=codeunit[idx];
        /* classification as reported by the current macros, for the error text */
        const char single= U8_IS_SINGLE(unit) ? 'y' : 'n';
        const char lead= U8_IS_LEAD(unit) ? 'y' : 'n';
        const char trail= U8_IS_TRAIL(unit) ? 'y' : 'n';
        int wrong;

        log_verbose("Testing code unit value of %x\n", unit);
        if(idx<4){
            /* must be single only */
            wrong= !U8_IS_SINGLE(unit) || U8_IS_LEAD(unit) || U8_IS_TRAIL(unit);
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            wrong= wrong || !UTF8_IS_SINGLE(unit) || UTF8_IS_LEAD(unit) || UTF8_IS_TRAIL(unit);
#endif
            if(wrong){
                log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
                        unit, single, lead, trail);
            }
        } else if(idx<8){
            /* must be lead only */
            wrong= !U8_IS_LEAD(unit) || U8_IS_SINGLE(unit) || U8_IS_TRAIL(unit);
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            wrong= wrong || !UTF8_IS_LEAD(unit) || UTF8_IS_SINGLE(unit) || UTF8_IS_TRAIL(unit);
#endif
            if(wrong){
                log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
                        unit, single, lead, trail);
            }
        } else if(idx<12){
            /* must be trail only */
            wrong= !U8_IS_TRAIL(unit) || U8_IS_SINGLE(unit) || U8_IS_LEAD(unit);
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            wrong= wrong || !UTF8_IS_TRAIL(unit) || UTF8_IS_SINGLE(unit) || UTF8_IS_LEAD(unit);
#endif
            if(wrong){
                log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
                        unit, single, lead, trail);
            }
        }
    }
}
162
TestCharLength()163 static void TestCharLength()
164 {
165 static const uint32_t codepoint[]={
166 1, 0x0061,
167 1, 0x007f,
168 2, 0x016f,
169 2, 0x07ff,
170 3, 0x0865,
171 3, 0x20ac,
172 4, 0x20402,
173 4, 0x23456,
174 4, 0x24506,
175 4, 0x20402,
176 4, 0x10402,
177 3, 0xd7ff,
178 3, 0xe000,
179
180 };
181
182 int16_t i;
183 #if !U_HIDE_OBSOLETE_UTF_OLD_H
184 UBool multiple;
185 #endif
186 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
187 UChar32 c=codepoint[i+1];
188 if(
189 #if !U_HIDE_OBSOLETE_UTF_OLD_H
190 UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] ||
191 #endif
192 U8_LENGTH(c) != (uint16_t)codepoint[i]) {
193 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c));
194 }else{
195 log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c));
196 }
197 #if !U_HIDE_OBSOLETE_UTF_OLD_H
198 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
199 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
200 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
201 }
202 #endif
203 }
204 }
205
TestGetChar()206 static void TestGetChar()
207 {
208 static const uint8_t input[]={
209 /* code unit,*/
210 0x61,
211 0x7f,
212 0xe4,
213 0xba,
214 0x8c,
215 0xF0,
216 0x90,
217 0x90,
218 0x81,
219 0xc0,
220 0x65,
221 0x31,
222 0x9a,
223 0xc9
224 };
225 static const UChar32 result[]={
226 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */
227 0x61, 0x61, 0x61,
228 0x7f, 0x7f, 0x7f,
229 0x4e8c, 0x4e8c, 0x4e8c,
230 0x4e8c, 0x4e8c, 0x4e8c ,
231 0x4e8c, 0x4e8c, 0x4e8c,
232 0x10401, 0x10401, 0x10401 ,
233 0x10401, 0x10401, 0x10401 ,
234 0x10401, 0x10401, 0x10401 ,
235 0x10401, 0x10401, 0x10401,
236 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
237 0x65, 0x65, 0x65,
238 0x31, 0x31, 0x31,
239 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
240 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
241 };
242 uint16_t i=0;
243 UChar32 c, expected;
244 uint32_t offset=0;
245
246 for(offset=0; offset<sizeof(input); offset++) {
247 expected = result[i];
248 if (expected >= 0 && offset < sizeof(input) - 1) {
249 #if !U_HIDE_OBSOLETE_UTF_OLD_H
250 UTF8_GET_CHAR_UNSAFE(input, offset, c);
251 if(c != expected) {
252 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
253 offset, expected, c);
254
255 }
256 #endif
257 U8_GET_UNSAFE(input, offset, c);
258 if(c != expected) {
259 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
260 offset, expected, c);
261
262 }
263 }
264 expected=result[i+1];
265 #if !U_HIDE_OBSOLETE_UTF_OLD_H
266 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
267 if(c != expected){
268 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
269 }
270 #endif
271 U8_GET(input, 0, offset, sizeof(input), c);
272 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
273 if(c != expected){
274 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
275 }
276
277 U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
278 if(expected<0) { expected=0xfffd; }
279 if(c != expected){
280 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
281 }
282 #if !U_HIDE_OBSOLETE_UTF_OLD_H
283 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
284 if(c != result[i+2]){
285 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
286 }
287 #endif
288 i=(uint16_t)(i+3);
289 }
290 }
291
TestNextPrevChar()292 static void TestNextPrevChar() {
293 static const uint8_t input[]={
294 0x61,
295 0xf0, 0x90, 0x90, 0x81,
296 0xc0, 0x80, // non-shortest form
297 0xf3, 0xbe, // truncated
298 0xc2, // truncated
299 0x61,
300 0x81, 0x90, 0x90, 0xf0, // "backwards" sequence
301 0x00
302 };
303 static const UChar32 result[]={
304 /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */
305 0x0061, 0x0061, 0x0000, 0x0000,
306 0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
307 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
308 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
309 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
310 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61,
311 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
312 UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
313 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
314 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
315 0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
316 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401,
317 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
318 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
319 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
320 0x0000, 0x0000, 0x0061, 0x0061
321 };
322 static const int32_t movedOffset[]={
323 /* next_safe prev_safe_s */
324 1, 15,
325 5, 14,
326 3, 13,
327 4, 12,
328 5, 11,
329 6, 10,
330 7, 9,
331 9, 7,
332 9, 7,
333 10, 6,
334 11, 5,
335 12, 1,
336 13, 1,
337 14, 1,
338 15, 1,
339 16, 0,
340 };
341
342 UChar32 c, expected;
343 uint32_t i=0, j=0;
344 uint32_t offset=0;
345 int32_t setOffset=0;
346 for(offset=0; offset<sizeof(input); offset++){
347 expected=result[i]; // next_safe_ns
348 #if !U_HIDE_OBSOLETE_UTF_OLD_H
349 setOffset=offset;
350 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
351 if(setOffset != movedOffset[j]) {
352 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
353 offset, movedOffset[j], setOffset);
354 }
355 if(c != expected) {
356 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
357 }
358 #endif
359 setOffset=offset;
360 U8_NEXT(input, setOffset, sizeof(input), c);
361 if(setOffset != movedOffset[j]) {
362 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
363 offset, movedOffset[j], setOffset);
364 }
365 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
366 if(c != expected) {
367 log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
368 }
369
370 setOffset=offset;
371 U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
372 if(setOffset != movedOffset[j]) {
373 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
374 offset, movedOffset[j], setOffset);
375 }
376 if(expected<0) { expected=0xfffd; }
377 if(c != expected) {
378 log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
379 }
380 #if !U_HIDE_OBSOLETE_UTF_OLD_H
381 setOffset=offset;
382 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
383 if(setOffset != movedOffset[j]) {
384 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
385 offset, movedOffset[j], setOffset);
386 }
387 expected=result[i+1]; // next_safe_s
388 if(c != expected) {
389 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
390 offset, expected, c);
391 }
392 #endif
393 i=i+4;
394 j=j+2;
395 }
396
397 i=j=0;
398 for(offset=sizeof(input); offset > 0; --offset){
399 expected=result[i+2]; // prev_safe_ns
400 #if !U_HIDE_OBSOLETE_UTF_OLD_H
401 setOffset=offset;
402 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
403 if(setOffset != movedOffset[j+1]) {
404 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
405 offset, movedOffset[j+1], setOffset);
406 }
407 if(c != expected) {
408 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
409 }
410 #endif
411 setOffset=offset;
412 U8_PREV(input, 0, setOffset, c);
413 if(setOffset != movedOffset[j+1]) {
414 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
415 offset, movedOffset[j+1], setOffset);
416 }
417 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
418 if(c != expected) {
419 log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
420 }
421
422 setOffset=offset;
423 U8_PREV_OR_FFFD(input, 0, setOffset, c);
424 if(setOffset != movedOffset[j+1]) {
425 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
426 offset, movedOffset[j+1], setOffset);
427 }
428 if(expected<0) { expected=0xfffd; }
429 if(c != expected) {
430 log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
431 }
432 #if !U_HIDE_OBSOLETE_UTF_OLD_H
433 setOffset=offset;
434 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
435 if(setOffset != movedOffset[j+1]) {
436 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
437 offset, movedOffset[j+1], setOffset);
438 }
439 expected=result[i+3]; // prev_safe_s
440 if(c != expected) {
441 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
442 offset, expected, c);
443 }
444 #endif
445 i=i+4;
446 j=j+2;
447 }
448 }
449
450 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()451 static void TestNulTerminated() {
452 static const uint8_t input[]={
453 /* 0 */ 0x61,
454 /* 1 */ 0xf0, 0x90, 0x90, 0x81,
455 /* 5 */ 0xc0,
456 /* 6 */ 0x80,
457 /* 7 */ 0xdf, 0x80,
458 /* 9 */ 0xc2,
459 /* 10 */ 0x62,
460 /* 11 */ 0xfd,
461 /* 12 */ 0xbe,
462 /* 13 */ 0xe0, 0xa0, 0x80,
463 /* 16 */ 0xe2, 0x82, 0xac,
464 /* 19 */ 0xf0, 0x90, 0x90,
465 /* 22 */ 0x00
466 /* 23 */
467 };
468 static const UChar32 result[]={
469 0x61,
470 0x10401,
471 U_SENTINEL, // C0 not a lead byte
472 U_SENTINEL, // 80
473 0x7c0,
474 U_SENTINEL, // C2
475 0x62,
476 U_SENTINEL, // FD not a lead byte
477 U_SENTINEL, // BE
478 0x800,
479 0x20ac,
480 U_SENTINEL, // truncated F0 90 90
481 0
482 };
483
484 UChar32 c, c2, expected;
485 int32_t i0, i=0, j, k, expectedIndex;
486 int32_t cpIndex=0;
487 do {
488 i0=i;
489 U8_NEXT(input, i, -1, c);
490 expected=result[cpIndex];
491 if(c!=expected) {
492 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
493 }
494 j=i0;
495 U8_NEXT_OR_FFFD(input, j, -1, c);
496 if(expected<0) { expected=0xfffd; }
497 if(c!=expected) {
498 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
499 }
500 if(j!=i) {
501 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
502 }
503 j=i0;
504 U8_FWD_1(input, j, -1);
505 if(j!=i) {
506 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
507 }
508 ++cpIndex;
509 /*
510 * Move by this many code points from the start.
511 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
512 */
513 expectedIndex= (c==0) ? i-1 : i;
514 k=0;
515 U8_FWD_N(input, k, -1, cpIndex);
516 if(k!=expectedIndex) {
517 log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
518 }
519 } while(c!=0);
520
521 i=0;
522 do {
523 j=i0=i;
524 U8_NEXT(input, i, -1, c);
525 do {
526 U8_GET(input, 0, j, -1, c2);
527 if(c2!=c) {
528 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
529 }
530 U8_GET_OR_FFFD(input, 0, j, -1, c2);
531 expected= (c>=0) ? c : 0xfffd;
532 if(c2!=expected) {
533 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
534 }
535 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
536 k=j+1;
537 U8_SET_CP_LIMIT(input, 0, k, -1);
538 if(k!=i) {
539 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
540 }
541 } while(++j<i);
542 } while(c!=0);
543 }
544
TestNextPrevNonCharacters()545 static void TestNextPrevNonCharacters() {
546 /* test non-characters */
547 static const uint8_t nonChars[]={
548 0xef, 0xb7, 0x90, /* U+fdd0 */
549 0xef, 0xbf, 0xbf, /* U+feff */
550 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
551 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
552 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
553 };
554
555 UChar32 ch;
556 int32_t idx;
557
558 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
559 U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
560 if(!U_IS_UNICODE_NONCHAR(ch)) {
561 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
562 }
563 }
564 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
565 U8_PREV(nonChars, 0, idx, ch);
566 if(!U_IS_UNICODE_NONCHAR(ch)) {
567 log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
568 }
569 }
570 #if !U_HIDE_OBSOLETE_UTF_OLD_H
571 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
572 UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
573 UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
574 if(ch!=expected) {
575 log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
576 }
577 }
578 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
579 UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
580 UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
581 if(ch!=expected) {
582 log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
583 }
584 }
585 #endif
586 }
587
TestNextPrevCharUnsafe()588 static void TestNextPrevCharUnsafe() {
589 /*
590 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
591 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
592 */
593 static const uint8_t input[]={
594 0x61,
595 0xf0, 0x90, 0x90, 0x81,
596 0xc0, 0x80, /* non-shortest form */
597 0xe2, 0x82, 0xac,
598 0xc2, 0xa1,
599 0xf4, 0x8f, 0xbf, 0xbf,
600 0x00
601 };
602 static const UChar32 codePoints[]={
603 0x61,
604 0x10401,
605 -1,
606 0x20ac,
607 0xa1,
608 0x10ffff,
609 0
610 };
611
612 UChar32 c, expected;
613 int32_t i;
614 uint32_t offset;
615 #if !U_HIDE_OBSOLETE_UTF_OLD_H
616 for(i=0, offset=0; offset<sizeof(input); ++i) {
617 UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
618 expected = codePoints[i];
619 if(expected >= 0 && c != expected) {
620 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
621 offset, expected, c);
622 }
623 if(offset==6) {
624 // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
625 // while the new one skips C0 80 together.
626 ++offset;
627 }
628 }
629 #endif
630 for(i=0, offset=0; offset<sizeof(input); ++i) {
631 U8_NEXT_UNSAFE(input, offset, c);
632 expected = codePoints[i];
633 if(expected >= 0 && c != expected) {
634 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
635 offset, expected, c);
636 }
637 }
638 #if !U_HIDE_OBSOLETE_UTF_OLD_H
639 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
640 UTF8_PREV_CHAR_UNSAFE(input, offset, c);
641 expected = codePoints[i];
642 if(expected >= 0 && c != expected) {
643 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
644 offset, expected, c);
645 }
646 }
647 #endif
648 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
649 U8_PREV_UNSAFE(input, offset, c);
650 expected = codePoints[i];
651 if(expected >= 0 && c != expected) {
652 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
653 offset, expected, c);
654 }
655 }
656 }
657
/*
 * Moves an offset through a partly ill-formed string with the safe
 * forward/backward macros (U8_FWD_1, U8_BACK_1, U8_FWD_N, U8_BACK_N and, when
 * available, their obsolete UTF8_*_SAFE counterparts) and compares the offset
 * after every step with the expected-offset tables.
 *
 * The N-step loops do not reset the offset between entries of Nvalue[], so
 * fwd_N_safe[] and back_N_safe[] hold cumulative positions.
 *
 * The arguments of the log calls are cast to the exact types that the
 * conversion specifiers in the messages expect (%d: int, %ld: long).
 */
static void TestFwdBack() {
    static const uint8_t input[]={
        0x61,
        0xF0, 0x90, 0x90, 0x81,
        0xff,
        0x62,
        0xc0,
        0x80,
        0x7f,
        0x8f,
        0xc0,
        0x63,
        0x81,
        0x90,
        0x90,
        0xF0,
        0x00
    };
    /* offsets after each single step forward from 0 / backward from the end */
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};

    /* step counts and the offsets reached after applying them one after another */
    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", (int)fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", (int)fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", (int)back_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", (int)back_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", (int)i, (int)fwd_N_safe[i], (int)offsafe);
        }

    }
#endif
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", (int)i, (int)fwd_N_safe[i], (int)offsafe);
        }

    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", (int)i, (int)back_N_safe[i], (long)offsafe);
        }
    }
#endif
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", (int)i, (int)back_N_safe[i], (long)offsafe);
        }
    }
}
759
760 /**
761 * Ticket #13636 - Visual Studio 2017 has problems optimizing this function.
762 * As a workaround, we will turn off optimization just for this function on VS2017 and above.
763 */
764 #if defined(_MSC_VER) && (_MSC_VER > 1900)
765 #pragma optimize( "", off )
766 #endif
767
/*
 * Moves an offset through a (mostly) well-formed string with the unsafe
 * forward/backward macros and compares it with the expected code point
 * boundaries. Single steps are chained from one end of the string to the
 * other; N-step moves always restart from the beginning (forward) or from
 * the end (backward).
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    // forward unsafe skips only C0
    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
    // backward unsafe skips C0 80 together
    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    const int32_t inputLength=UPRV_LENGTHOF(input);
    const int32_t fwdCount=UPRV_LENGTHOF(boundaries);
    const int32_t backCount=UPRV_LENGTHOF(backBoundaries);
    int32_t pos;    /* byte offset moved by the macros */
    int32_t step;   /* index into a boundary table, or a number of code points */

    /* single steps forward: after the first step we expect boundaries[1], and so on */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    pos=0;
    step=1;
    while(pos<inputLength) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
#endif
    pos=0;
    step=1;
    while(pos<inputLength) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* single steps backward: the first step from the end lands on the second-to-last boundary */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    pos=inputLength;
    step=backCount-2;
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }
#endif
    pos=inputLength;
    step=backCount-2;
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }

    /* step code points forward from the start must land on boundaries[step] */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0; step<fwdCount; ++step) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }
#endif
    for(step=0; step<fwdCount; ++step) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }

    /* step code points backward from the end must land step entries before the last boundary */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0; step<backCount; ++step) {
        const int32_t expectedAt=backCount-1-step;
        pos=inputLength;
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[expectedAt]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedAt], pos);
        }
    }
#endif
    for(step=0; step<backCount; ++step) {
        const int32_t expectedAt=backCount-1-step;
        pos=inputLength;
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[expectedAt]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[expectedAt], pos);
        }
    }
}
852
853 /**
854 * Ticket #13636 - Turn optimization back on.
855 */
856 #if defined(_MSC_VER) && (_MSC_VER > 1900)
857 #pragma optimize( "", on )
858 #endif
859
/*
 * For every offset 0..length of a partly ill-formed string, adjusts the
 * offset with U8_SET_CP_START and U8_SET_CP_LIMIT (and, when available, the
 * obsolete UTF8_SET_CHAR_START_SAFE / UTF8_SET_CHAR_LIMIT_SAFE) and compares
 * the result with start_safe[] and limit_safe[].
 * The START macros are not invoked at offset == length.
 *
 * The arguments of the log calls are cast to long, the type that the %ld
 * conversion specifiers in the messages expect.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    uint32_t i=0;                       /* table index; advances together with offset */
    int32_t offset=0, setOffset=0;      /* setOffset: working copy that the macros may move */
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
        }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }
#endif
        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        i++;
    }
}
901
/*
 * For every offset 0..length of the input, adjusts the offset with
 * U8_SET_CP_START_UNSAFE and U8_SET_CP_LIMIT_UNSAFE (and, when available,
 * the obsolete UTF8_SET_CHAR_START_UNSAFE / UTF8_SET_CHAR_LIMIT_UNSAFE) and
 * compares the result with start_unsafe[] and limit_unsafe[].
 * The START macros are not invoked at offset == length and the LIMIT macros
 * are not invoked at offset 0.
 *
 * The arguments of the log calls are cast to long, the type that the %ld
 * conversion specifiers in the messages expect.
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    uint32_t i=0;                       /* table index; advances together with offset */
    int32_t offset=0, setOffset=0;      /* setOffset: working copy that the macros may move */
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[i], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[i], (long)setOffset);
            }
        }

        /*
         * Offset 0 is skipped so that the unsafe macros cannot run outside the array.
         * NOTE(review): presumably the LIMIT macros look at the byte before the
         * offset, which at offset 0 would lie before the start of input --
         * confirm against the macro definitions.
         */
        if (offset != 0) {
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[i], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[i], (long)setOffset);
            }
        }

        i++;
    }
}
946
/*
 * Runs U8_TRUNCATE_IF_INCOMPLETE over a table of short byte strings, starting
 * from the full strlen() of each string, and compares the adjusted length
 * with the expected one.
 */
static void TestTruncateIfIncomplete() {
    // Difference from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
    // Therefore, if the last byte is a lead byte, then this macro truncates
    // even if the byte at the input index cannot continue a valid sequence
    // (including when that is not a trail byte).
    // On the other hand, if the last byte is a trail byte, then the two macros behave the same.
    static const struct {
        const char *s;      /* NUL-terminated test bytes */
        int32_t expected;   /* length expected after the macro has run */
    } cases[] = {
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    int32_t caseIndex;
    for (caseIndex = 0; caseIndex < UPRV_LENGTHOF(cases); ++caseIndex) {
        const char *text = cases[caseIndex].s;
        const int32_t wanted = cases[caseIndex].expected;
        const int32_t fullLength = (int32_t)strlen(text);
        int32_t truncated = fullLength;    /* the macro adjusts this in place */

        U8_TRUNCATE_IF_INCOMPLETE(text, 0, truncated);
        if (truncated == wanted) {
            continue;
        }
        log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                (int)caseIndex, (int)fullLength, (int)wanted, (int)truncated);
    }
}
1004
/*
 * Exercises the obsolete UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE
 * macros (compiled only when U_HIDE_OBSOLETE_UTF_OLD_H is not set).
 *
 * For every (position, code point) pair in test[], a fresh copy of s[] is made,
 * the code point is appended at the given position, and both the resulting
 * offset and the first 11 bytes of the buffer are compared with the
 * corresponding entries of movedOffset[] and result[].
 * The first 13 pairs go through the UNSAFE macro, the remaining ones through
 * the SAFE macro.
 */
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* Number of leading test cases that are run with the UNSAFE macro. */
    enum { UNSAFE_CASE_COUNT = 13 };

    /* Template buffer: "abcdefghij" plus a terminating NUL. */
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
        /* append-position(unsafe), CHAR to be appended */
        0, 0x10401,
        2, 0x0028,
        2, 0x007f,
        3, 0xd801,
        1, 0x20402,
        8, 0x10401,
        5, 0xc0,
        5, 0xc1,
        5, 0xfd,
        6, 0x80,
        6, 0x81,
        6, 0xbf,
        7, 0xfe,

        /* append-position(safe), CHAR to be appended */
        0, 0x10401,
        2, 0x0028,
        3, 0x7f,
        3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
        1, 0x20402,
        9, 0x10401,
        5, 0xc0,
        5, 0xc1,
        5, 0xfd,
        6, 0x80,
        6, 0x81,
        6, 0xbf,
        7, 0xfe,

    };
    static const uint16_t movedOffset[]={
        /* offset-moved-to(unsafe) */
        4, /*for append-pos: 0 , CHAR 0x10401*/
        3,
        3,
        6,
        5,
        12,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

        /* offset-moved-to(safe) */
        4, /*for append-pos: 0, CHAR 0x10401*/
        3,
        4,
        6,
        5,
        11,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        /* Per the original note this row gets UTF8_ERROR_VALUE_2, which takes
         * 2 bytes; in this table those bytes are 0xc2, 0x9f. */
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /* One byte larger than s[]: the unsafe append at position 8 moves the
     * offset to 12, i.e. it writes up to index 11. */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);
    uint16_t count;

    /* count indexes movedOffset[] and result[]; test[] holds two values per case. */
    for(count=0; 2*count<UPRV_LENGTHOF(test); count++){
        const uint32_t codePoint=test[2*count+1];

        /* Start every case from a pristine copy of the template. */
        uprv_memcpy(str, s, size);
        offset=test[2*count];

        if(count>=UNSAFE_CASE_COUNT){
            UTF8_APPEND_CHAR_SAFE(str, offset, size, codePoint);
            if(movedOffset[count] != offset){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                        count, movedOffset[count], offset);
            }
            if(0 != uprv_memcmp(str, result[count], size)){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot: ");
                printUChars(str, size);
                log_err("\n");
            }
            /* An older, disabled variant of this check called
             * utf8_appendCharSafeBody() directly instead of going through the macro. */
        }else{
            UTF8_APPEND_CHAR_UNSAFE(str, offset, codePoint);
            if(movedOffset[count] != offset){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                        count, movedOffset[count], offset);
            }
            if(0 != uprv_memcmp(str, result[count], size)){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
                printUChars(result[count], size);
                log_err("\nGot: ");
                printUChars(str, size);
                log_err("\n");
            }
        }
    }
#endif
}
1173
TestAppend()1174 static void TestAppend() {
1175 static const UChar32 codePoints[]={
1176 0x61, 0xdf, 0x901, 0x3040,
1177 0xac00, 0xd800, 0xdbff, 0xdcde,
1178 0xdffd, 0xe000, 0xffff, 0x10000,
1179 0x12345, 0xe0021, 0x10ffff, 0x110000,
1180 0x234567, 0x7fffffff, -1, -1000,
1181 0, 0x400
1182 };
1183 static const uint8_t expectUnsafe[]={
1184 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1185 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
1186 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1187 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1188 /* none from this line */
1189 0, 0xd0, 0x80
1190 }, expectSafe[]={
1191 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1192 0xea, 0xb0, 0x80, /* no surrogates */
1193 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1194 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1195 /* none from this line */
1196 0, 0xd0, 0x80
1197 };
1198
1199 uint8_t buffer[100];
1200 UChar32 c;
1201 int32_t i, length;
1202 UBool isError, expectIsError, wrongIsError;
1203
1204 length=0;
1205 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1206 c=codePoints[i];
1207 if(c<0 || 0x10ffff<c) {
1208 continue; /* skip non-code points for U8_APPEND_UNSAFE */
1209 }
1210
1211 U8_APPEND_UNSAFE(buffer, length, c);
1212 }
1213 if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1214 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1215 }
1216
1217 length=0;
1218 wrongIsError=FALSE;
1219 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1220 c=codePoints[i];
1221 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1222 isError=FALSE;
1223
1224 U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
1225 wrongIsError|= isError!=expectIsError;
1226 }
1227 if(wrongIsError) {
1228 log_err("U8_APPEND did not set isError correctly\n");
1229 }
1230 if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1231 log_err("U8_APPEND did not generate the expected output\n");
1232 }
1233 }
1234
1235 static void
TestSurrogates()1236 TestSurrogates() {
1237 static const uint8_t b[]={
1238 0xc3, 0x9f, /* 00DF */
1239 0xed, 0x9f, 0xbf, /* D7FF */
1240 0xed, 0xa0, 0x81, /* D801 */
1241 0xed, 0xbf, 0xbe, /* DFFE */
1242 0xee, 0x80, 0x80, /* E000 */
1243 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
1244 };
1245 static const UChar32 cp[]={
1246 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1247 };
1248
1249 UChar32 cu, cs, cl;
1250 int32_t i, j, k, iu, is, il, length;
1251
1252 k=0; /* index into cp[] */
1253 length=UPRV_LENGTHOF(b);
1254 for(i=0; i<length;) {
1255 j=i;
1256 U8_NEXT_UNSAFE(b, j, cu);
1257 iu=j;
1258
1259 j=i;
1260 U8_NEXT(b, j, length, cs);
1261 is=j;
1262
1263 j=i;
1264 L8_NEXT(b, j, length, cl);
1265 il=j;
1266
1267 if(cu!=cp[k]) {
1268 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1269 }
1270
1271 /* U8_NEXT() returns <0 for surrogate code points */
1272 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1273 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1274 }
1275
1276 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1277 if(cl!=cu) {
1278 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1279 }
1280
1281 // U8_NEXT() skips only the first byte of a surrogate byte sequence.
1282 if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
1283 log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1284 }
1285 if(il!=iu) {
1286 log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1287 }
1288
1289 ++k; /* next code point */
1290 i=iu; /* advance by one UTF-8 sequence */
1291 }
1292
1293 while(i>0) {
1294 --k; /* previous code point */
1295
1296 j=i;
1297 U8_PREV_UNSAFE(b, j, cu);
1298 iu=j;
1299
1300 j=i;
1301 U8_PREV(b, 0, j, cs);
1302 is=j;
1303
1304 j=i;
1305 L8_PREV(b, 0, j, cl);
1306 il=j;
1307
1308 if(cu!=cp[k]) {
1309 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1310 }
1311
1312 /* U8_PREV() returns <0 for surrogate code points */
1313 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1314 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1315 }
1316
1317 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1318 if(cl!=cu) {
1319 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1320 }
1321
1322 // U8_PREV() skips only the last byte of a surrogate byte sequence.
1323 if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
1324 log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1325 }
1326 if(il !=iu) {
1327 log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1328 }
1329
1330 i=iu; /* go back by one UTF-8 sequence */
1331 }
1332 }
1333