1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1998-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /*
9 * File utf8tst.c
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 07/24/2000 Madhu Creation
15 *******************************************************************************
16 */
17
18 #include <stdbool.h>
19
20 #include "unicode/utypes.h"
21 #include "unicode/utf8.h"
22 #include "unicode/utf_old.h"
23 #include "cmemory.h"
24 #include "cintltst.h"
25
26 /* lenient UTF-8 ------------------------------------------------------------ */
27
28 /*
29 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
30 * code points with their "natural" encoding.
31 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
32 * single surrogates.
33 *
34 * This is not conformant with UTF-8.
35 *
36 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
37 * the macros below do not attempt to assemble such pairs.
38 */
39
/*
 * L8_NEXT(s, i, length, c): lenient forward read.
 *
 * Reads the byte at s[i] into c and post-increments i.
 *  - A byte below 0x80 is returned as is.
 *  - A byte for which U8_IS_LEAD() is true is handed to
 *    utf8_nextCharSafeBody(), which receives &i and length and supplies
 *    the resulting value of c.
 *  - Any other byte (>=0x80, not a lead byte) yields U_SENTINEL.
 *
 * The final argument -2 is the strictness selector passed to
 * utf8_nextCharSafeBody(); presumably it requests the lenient handling of
 * surrogates described in the comment above - confirm against utf_impl.
 *
 * s, i and c are evaluated more than once; i and c must be modifiable lvalues.
 * All macro arguments are parenthesized where they are expanded so that
 * arbitrary expressions can be passed safely.
 */
#define L8_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
50
/*
 * L8_PREV(s, start, i, c): lenient backward read.
 *
 * Pre-decrements i and reads the byte at s[i] into c.
 *  - A byte below 0x80 is returned as is.
 *  - A byte in 0x80..0xbf is handed to utf8_prevCharSafeBody(), which
 *    receives start and &i and supplies the resulting value of c.
 *  - A byte above 0xbf yields U_SENTINEL.
 *
 * The final argument -2 is the strictness selector passed to
 * utf8_prevCharSafeBody(); presumably it requests the lenient handling of
 * surrogates described in the comment above - confirm against utf_impl.
 *
 * s, i and c are evaluated more than once; i and c must be modifiable lvalues.
 * All macro arguments are parenthesized where they are expanded so that
 * arbitrary expressions can be passed safely.
 */
#define L8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END
61
62 /* -------------------------------------------------------------------------- */
63
// Fallback definitions of obsolete unicode/utf_old.h macros that some old
// test data below still refers to. Each one is defined only when it is not
// already defined.
#if !defined(UTF8_ERROR_VALUE_1)
#   define UTF8_ERROR_VALUE_1 0x15
#endif

#if !defined(UTF8_ERROR_VALUE_2)
#   define UTF8_ERROR_VALUE_2 0x9f
#endif

#if !defined(UTF_ERROR_VALUE)
#   define UTF_ERROR_VALUE 0xffff
#endif

// True when the low 16 bits of c are 0xfffe or 0xffff, or when c equals one
// of the two UTF-8 error values above. c is evaluated more than once.
#if !defined(UTF_IS_ERROR)
#   define UTF_IS_ERROR(c) \
        ( ((c)&0xfffe)==0xfffe || \
          (c)==UTF8_ERROR_VALUE_1 || \
          (c)==UTF8_ERROR_VALUE_2 )
#endif
78
#if !U_HIDE_OBSOLETE_UTF_OLD_H
/*
 * Reports the first len bytes of uchars through log_err(), each formatted
 * as "0x%02x " (two hex digits followed by a space). Nothing is logged
 * when len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    int16_t idx;

    for(idx=0; idx<len; ++idx) {
        log_err("0x%02x ", uchars[idx]);
    }
}
#endif
87
88 static void TestCodeUnitValues(void);
89 static void TestCharLength(void);
90 static void TestGetChar(void);
91 static void TestNextPrevChar(void);
92 static void TestNulTerminated(void);
93 static void TestNextPrevNonCharacters(void);
94 static void TestNextPrevCharUnsafe(void);
95 static void TestFwdBack(void);
96 static void TestFwdBackUnsafe(void);
97 static void TestSetChar(void);
98 static void TestSetCharUnsafe(void);
99 static void TestTruncateIfIncomplete(void);
100 static void TestAppendChar(void);
101 static void TestAppend(void);
102 static void TestSurrogates(void);
103
104 void addUTF8Test(TestNode** root);
105
106 void
addUTF8Test(TestNode ** root)107 addUTF8Test(TestNode** root)
108 {
109 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues");
110 addTest(root, &TestCharLength, "utf8tst/TestCharLength");
111 addTest(root, &TestGetChar, "utf8tst/TestGetChar");
112 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar");
113 addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated");
114 addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters");
115 addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe");
116 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack");
117 addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
118 addTest(root, &TestSetChar, "utf8tst/TestSetChar");
119 addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
120 addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete");
121 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
122 addTest(root, &TestAppend, "utf8tst/TestAppend");
123 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
124 }
125
TestCodeUnitValues()126 static void TestCodeUnitValues()
127 {
128 static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
129
130 int16_t i;
131 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
132 uint8_t c=codeunit[i];
133 log_verbose("Testing code unit value of %x\n", c);
134 if(i<4){
135 if(
136 #if !U_HIDE_OBSOLETE_UTF_OLD_H
137 !UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) ||
138 #endif
139 !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)) {
140 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
141 c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
142 }
143 } else if(i< 8){
144 if(
145 #if !U_HIDE_OBSOLETE_UTF_OLD_H
146 !UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) ||
147 #endif
148 !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)) {
149 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
150 c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
151 }
152 } else if(i< 12){
153 if(
154 #if !U_HIDE_OBSOLETE_UTF_OLD_H
155 !UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) ||
156 #endif
157 !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
158 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
159 c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
160 }
161 }
162 }
163 }
164
TestCharLength()165 static void TestCharLength()
166 {
167 static const uint32_t codepoint[]={
168 1, 0x0061,
169 1, 0x007f,
170 2, 0x016f,
171 2, 0x07ff,
172 3, 0x0865,
173 3, 0x20ac,
174 4, 0x20402,
175 4, 0x23456,
176 4, 0x24506,
177 4, 0x20402,
178 4, 0x10402,
179 3, 0xd7ff,
180 3, 0xe000,
181
182 };
183
184 int16_t i;
185 #if !U_HIDE_OBSOLETE_UTF_OLD_H
186 UBool multiple;
187 #endif
188 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
189 UChar32 c=codepoint[i+1];
190 if(
191 #if !U_HIDE_OBSOLETE_UTF_OLD_H
192 UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] ||
193 #endif
194 U8_LENGTH(c) != (uint16_t)codepoint[i]) {
195 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c));
196 }else{
197 log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c));
198 }
199 #if !U_HIDE_OBSOLETE_UTF_OLD_H
200 multiple=(UBool)(codepoint[i] == 1 ? false : true);
201 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
202 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
203 }
204 #endif
205 }
206 }
207
TestGetChar()208 static void TestGetChar()
209 {
210 static const uint8_t input[]={
211 /* code unit,*/
212 0x61,
213 0x7f,
214 0xe4,
215 0xba,
216 0x8c,
217 0xF0,
218 0x90,
219 0x90,
220 0x81,
221 0xc0,
222 0x65,
223 0x31,
224 0x9a,
225 0xc9
226 };
227 static const UChar32 result[]={
228 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */
229 0x61, 0x61, 0x61,
230 0x7f, 0x7f, 0x7f,
231 0x4e8c, 0x4e8c, 0x4e8c,
232 0x4e8c, 0x4e8c, 0x4e8c ,
233 0x4e8c, 0x4e8c, 0x4e8c,
234 0x10401, 0x10401, 0x10401 ,
235 0x10401, 0x10401, 0x10401 ,
236 0x10401, 0x10401, 0x10401 ,
237 0x10401, 0x10401, 0x10401,
238 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
239 0x65, 0x65, 0x65,
240 0x31, 0x31, 0x31,
241 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
242 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
243 };
244 uint16_t i=0;
245 UChar32 c, expected;
246 uint32_t offset=0;
247
248 for(offset=0; offset<sizeof(input); offset++) {
249 expected = result[i];
250 if (expected >= 0 && offset < sizeof(input) - 1) {
251 #if !U_HIDE_OBSOLETE_UTF_OLD_H
252 UTF8_GET_CHAR_UNSAFE(input, offset, c);
253 if(c != expected) {
254 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
255 offset, expected, c);
256
257 }
258 #endif
259 U8_GET_UNSAFE(input, offset, c);
260 if(c != expected) {
261 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
262 offset, expected, c);
263
264 }
265 }
266 expected=result[i+1];
267 #if !U_HIDE_OBSOLETE_UTF_OLD_H
268 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, false);
269 if(c != expected){
270 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
271 }
272 #endif
273 U8_GET(input, 0, offset, sizeof(input), c);
274 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
275 if(c != expected){
276 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
277 }
278
279 U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
280 if(expected<0) { expected=0xfffd; }
281 if(c != expected){
282 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
283 }
284 #if !U_HIDE_OBSOLETE_UTF_OLD_H
285 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, true);
286 if(c != result[i+2]){
287 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
288 }
289 #endif
290 i=(uint16_t)(i+3);
291 }
292 }
293
TestNextPrevChar()294 static void TestNextPrevChar() {
295 static const uint8_t input[]={
296 0x61,
297 0xf0, 0x90, 0x90, 0x81,
298 0xc0, 0x80, // non-shortest form
299 0xf3, 0xbe, // truncated
300 0xc2, // truncated
301 0x61,
302 0x81, 0x90, 0x90, 0xf0, // "backwards" sequence
303 0x00
304 };
305 static const UChar32 result[]={
306 /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */
307 0x0061, 0x0061, 0x0000, 0x0000,
308 0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
309 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
310 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
311 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
312 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61,
313 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
314 UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
315 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
316 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
317 0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
318 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401,
319 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
320 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
321 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
322 0x0000, 0x0000, 0x0061, 0x0061
323 };
324 static const int32_t movedOffset[]={
325 /* next_safe prev_safe_s */
326 1, 15,
327 5, 14,
328 3, 13,
329 4, 12,
330 5, 11,
331 6, 10,
332 7, 9,
333 9, 7,
334 9, 7,
335 10, 6,
336 11, 5,
337 12, 1,
338 13, 1,
339 14, 1,
340 15, 1,
341 16, 0,
342 };
343
344 UChar32 c, expected;
345 uint32_t i=0, j=0;
346 uint32_t offset=0;
347 int32_t setOffset=0;
348 for(offset=0; offset<sizeof(input); offset++){
349 expected=result[i]; // next_safe_ns
350 #if !U_HIDE_OBSOLETE_UTF_OLD_H
351 setOffset=offset;
352 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, false);
353 if(setOffset != movedOffset[j]) {
354 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
355 offset, movedOffset[j], setOffset);
356 }
357 if(c != expected) {
358 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
359 }
360 #endif
361 setOffset=offset;
362 U8_NEXT(input, setOffset, sizeof(input), c);
363 if(setOffset != movedOffset[j]) {
364 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
365 offset, movedOffset[j], setOffset);
366 }
367 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
368 if(c != expected) {
369 log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
370 }
371
372 setOffset=offset;
373 U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
374 if(setOffset != movedOffset[j]) {
375 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
376 offset, movedOffset[j], setOffset);
377 }
378 if(expected<0) { expected=0xfffd; }
379 if(c != expected) {
380 log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
381 }
382 #if !U_HIDE_OBSOLETE_UTF_OLD_H
383 setOffset=offset;
384 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, true);
385 if(setOffset != movedOffset[j]) {
386 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
387 offset, movedOffset[j], setOffset);
388 }
389 expected=result[i+1]; // next_safe_s
390 if(c != expected) {
391 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
392 offset, expected, c);
393 }
394 #endif
395 i=i+4;
396 j=j+2;
397 }
398
399 i=j=0;
400 for(offset=sizeof(input); offset > 0; --offset){
401 expected=result[i+2]; // prev_safe_ns
402 #if !U_HIDE_OBSOLETE_UTF_OLD_H
403 setOffset=offset;
404 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, false);
405 if(setOffset != movedOffset[j+1]) {
406 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
407 offset, movedOffset[j+1], setOffset);
408 }
409 if(c != expected) {
410 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
411 }
412 #endif
413 setOffset=offset;
414 U8_PREV(input, 0, setOffset, c);
415 if(setOffset != movedOffset[j+1]) {
416 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
417 offset, movedOffset[j+1], setOffset);
418 }
419 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
420 if(c != expected) {
421 log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
422 }
423
424 setOffset=offset;
425 U8_PREV_OR_FFFD(input, 0, setOffset, c);
426 if(setOffset != movedOffset[j+1]) {
427 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
428 offset, movedOffset[j+1], setOffset);
429 }
430 if(expected<0) { expected=0xfffd; }
431 if(c != expected) {
432 log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
433 }
434 #if !U_HIDE_OBSOLETE_UTF_OLD_H
435 setOffset=offset;
436 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, true);
437 if(setOffset != movedOffset[j+1]) {
438 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
439 offset, movedOffset[j+1], setOffset);
440 }
441 expected=result[i+3]; // prev_safe_s
442 if(c != expected) {
443 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
444 offset, expected, c);
445 }
446 #endif
447 i=i+4;
448 j=j+2;
449 }
450 }
451
452 /* keep this in sync with utf16tst.c's TestNulTerminated() */
TestNulTerminated()453 static void TestNulTerminated() {
454 static const uint8_t input[]={
455 /* 0 */ 0x61,
456 /* 1 */ 0xf0, 0x90, 0x90, 0x81,
457 /* 5 */ 0xc0,
458 /* 6 */ 0x80,
459 /* 7 */ 0xdf, 0x80,
460 /* 9 */ 0xc2,
461 /* 10 */ 0x62,
462 /* 11 */ 0xfd,
463 /* 12 */ 0xbe,
464 /* 13 */ 0xe0, 0xa0, 0x80,
465 /* 16 */ 0xe2, 0x82, 0xac,
466 /* 19 */ 0xf0, 0x90, 0x90,
467 /* 22 */ 0x00
468 /* 23 */
469 };
470 static const UChar32 result[]={
471 0x61,
472 0x10401,
473 U_SENTINEL, // C0 not a lead byte
474 U_SENTINEL, // 80
475 0x7c0,
476 U_SENTINEL, // C2
477 0x62,
478 U_SENTINEL, // FD not a lead byte
479 U_SENTINEL, // BE
480 0x800,
481 0x20ac,
482 U_SENTINEL, // truncated F0 90 90
483 0
484 };
485
486 UChar32 c, c2, expected;
487 int32_t i0, i=0, j, k, expectedIndex;
488 int32_t cpIndex=0;
489 do {
490 i0=i;
491 U8_NEXT(input, i, -1, c);
492 expected=result[cpIndex];
493 if(c!=expected) {
494 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
495 }
496 j=i0;
497 U8_NEXT_OR_FFFD(input, j, -1, c);
498 if(expected<0) { expected=0xfffd; }
499 if(c!=expected) {
500 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
501 }
502 if(j!=i) {
503 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
504 }
505 j=i0;
506 U8_FWD_1(input, j, -1);
507 if(j!=i) {
508 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
509 }
510 ++cpIndex;
511 /*
512 * Move by this many code points from the start.
513 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
514 */
515 expectedIndex= (c==0) ? i-1 : i;
516 k=0;
517 U8_FWD_N(input, k, -1, cpIndex);
518 if(k!=expectedIndex) {
519 log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
520 }
521 } while(c!=0);
522
523 i=0;
524 do {
525 j=i0=i;
526 U8_NEXT(input, i, -1, c);
527 do {
528 U8_GET(input, 0, j, -1, c2);
529 if(c2!=c) {
530 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
531 }
532 U8_GET_OR_FFFD(input, 0, j, -1, c2);
533 expected= (c>=0) ? c : 0xfffd;
534 if(c2!=expected) {
535 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
536 }
537 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
538 k=j+1;
539 U8_SET_CP_LIMIT(input, 0, k, -1);
540 if(k!=i) {
541 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
542 }
543 } while(++j<i);
544 } while(c!=0);
545 }
546
TestNextPrevNonCharacters()547 static void TestNextPrevNonCharacters() {
548 /* test non-characters */
549 static const uint8_t nonChars[]={
550 0xef, 0xb7, 0x90, /* U+fdd0 */
551 0xef, 0xbf, 0xbf, /* U+feff */
552 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
553 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
554 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
555 };
556
557 UChar32 ch;
558 int32_t idx;
559
560 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
561 U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
562 if(!U_IS_UNICODE_NONCHAR(ch)) {
563 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
564 }
565 }
566 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
567 U8_PREV(nonChars, 0, idx, ch);
568 if(!U_IS_UNICODE_NONCHAR(ch)) {
569 log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
570 }
571 }
572 #if !U_HIDE_OBSOLETE_UTF_OLD_H
573 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
574 UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
575 UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, true);
576 if(ch!=expected) {
577 log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
578 }
579 }
580 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
581 UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, true);
582 UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
583 if(ch!=expected) {
584 log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
585 }
586 }
587 #endif
588 }
589
TestNextPrevCharUnsafe()590 static void TestNextPrevCharUnsafe() {
591 /*
592 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
593 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
594 */
595 static const uint8_t input[]={
596 0x61,
597 0xf0, 0x90, 0x90, 0x81,
598 0xc0, 0x80, /* non-shortest form */
599 0xe2, 0x82, 0xac,
600 0xc2, 0xa1,
601 0xf4, 0x8f, 0xbf, 0xbf,
602 0x00
603 };
604 static const UChar32 codePoints[]={
605 0x61,
606 0x10401,
607 -1,
608 0x20ac,
609 0xa1,
610 0x10ffff,
611 0
612 };
613
614 UChar32 c, expected;
615 int32_t i;
616 uint32_t offset;
617 #if !U_HIDE_OBSOLETE_UTF_OLD_H
618 for(i=0, offset=0; offset<sizeof(input); ++i) {
619 UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
620 expected = codePoints[i];
621 if(expected >= 0 && c != expected) {
622 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
623 offset, expected, c);
624 }
625 if(offset==6) {
626 // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
627 // while the new one skips C0 80 together.
628 ++offset;
629 }
630 }
631 #endif
632 for(i=0, offset=0; offset<sizeof(input); ++i) {
633 U8_NEXT_UNSAFE(input, offset, c);
634 expected = codePoints[i];
635 if(expected >= 0 && c != expected) {
636 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
637 offset, expected, c);
638 }
639 }
640 #if !U_HIDE_OBSOLETE_UTF_OLD_H
641 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
642 UTF8_PREV_CHAR_UNSAFE(input, offset, c);
643 expected = codePoints[i];
644 if(expected >= 0 && c != expected) {
645 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
646 offset, expected, c);
647 }
648 }
649 #endif
650 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
651 U8_PREV_UNSAFE(input, offset, c);
652 expected = codePoints[i];
653 if(expected >= 0 && c != expected) {
654 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
655 offset, expected, c);
656 }
657 }
658 }
659
/*
 * Checks the safe skipping macros U8_FWD_1, U8_BACK_1, U8_FWD_N and
 * U8_BACK_N (and, unless hidden, their obsolete UTF8_*_SAFE counterparts)
 * on a string that mixes valid and ill-formed sequences.
 *
 * fwd_safe[] / back_safe[] list the expected offset after each single
 * step; Nvalue[] lists step counts and fwd_N_safe[] / back_N_safe[] the
 * expected offsets after each cumulative N-step move.
 *
 * Arguments of the variadic log_err() are cast so that they match the
 * "%d" and "%ld" conversions; the offsets are uint32_t and the table
 * entries uint16_t.
 */
static void TestFwdBack() {
    static const uint8_t input[]={
        0x61,
        0xF0, 0x90, 0x90, 0x81,
        0xff,
        0x62,
        0xc0,
        0x80,
        0x7f,
        0x8f,
        0xc0,
        0x63,
        0x81,
        0x90,
        0x90,
        0xF0,
        0x00
    };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", (int)fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", (int)fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", (int)back_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", (int)back_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* N-step moves are cumulative: offsafe is not reset between table entries */
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", (int)i, (int)fwd_N_safe[i], (int)offsafe);
        }

    }
#endif
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", (int)i, (int)fwd_N_safe[i], (int)offsafe);
        }

    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", (int)i, (int)back_N_safe[i], (long)offsafe);
        }
    }
#endif
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", (int)i, (int)back_N_safe[i], (long)offsafe);
        }
    }
}
761
762 /**
763 * Ticket #13636 - The optimizer in Visual Studio 2017 has problems optimizing this function.
764 * As a work-around, optimization is disabled for this function on VS2017.
765 * This work-around should be removed once the following versions of Visual Studio are no
766 * longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
767 */
768 #if defined(_MSC_VER) && (_MSC_VER > 1900) && (_MSC_VER < 1924)
769 #pragma optimize( "", off )
770 #endif
771
/*
 * Checks the unsafe skipping macros U8_FWD_1_UNSAFE, U8_BACK_1_UNSAFE,
 * U8_FWD_N_UNSAFE and U8_BACK_N_UNSAFE (and, unless hidden, their obsolete
 * UTF8_*_UNSAFE counterparts) against tables of expected boundaries.
 * Forward and backward movement use separate tables because they treat
 * the non-shortest-form C0 80 differently.
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    // forward unsafe skips only C0
    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
    // backward unsafe skips C0 80 together
    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t pos;    /* byte index into input[] */
    int32_t step;   /* index into a boundary table, or a number of steps */

#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* single steps forward: after step k the index must equal boundaries[k] */
    step=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
#endif
    step=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* single steps backward from the end, walking backBoundaries[] downwards */
    step=UPRV_LENGTHOF(backBoundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }
#endif
    step=UPRV_LENGTHOF(backBoundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* N steps forward from index 0 must land on boundaries[N] */
    for(step=0; step<UPRV_LENGTHOF(boundaries); ++step) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }
#endif
    for(step=0; step<UPRV_LENGTHOF(boundaries); ++step) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* N steps backward from the end must land on the N-th boundary counted from the end */
    for(step=0; step<UPRV_LENGTHOF(backBoundaries); ++step) {
        int32_t fromEnd=UPRV_LENGTHOF(backBoundaries)-1-step;
        pos=UPRV_LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[fromEnd]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[fromEnd], pos);
        }
    }
#endif
    for(step=0; step<UPRV_LENGTHOF(backBoundaries); ++step) {
        int32_t fromEnd=UPRV_LENGTHOF(backBoundaries)-1-step;
        pos=UPRV_LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[fromEnd]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[fromEnd], pos);
        }
    }
}
856
857 // Ticket #13636 - Turn optimization back on.
858 #if defined(_MSC_VER) && (_MSC_VER > 1900) && (_MSC_VER < 1924)
859 #pragma optimize( "", on )
860 #endif
861
/*
 * Checks U8_SET_CP_START and U8_SET_CP_LIMIT (and, unless hidden, the
 * obsolete UTF8_SET_CHAR_START_SAFE / UTF8_SET_CHAR_LIMIT_SAFE) at every
 * index of input[], including the index one past the last byte for the
 * "limit" macros.
 *
 * start_safe[] / limit_safe[] hold the expected adjusted index for each
 * starting index.
 *
 * Arguments of the variadic log_err() are cast to long so that they match
 * the "%ld" conversions; the indexes are int32_t and the table entries
 * int16_t.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    uint32_t i=0;           /* table index; advances in step with offset */
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        /* the "start" macros are only called for indexes inside the array */
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
        }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }
#endif
        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        i++;
    }
}
903
/*
 * Checks U8_SET_CP_START_UNSAFE and U8_SET_CP_LIMIT_UNSAFE (and, unless
 * hidden, the obsolete UTF8_SET_CHAR_START_UNSAFE /
 * UTF8_SET_CHAR_LIMIT_UNSAFE) at the indexes of input[].
 *
 * start_unsafe[] / limit_unsafe[] hold the expected adjusted index for
 * each starting index. The "start" macros are skipped for the index one
 * past the array, the "limit" macros for index 0.
 *
 * Arguments of the variadic log_err() are cast to long so that they match
 * the "%ld" conversions; the indexes are int32_t and the table entries
 * int16_t.
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    uint32_t i=0;           /* table index; advances in step with offset */
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[i], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[i], (long)setOffset);
            }
        }

        if (offset != 0) { /* Can't have it go off the end of the array */
            /* NOTE(review): presumably the unsafe limit macros step back before index 0 here - confirm against utf8.h */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[i], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[i], (long)setOffset);
            }
        }

        i++;
    }
}
948
/*
 * Table-driven check of U8_TRUNCATE_IF_INCOMPLETE(): for each byte string
 * the macro is applied with start 0 and the string's strlen() as length,
 * and the adjusted length is compared with the expected value.
 */
static void TestTruncateIfIncomplete() {
    // Difference from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
    // Therefore, if the last byte is a lead byte, then this macro truncates
    // even if the byte at the input index cannot continue a valid sequence
    // (including when that is not a trail byte).
    // On the other hand, if the last byte is a trail byte, then the two macros behave the same.
    static const struct {
        const char *s;
        int32_t expected;
    } cases[] = {
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    int32_t caseIndex;

    for (caseIndex = 0; caseIndex < UPRV_LENGTHOF(cases); ++caseIndex) {
        const char *bytes = cases[caseIndex].s;
        const int32_t wanted = cases[caseIndex].expected;
        const int32_t fullLength = (int32_t)strlen(bytes);
        int32_t truncated = fullLength;

        U8_TRUNCATE_IF_INCOMPLETE(bytes, 0, truncated);
        if (truncated == wanted) {
            continue;
        }
        log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                (int)caseIndex, (int)fullLength, (int)wanted, (int)truncated);
    }
}
1006
/*
 * Tests the obsolete UTF8_APPEND_CHAR_UNSAFE() and UTF8_APPEND_CHAR_SAFE()
 * macros. The whole body is compiled out when U_HIDE_OBSOLETE_UTF_OLD_H is set.
 *
 * For every (append-position, code point) pair in test[], a fresh copy of s[]
 * is made, the code point is appended at the given position, and then
 *   - the updated offset is compared with movedOffset[], and
 *   - the first 11 bytes of the buffer are compared with result[].
 * The first half of the pairs exercises the unsafe macro, the second half the
 * safe macro.
 */
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* Initial buffer contents: "abcdefghij" plus a terminating NUL. */
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
    /*  append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    /*  append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    static const uint16_t movedOffset[]={
    /* offset-moved-to(unsafe) */
        4,              /*for append-pos: 0 , CHAR 0x10401*/
        3,
        3,
        6,
        5,
        12,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    /* offset-moved-to(safe) */
        4,              /*for append-pos: 0, CHAR 0x10401*/
        3,
        4,
        6,
        5,
        11,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /* Number of leading test cases that go through the unsafe macro. */
    const uint16_t unsafeCount=13;
    uint16_t i, count=0;
    /*
     * One byte larger than s[]: the unsafe case at position 8 is expected to
     * advance the offset to 12, i.e. it writes one byte beyond the 11 bytes
     * that are copied and compared.
     */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);

    /* test[] holds pairs, so step by two; count is the index of the pair. */
    for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
        const char *macroName;

        uprv_memcpy(str, s, size);
        offset=test[i];
        if(count<unsafeCount){
            macroName="UTF8_APPEND_CHAR_UNSAFE";
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
        }else{
            macroName="UTF8_APPEND_CHAR_SAFE";
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
        }

        /* The verification is the same for both macros. */
        if(offset != movedOffset[count]){
            /* Cast explicitly so that the arguments match the %d conversions. */
            log_err("ERROR: %s failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                macroName, (int)count, (int)movedOffset[count], (int)offset);
        }
        if(uprv_memcmp(str, result[count], size) !=0){
            log_err("ERROR: %s failed for count=%d. \nExpected:", macroName, (int)count);
            printUChars(result[count], size);
            log_err("\nGot:      ");
            printUChars(str, size);
            log_err("\n");
        }
        count++;
    }
#endif
}
1175
TestAppend()1176 static void TestAppend() {
1177 static const UChar32 codePoints[]={
1178 0x61, 0xdf, 0x901, 0x3040,
1179 0xac00, 0xd800, 0xdbff, 0xdcde,
1180 0xdffd, 0xe000, 0xffff, 0x10000,
1181 0x12345, 0xe0021, 0x10ffff, 0x110000,
1182 0x234567, 0x7fffffff, -1, -1000,
1183 0, 0x400
1184 };
1185 static const uint8_t expectUnsafe[]={
1186 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1187 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
1188 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1189 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1190 /* none from this line */
1191 0, 0xd0, 0x80
1192 }, expectSafe[]={
1193 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1194 0xea, 0xb0, 0x80, /* no surrogates */
1195 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1196 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1197 /* none from this line */
1198 0, 0xd0, 0x80
1199 };
1200
1201 uint8_t buffer[100];
1202 UChar32 c;
1203 int32_t i, length;
1204 UBool isError, expectIsError, wrongIsError;
1205
1206 length=0;
1207 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1208 c=codePoints[i];
1209 if(c<0 || 0x10ffff<c) {
1210 continue; /* skip non-code points for U8_APPEND_UNSAFE */
1211 }
1212
1213 U8_APPEND_UNSAFE(buffer, length, c);
1214 }
1215 if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1216 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1217 }
1218
1219 length=0;
1220 wrongIsError=false;
1221 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1222 c=codePoints[i];
1223 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1224 isError=false;
1225
1226 U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
1227 wrongIsError|= isError!=expectIsError;
1228 }
1229 if(wrongIsError) {
1230 log_err("U8_APPEND did not set isError correctly\n");
1231 }
1232 if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1233 log_err("U8_APPEND did not generate the expected output\n");
1234 }
1235 }
1236
1237 static void
TestSurrogates()1238 TestSurrogates() {
1239 static const uint8_t b[]={
1240 0xc3, 0x9f, /* 00DF */
1241 0xed, 0x9f, 0xbf, /* D7FF */
1242 0xed, 0xa0, 0x81, /* D801 */
1243 0xed, 0xbf, 0xbe, /* DFFE */
1244 0xee, 0x80, 0x80, /* E000 */
1245 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
1246 };
1247 static const UChar32 cp[]={
1248 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1249 };
1250
1251 UChar32 cu, cs, cl;
1252 int32_t i, j, k, iu, is, il, length;
1253
1254 k=0; /* index into cp[] */
1255 length=UPRV_LENGTHOF(b);
1256 for(i=0; i<length;) {
1257 j=i;
1258 U8_NEXT_UNSAFE(b, j, cu);
1259 iu=j;
1260
1261 j=i;
1262 U8_NEXT(b, j, length, cs);
1263 is=j;
1264
1265 j=i;
1266 L8_NEXT(b, j, length, cl);
1267 il=j;
1268
1269 if(cu!=cp[k]) {
1270 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1271 }
1272
1273 /* U8_NEXT() returns <0 for surrogate code points */
1274 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1275 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1276 }
1277
1278 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1279 if(cl!=cu) {
1280 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1281 }
1282
1283 // U8_NEXT() skips only the first byte of a surrogate byte sequence.
1284 if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
1285 log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1286 }
1287 if(il!=iu) {
1288 log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1289 }
1290
1291 ++k; /* next code point */
1292 i=iu; /* advance by one UTF-8 sequence */
1293 }
1294
1295 while(i>0) {
1296 --k; /* previous code point */
1297
1298 j=i;
1299 U8_PREV_UNSAFE(b, j, cu);
1300 iu=j;
1301
1302 j=i;
1303 U8_PREV(b, 0, j, cs);
1304 is=j;
1305
1306 j=i;
1307 L8_PREV(b, 0, j, cl);
1308 il=j;
1309
1310 if(cu!=cp[k]) {
1311 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1312 }
1313
1314 /* U8_PREV() returns <0 for surrogate code points */
1315 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1316 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1317 }
1318
1319 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1320 if(cl!=cu) {
1321 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1322 }
1323
1324 // U8_PREV() skips only the last byte of a surrogate byte sequence.
1325 if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
1326 log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1327 }
1328 if(il !=iu) {
1329 log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1330 }
1331
1332 i=iu; /* go back by one UTF-8 sequence */
1333 }
1334 }
1335