1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1998-2006, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /*
7 * File test.c
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 07/24/2000 Madhu Creation
13 *******************************************************************************
14 */
15
16 #include "unicode/utypes.h"
17 #include "unicode/utf8.h"
18 #include "cmemory.h"
19 #include "cintltst.h"
20
/* Element count of a true array; the argument must not be a pointer. */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))
22
23 /* lenient UTF-8 ------------------------------------------------------------ */
24
25 /*
26 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
27 * code points with their "natural" encoding.
28 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
29 * single surrogates.
30 *
31 * This is not conformant with UTF-8.
32 *
33 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
34 * the macros below do not attempt to assemble such pairs.
35 */
36
/*
 * Lenient forward read: fetch the byte at s[i] into c and post-increment i.
 * - A byte below 0x80 is returned unchanged.
 * - A lead byte (per U8_IS_LEAD) is completed by utf8_nextCharSafeBody(),
 *   called with -2 as its last argument.
 * - Any other byte yields U_SENTINEL.
 * i and c must be modifiable lvalues; every argument is parenthesized so
 * that arbitrary expressions can be passed safely.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
47
/*
 * Lenient backward read: pre-decrement i and fetch the byte at s[i] into c.
 * - A byte below 0x80 is returned unchanged.
 * - A byte in 0x80..0xbf is handed to utf8_prevCharSafeBody(), called with
 *   -2 as its last argument.
 * - A byte above 0xbf yields U_SENTINEL.
 * i and c must be modifiable lvalues; every argument is parenthesized so
 * that arbitrary expressions can be passed safely.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
58
59 /* -------------------------------------------------------------------------- */
60
61 static void printUChars(const uint8_t *uchars, int16_t len);
62
63 static void TestCodeUnitValues(void);
64 static void TestCharLength(void);
65 static void TestGetChar(void);
66 static void TestNextPrevChar(void);
67 static void TestFwdBack(void);
68 static void TestSetChar(void);
69 static void TestAppendChar(void);
70 static void TestAppend(void);
71 static void TestSurrogates(void);
72
73 void addUTF8Test(TestNode** root);
74
75 void
addUTF8Test(TestNode ** root)76 addUTF8Test(TestNode** root)
77 {
78 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues");
79 addTest(root, &TestCharLength, "utf8tst/TestCharLength" );
80 addTest(root, &TestGetChar, "utf8tst/TestGetChar" );
81 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar" );
82 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack" );
83 addTest(root, &TestSetChar, "utf8tst/TestSetChar" );
84 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar" );
85 addTest(root, &TestAppend, "utf8tst/TestAppend" );
86 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates" );
87 }
88
TestCodeUnitValues()89 static void TestCodeUnitValues()
90 {
91 static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
92
93 int16_t i;
94 for(i=0; i<sizeof(codeunit)/sizeof(codeunit[0]); i++){
95 uint8_t c=codeunit[i];
96 log_verbose("Testing code unit value of %x\n", c);
97 if(i<4){
98 if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){
99 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
100 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
101 }
102 } else if(i< 8){
103 if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){
104 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
105 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
106 }
107 } else if(i< 12){
108 if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
109 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
110 c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
111 }
112 }
113 }
114 }
115
TestCharLength()116 static void TestCharLength()
117 {
118 static const uint32_t codepoint[]={
119 1, 0x0061,
120 1, 0x007f,
121 2, 0x016f,
122 2, 0x07ff,
123 3, 0x0865,
124 3, 0x20ac,
125 4, 0x20402,
126 4, 0x23456,
127 4, 0x24506,
128 4, 0x20402,
129 4, 0x10402,
130 3, 0xd7ff,
131 3, 0xe000,
132
133 };
134
135 int16_t i;
136 UBool multiple;
137 for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
138 UChar32 c=codepoint[i+1];
139 if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
140 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
141 }else{
142 log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c) );
143 }
144 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
145 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
146 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
147 }
148 }
149 }
150
TestGetChar()151 static void TestGetChar()
152 {
153 static const uint8_t input[]={
154 /* code unit,*/
155 0x61,
156 0x7f,
157 0xe4,
158 0xba,
159 0x8c,
160 0xF0,
161 0x90,
162 0x90,
163 0x81,
164 0xc0,
165 0x65,
166 0x31,
167 0x9a,
168 0xc9
169 };
170 static const UChar32 result[]={
171 /*codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict)*/
172 0x61, 0x61, 0x61,
173 0x7f, 0x7f, 0x7f,
174 0x4e8c, 0x4e8c, 0x4e8c,
175 0x4e8c, 0x4e8c, 0x4e8c ,
176 0x4e8c, 0x4e8c, 0x4e8c,
177 0x10401, 0x10401, 0x10401 ,
178 0x10401, 0x10401, 0x10401 ,
179 0x10401, 0x10401, 0x10401 ,
180 0x10401, 0x10401, 0x10401,
181 0x25, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
182 0x65, 0x65, 0x65,
183 0x31, 0x31, 0x31,
184 0x31, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
185 0x240, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
186 };
187 uint16_t i=0;
188 UChar32 c;
189 uint32_t offset=0;
190
191 for(offset=0; offset<sizeof(input); offset++) {
192 if (offset < sizeof(input) - 1) {
193 UTF8_GET_CHAR_UNSAFE(input, offset, c);
194 if(c != result[i]){
195 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
196
197 }
198
199 U8_GET_UNSAFE(input, offset, c);
200 if(c != result[i]){
201 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
202
203 }
204 }
205
206 U8_GET(input, 0, offset, sizeof(input), c);
207 if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
208 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
209 }
210
211 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
212 if(c != result[i+1]){
213 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
214 }
215
216 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
217 if(c != result[i+2]){
218 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
219 }
220
221 i=(uint16_t)(i+3);
222 }
223 }
224
TestNextPrevChar()225 static void TestNextPrevChar(){
226 static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
227 static const UChar32 result[]={
228 /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/
229 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000,
230 0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
231 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
232 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
233 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
234 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61,
235 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
236 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
237 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
238 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
239 0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
240 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401,
241 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
242 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
243 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
244 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061
245 };
246 static const int32_t movedOffset[]={
247 /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/
248 1, 1, 1, 15, 15, 15,
249 5, 5, 5, 14, 14 , 14,
250 3, 3, 3, 9, 13, 13,
251 4, 4, 4, 9, 12, 12,
252 5, 5, 5, 9, 11, 11,
253 7, 7, 7, 10, 10, 10,
254 7, 7, 7, 9, 9, 9,
255 8, 9, 9, 7, 7, 7,
256 9, 9, 9, 7, 7, 7,
257 11, 10, 10, 5, 5, 5,
258 11, 11, 11, 5, 5, 5,
259 12, 12, 12, 1, 1, 1,
260 13, 13, 13, 1, 1, 1,
261 14, 14, 14, 1, 1, 1,
262 14, 15, 15, 1, 1, 1,
263 14, 16, 16, 0, 0, 0,
264
265
266 };
267
268
269 UChar32 c=0x0000;
270 uint32_t i=0;
271 uint32_t offset=0;
272 int32_t setOffset=0;
273 for(offset=0; offset<sizeof(input); offset++){
274 if (offset < sizeof(input) - 2) { /* Can't have it go off the end of the array based on input */
275 setOffset=offset;
276 UTF8_NEXT_CHAR_UNSAFE(input, setOffset, c);
277 if(setOffset != movedOffset[i]){
278 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
279 offset, movedOffset[i], setOffset);
280 }
281 if(c != result[i]){
282 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
283 }
284
285 setOffset=offset;
286 U8_NEXT_UNSAFE(input, setOffset, c);
287 if(setOffset != movedOffset[i]){
288 log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
289 offset, movedOffset[i], setOffset);
290 }
291 if(c != result[i]){
292 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
293 }
294 }
295
296 setOffset=offset;
297 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
298 if(setOffset != movedOffset[i+1]){
299 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
300 offset, movedOffset[i+1], setOffset);
301 }
302 if(c != result[i+1]){
303 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
304 }
305
306 setOffset=offset;
307 U8_NEXT(input, setOffset, sizeof(input), c);
308 if(setOffset != movedOffset[i+1]){
309 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
310 offset, movedOffset[i+1], setOffset);
311 }
312 if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
313 log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
314 }
315
316 setOffset=offset;
317 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
318 if(setOffset != movedOffset[i+1]){
319 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
320 offset, movedOffset[i+2], setOffset);
321 }
322 if(c != result[i+2]){
323 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
324 }
325
326 i=i+6;
327 }
328
329 i=0;
330 for(offset=sizeof(input); offset > 0; --offset){
331 setOffset=offset;
332 UTF8_PREV_CHAR_UNSAFE(input, setOffset, c);
333 if(setOffset != movedOffset[i+3]){
334 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
335 offset, movedOffset[i+3], setOffset);
336 }
337 if(c != result[i+3]){
338 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
339 }
340
341 setOffset=offset;
342 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
343 if(setOffset != movedOffset[i+4]){
344 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
345 offset, movedOffset[i+4], setOffset);
346 }
347 if(c != result[i+4]){
348 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
349 }
350
351 setOffset=offset;
352 U8_PREV(input, 0, setOffset, c);
353 if(setOffset != movedOffset[i+4]){
354 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
355 offset, movedOffset[i+4], setOffset);
356 }
357 if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
358 log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
359 }
360
361 setOffset=offset;
362 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
363 if(setOffset != movedOffset[i+5]){
364 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
365 offset, movedOffset[i+5], setOffset);
366 }
367 if(c != result[i+5]){
368 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
369 }
370
371 i=i+6;
372 }
373
374 {
375 /* test non-characters */
376 static const uint8_t nonChars[]={
377 0xef, 0xb7, 0x90, /* U+fdd0 */
378 0xef, 0xbf, 0xbf, /* U+feff */
379 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
380 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
381 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
382 };
383
384 UChar32 ch;
385 int32_t idx;
386
387 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
388 U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
389 if(!U_IS_UNICODE_NONCHAR(ch)) {
390 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
391 }
392 }
393 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
394 U8_PREV(nonChars, 0, idx, ch);
395 if(!U_IS_UNICODE_NONCHAR(ch)) {
396 log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
397 }
398 }
399 }
400 }
401
TestFwdBack()402 static void TestFwdBack(){
403 static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
404 static const uint16_t fwd_unsafe[] ={1, 5, 6, 7, 9, 10, 11, 13, 14, 15, 16, 20, };
405 static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
406 static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
407 static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
408
409 static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
410 static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
411 static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
412 static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
413 static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0};
414
415
416 uint32_t offunsafe=0, offsafe=0;
417
418 uint32_t i=0;
419 while(offunsafe < sizeof(input)){
420 UTF8_FWD_1_UNSAFE(input, offunsafe);
421 if(offunsafe != fwd_unsafe[i]){
422 log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
423 }
424 i++;
425 }
426
427 i=0;
428 while(offunsafe < sizeof(input)){
429 U8_FWD_1_UNSAFE(input, offunsafe);
430 if(offunsafe != fwd_unsafe[i]){
431 log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
432 }
433 i++;
434 }
435
436 i=0;
437 while(offsafe < sizeof(input)){
438 UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
439 if(offsafe != fwd_safe[i]){
440 log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
441 }
442 i++;
443 }
444
445 i=0;
446 while(offsafe < sizeof(input)){
447 U8_FWD_1(input, offsafe, sizeof(input));
448 if(offsafe != fwd_safe[i]){
449 log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
450 }
451 i++;
452 }
453
454 offunsafe=sizeof(input);
455 i=0;
456 while(offunsafe > 0){
457 UTF8_BACK_1_UNSAFE(input, offunsafe);
458 if(offunsafe != back_unsafe[i]){
459 log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
460 }
461 i++;
462 }
463
464 offunsafe=sizeof(input);
465 i=0;
466 while(offunsafe > 0){
467 U8_BACK_1_UNSAFE(input, offunsafe);
468 if(offunsafe != back_unsafe[i]){
469 log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
470 }
471 i++;
472 }
473
474 i=0;
475 offsafe=sizeof(input);
476 while(offsafe > 0){
477 UTF8_BACK_1_SAFE(input, 0, offsafe);
478 if(offsafe != back_safe[i]){
479 log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
480 }
481 i++;
482 }
483
484 i=0;
485 offsafe=sizeof(input);
486 while(offsafe > 0){
487 U8_BACK_1(input, 0, offsafe);
488 if(offsafe != back_safe[i]){
489 log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
490 }
491 i++;
492 }
493
494 offunsafe=0;
495 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
496 UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
497 if(offunsafe != fwd_N_unsafe[i]){
498 log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
499 }
500 }
501
502 offunsafe=0;
503 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
504 U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
505 if(offunsafe != fwd_N_unsafe[i]){
506 log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
507 }
508 }
509
510 offsafe=0;
511 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
512 UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
513 if(offsafe != fwd_N_safe[i]){
514 log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
515 }
516
517 }
518
519 offsafe=0;
520 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
521 U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
522 if(offsafe != fwd_N_safe[i]){
523 log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
524 }
525
526 }
527
528 offunsafe=sizeof(input);
529 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
530 UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
531 if(offunsafe != back_N_unsafe[i]){
532 log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
533 }
534 }
535
536 offunsafe=sizeof(input);
537 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
538 U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
539 if(offunsafe != back_N_unsafe[i]){
540 log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
541 }
542 }
543
544 offsafe=sizeof(input);
545 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
546 UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
547 if(offsafe != back_N_safe[i]){
548 log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
549 }
550 }
551
552 offsafe=sizeof(input);
553 for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
554 U8_BACK_N(input, 0, offsafe, Nvalue[i]);
555 if(offsafe != back_N_safe[i]){
556 log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
557 }
558 }
559 }
560
TestSetChar()561 static void TestSetChar(){
562 static const uint8_t input[]
563 = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
564 static const int16_t start_unsafe[]
565 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13 };
566 static const int16_t start_safe[]
567 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };
568 static const int16_t limit_unsafe[]
569 = {0, 1, 4, 4, 4, 5, 6, 7, 9, 9, 10, 10, 10, 15 };
570 static const int16_t limit_safe[]
571 = {0, 1, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };
572
573 uint32_t i=0;
574 int32_t offset=0, setOffset=0;
575 for(offset=0; offset<(int32_t)sizeof(input); offset++){
576 setOffset=offset;
577 UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
578 if(setOffset != start_unsafe[i]){
579 log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
580 }
581
582 setOffset=offset;
583 U8_SET_CP_START_UNSAFE(input, setOffset);
584 if(setOffset != start_unsafe[i]){
585 log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
586 }
587
588 setOffset=offset;
589 UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
590 if(setOffset != start_safe[i]){
591 log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
592 }
593
594 setOffset=offset;
595 U8_SET_CP_START(input, 0, setOffset);
596 if(setOffset != start_safe[i]){
597 log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
598 }
599
600 if (offset != 0) { /* Can't have it go off the end of the array */
601 setOffset=offset;
602 UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
603 if(setOffset != limit_unsafe[i]){
604 log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
605 }
606
607 setOffset=offset;
608 U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
609 if(setOffset != limit_unsafe[i]){
610 log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
611 }
612 }
613
614 setOffset=offset;
615 UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
616 if(setOffset != limit_safe[i]){
617 log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
618 }
619
620 setOffset=offset;
621 U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
622 if(setOffset != limit_safe[i]){
623 log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
624 }
625
626 i++;
627 }
628 }
629
TestAppendChar()630 static void TestAppendChar(){
631 static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
632 static const uint32_t test[]={
633 /*append-position(unsafe), CHAR to be appended */
634 0, 0x10401,
635 2, 0x0028,
636 2, 0x007f,
637 3, 0xd801,
638 1, 0x20402,
639 8, 0x10401,
640 5, 0xc0,
641 5, 0xc1,
642 5, 0xfd,
643 6, 0x80,
644 6, 0x81,
645 6, 0xbf,
646 7, 0xfe,
647
648 /*append-position(safe), CHAR to be appended */
649 0, 0x10401,
650 2, 0x0028,
651 3, 0x7f,
652 3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
653 1, 0x20402,
654 9, 0x10401,
655 5, 0xc0,
656 5, 0xc1,
657 5, 0xfd,
658 6, 0x80,
659 6, 0x81,
660 6, 0xbf,
661 7, 0xfe,
662
663 };
664 static const uint16_t movedOffset[]={
665 /*offset-moved-to(unsafe)*/
666 4, /*for append-pos: 0 , CHAR 0x10401*/
667 3,
668 3,
669 6,
670 5,
671 12,
672 7,
673 7,
674 7,
675 8,
676 8,
677 8,
678 9,
679
680 /*offset-moved-to(safe)*/
681 4, /*for append-pos: 0, CHAR 0x10401*/
682 3,
683 4,
684 6,
685 5,
686 11,
687 7,
688 7,
689 7,
690 8,
691 8,
692 8,
693 9,
694
695 };
696
697 static const uint8_t result[][11]={
698 /*unsafe*/
699 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
700 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
701 {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
702 {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
703 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
704 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},
705
706 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
707 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
708 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
709
710 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
711 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
712 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
713
714 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
715 /*safe*/
716 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
717 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
718 {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
719 {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
720 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
721 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc0, 0x9f*/
722
723 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
724 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
725 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
726
727 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
728 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
729 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
730
731 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
732
733 };
734 uint16_t i, count=0;
735 uint8_t str[12];
736 uint32_t offset;
737 /* UChar32 c=0;*/
738 uint16_t size=sizeof(s)/sizeof(s[0]);
739 for(i=0; i<sizeof(test)/sizeof(test[0]); i=(uint16_t)(i+2)){
740 uprv_memcpy(str, s, size);
741 offset=test[i];
742 if(count<13){
743 UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
744 if(offset != movedOffset[count]){
745 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
746 count, movedOffset[count], offset);
747
748 }
749 if(uprv_memcmp(str, result[count], size) !=0){
750 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count);
751 printUChars(result[count], size);
752 log_err("\nGot: ");
753 printUChars(str, size);
754 log_err("\n");
755 }
756 }else{
757 UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
758 if(offset != movedOffset[count]){
759 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
760 count, movedOffset[count], offset);
761
762 }
763 if(uprv_memcmp(str, result[count], size) !=0){
764 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count);
765 printUChars(result[count], size);
766 log_err("\nGot: ");
767 printUChars(str, size);
768 log_err("\n");
769 }
770 /*call the API instead of MACRO
771 uprv_memcpy(str, s, size);
772 offset=test[i];
773 c=test[i+1];
774 if((uint32_t)(c)<=0x7f) {
775 (str)[(offset)++]=(uint8_t)(c);
776 } else {
777 (offset)=utf8_appendCharSafeBody(str, (int32_t)(offset), (int32_t)(size), c);
778 }
779 if(offset != movedOffset[count]){
780 log_err("ERROR: utf8_appendCharSafeBody() failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
781 count, movedOffset[count], offset);
782
783 }
784 if(uprv_memcmp(str, result[count], size) !=0){
785 log_err("ERROR: utf8_appendCharSafeBody() failed for count=%d. \nExpected:", count);
786 printUChars(result[count], size);
787 printf("\nGot: ");
788 printUChars(str, size);
789 printf("\n");
790 }
791 */
792 }
793 count++;
794 }
795
796
797 }
798
TestAppend()799 static void TestAppend() {
800 static const UChar32 codePoints[]={
801 0x61, 0xdf, 0x901, 0x3040,
802 0xac00, 0xd800, 0xdbff, 0xdcde,
803 0xdffd, 0xe000, 0xffff, 0x10000,
804 0x12345, 0xe0021, 0x10ffff, 0x110000,
805 0x234567, 0x7fffffff, -1, -1000,
806 0, 0x400
807 };
808 static const uint8_t expectUnsafe[]={
809 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
810 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
811 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
812 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
813 /* none from this line */
814 0, 0xd0, 0x80
815 }, expectSafe[]={
816 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
817 0xea, 0xb0, 0x80, /* no surrogates */
818 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
819 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
820 /* none from this line */
821 0, 0xd0, 0x80
822 };
823
824 uint8_t buffer[100];
825 UChar32 c;
826 int32_t i, length;
827 UBool isError, expectIsError, wrongIsError;
828
829 length=0;
830 for(i=0; i<LENGTHOF(codePoints); ++i) {
831 c=codePoints[i];
832 if(c<0 || 0x10ffff<c) {
833 continue; /* skip non-code points for U8_APPEND_UNSAFE */
834 }
835
836 U8_APPEND_UNSAFE(buffer, length, c);
837 }
838 if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
839 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
840 }
841
842 length=0;
843 wrongIsError=FALSE;
844 for(i=0; i<LENGTHOF(codePoints); ++i) {
845 c=codePoints[i];
846 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
847 isError=FALSE;
848
849 U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
850 wrongIsError|= isError!=expectIsError;
851 }
852 if(wrongIsError) {
853 log_err("U8_APPEND did not set isError correctly\n");
854 }
855 if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
856 log_err("U8_APPEND did not generate the expected output\n");
857 }
858 }
859
860 static void
TestSurrogates()861 TestSurrogates() {
862 static const uint8_t b[]={
863 0xc3, 0x9f, /* 00DF */
864 0xed, 0x9f, 0xbf, /* D7FF */
865 0xed, 0xa0, 0x81, /* D801 */
866 0xed, 0xbf, 0xbe, /* DFFE */
867 0xee, 0x80, 0x80, /* E000 */
868 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
869 };
870 static const UChar32 cp[]={
871 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
872 };
873
874 UChar32 cu, cs, cl;
875 int32_t i, j, k, iu, is, il, length;
876
877 k=0; /* index into cp[] */
878 length=LENGTHOF(b);
879 for(i=0; i<length;) {
880 j=i;
881 U8_NEXT_UNSAFE(b, j, cu);
882 iu=j;
883
884 j=i;
885 U8_NEXT(b, j, length, cs);
886 is=j;
887
888 j=i;
889 L8_NEXT(b, j, length, cl);
890 il=j;
891
892 if(cu!=cp[k]) {
893 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
894 }
895
896 /* U8_NEXT() returns <0 for surrogate code points */
897 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
898 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
899 }
900
901 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
902 if(cl!=cu) {
903 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
904 }
905
906 if(is!=iu || il!=iu) {
907 log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
908 }
909
910 ++k; /* next code point */
911 i=iu; /* advance by one UTF-8 sequence */
912 }
913
914 while(i>0) {
915 --k; /* previous code point */
916
917 j=i;
918 U8_PREV_UNSAFE(b, j, cu);
919 iu=j;
920
921 j=i;
922 U8_PREV(b, 0, j, cs);
923 is=j;
924
925 j=i;
926 L8_PREV(b, 0, j, cl);
927 il=j;
928
929 if(cu!=cp[k]) {
930 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
931 }
932
933 /* U8_PREV() returns <0 for surrogate code points */
934 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
935 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
936 }
937
938 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
939 if(cl!=cu) {
940 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
941 }
942
943 if(is!=iu || il !=iu) {
944 log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
945 }
946
947 i=iu; /* go back by one UTF-8 sequence */
948 }
949 }
950
/*
 * Log the first len bytes of uchars through log_err(), each formatted as
 * "0x%02x " (two hex digits followed by a space).  Nothing is logged when
 * len is not positive.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    int32_t idx;

    for(idx=0; idx<len; ++idx) {
        log_err("0x%02x ", uchars[idx]);
    }
}
957