• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2003-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  convtest.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2003jul15
16 *   created by: Markus W. Scherer
17 *
18 *   Test file for data-driven conversion tests.
19 */
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_LEGACY_CONVERSION
/*
 * Note: Turning off all of convtest.cpp if UCONFIG_NO_LEGACY_CONVERSION is set
 * is slightly more than necessary - it also removes tests for Unicode charsets
 * like UTF-8 that should still work.
 * However, there is no easy way for the test to detect whether a test case
 * is for a Unicode charset, so it would be difficult to exclude only the others.
 * Also, regular testing of ICU is done with all modules on, therefore
 * not testing conversion for a custom configuration like this should be ok.
 */
33 
34 #include "unicode/ucnv.h"
35 #include "unicode/unistr.h"
36 #include "unicode/parsepos.h"
37 #include "unicode/uniset.h"
38 #include "unicode/ustring.h"
39 #include "unicode/ures.h"
40 #include "convtest.h"
41 #include "cmemory.h"
42 #include "unicode/tstdtmod.h"
43 #include <string.h>
44 #include <stdlib.h>
45 
// Characters that the test data uses in its "callback" field
// to select a converter callback function.
enum {
    SUB_CB='?',     // selects the SUBSTITUTE callback
    SKIP_CB='0',    // selects the SKIP callback
    STOP_CB='.',    // selects the STOP callback
    ESC_CB='&'      // selects the ESCAPE callback
};
53 
ConversionTest()54 ConversionTest::ConversionTest() {
55     UErrorCode errorCode=U_ZERO_ERROR;
56     utf8Cnv=ucnv_open("UTF-8", &errorCode);
57     ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
58     if(U_FAILURE(errorCode)) {
59         errln("unable to open UTF-8 converter");
60     }
61 }
62 
~ConversionTest()63 ConversionTest::~ConversionTest() {
64     ucnv_close(utf8Cnv);
65 }
66 
67 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)68 ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
69     if (exec) logln("TestSuite ConversionTest: ");
70     switch (index) {
71 #if !UCONFIG_NO_FILE_IO
72         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
73         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
74         case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
75         case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break;
76 #else
77         case 0:
78         case 1:
79         case 2:
80         case 3: name="skip"; break;
81 #endif
82         case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
83         default: name=""; break; //needed to end loop
84     }
85 }
86 
87 // test data interface ----------------------------------------------------- ***
88 
89 void
TestToUnicode()90 ConversionTest::TestToUnicode() {
91     ConversionCase cc;
92     char charset[100], cbopt[4];
93     const char *option;
94     UnicodeString s, unicode;
95     int32_t offsetsLength;
96     UConverterToUCallback callback;
97 
98     TestDataModule *dataModule;
99     TestData *testData;
100     const DataMap *testCase;
101     UErrorCode errorCode;
102     int32_t i;
103 
104     errorCode=U_ZERO_ERROR;
105     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
106     if(U_SUCCESS(errorCode)) {
107         testData=dataModule->createTestData("toUnicode", errorCode);
108         if(U_SUCCESS(errorCode)) {
109             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
110                 if(U_FAILURE(errorCode)) {
111                     errln("error retrieving conversion/toUnicode test case %d - %s",
112                             i, u_errorName(errorCode));
113                     errorCode=U_ZERO_ERROR;
114                     continue;
115                 }
116 
117                 cc.caseNr=i;
118 
119                 s=testCase->getString("charset", errorCode);
120                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
121                 cc.charset=charset;
122 
123                 // BEGIN android-added
124                 // To save space, Android does not build full ISO-2022-CN tables.
125                 // We skip the TestGetKeywordValuesForLocale for counting available collations.
126                 if (strlen(charset) >= 8 &&
127                     strncmp(charset+4, "2022-CN", 4) == 0) {
128                     continue;
129                 }
130                 // END android-added
131 
132                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
133                 unicode=testCase->getString("unicode", errorCode);
134                 cc.unicode=unicode.getBuffer();
135                 cc.unicodeLength=unicode.length();
136 
137                 offsetsLength=0;
138                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
139                 if(offsetsLength==0) {
140                     cc.offsets=NULL;
141                 } else if(offsetsLength!=unicode.length()) {
142                     errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
143                             i, unicode.length(), offsetsLength);
144                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
145                 }
146 
147                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
148                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
149 
150                 s=testCase->getString("errorCode", errorCode);
151                 if(s==UNICODE_STRING("invalid", 7)) {
152                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
153                 } else if(s==UNICODE_STRING("illegal", 7)) {
154                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
155                 } else if(s==UNICODE_STRING("truncated", 9)) {
156                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
157                 } else if(s==UNICODE_STRING("illesc", 6)) {
158                     cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
159                 } else if(s==UNICODE_STRING("unsuppesc", 9)) {
160                     cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
161                 } else {
162                     cc.outErrorCode=U_ZERO_ERROR;
163                 }
164 
165                 s=testCase->getString("callback", errorCode);
166                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
167                 cc.cbopt=cbopt;
168                 switch(cbopt[0]) {
169                 case SUB_CB:
170                     callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
171                     break;
172                 case SKIP_CB:
173                     callback=UCNV_TO_U_CALLBACK_SKIP;
174                     break;
175                 case STOP_CB:
176                     callback=UCNV_TO_U_CALLBACK_STOP;
177                     break;
178                 case ESC_CB:
179                     callback=UCNV_TO_U_CALLBACK_ESCAPE;
180                     break;
181                 default:
182                     callback=NULL;
183                     break;
184                 }
185                 option=callback==NULL ? cbopt : cbopt+1;
186                 if(*option==0) {
187                     option=NULL;
188                 }
189 
190                 cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
191 
192                 if(U_FAILURE(errorCode)) {
193                     errln("error parsing conversion/toUnicode test case %d - %s",
194                             i, u_errorName(errorCode));
195                     errorCode=U_ZERO_ERROR;
196                 } else {
197                     logln("TestToUnicode[%d] %s", i, charset);
198                     ToUnicodeCase(cc, callback, option);
199                 }
200             }
201             delete testData;
202         }
203         delete dataModule;
204     }
205     else {
206         dataerrln("Could not load test conversion data");
207     }
208 }
209 
210 void
TestFromUnicode()211 ConversionTest::TestFromUnicode() {
212     ConversionCase cc;
213     char charset[100], cbopt[4];
214     const char *option;
215     UnicodeString s, unicode, invalidUChars;
216     int32_t offsetsLength, index;
217     UConverterFromUCallback callback;
218 
219     TestDataModule *dataModule;
220     TestData *testData;
221     const DataMap *testCase;
222     const UChar *p;
223     UErrorCode errorCode;
224     int32_t i, length;
225 
226     errorCode=U_ZERO_ERROR;
227     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
228     if(U_SUCCESS(errorCode)) {
229         testData=dataModule->createTestData("fromUnicode", errorCode);
230         if(U_SUCCESS(errorCode)) {
231             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
232                 if(U_FAILURE(errorCode)) {
233                     errln("error retrieving conversion/fromUnicode test case %d - %s",
234                             i, u_errorName(errorCode));
235                     errorCode=U_ZERO_ERROR;
236                     continue;
237                 }
238 
239                 cc.caseNr=i;
240 
241                 s=testCase->getString("charset", errorCode);
242                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
243                 cc.charset=charset;
244 
245                 // BEGIN android-added
246                 // To save space, Android does not build full ISO-2022-CN tables.
247                 // We skip the TestGetKeywordValuesForLocale for counting available collations.
248                 if (strlen(charset) >= 8 &&
249                     strncmp(charset+4, "2022-CN", 4) == 0) {
250                     continue;
251                 }
252                 // END android-added
253 
254                 unicode=testCase->getString("unicode", errorCode);
255                 cc.unicode=unicode.getBuffer();
256                 cc.unicodeLength=unicode.length();
257                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
258 
259                 offsetsLength=0;
260                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
261                 if(offsetsLength==0) {
262                     cc.offsets=NULL;
263                 } else if(offsetsLength!=cc.bytesLength) {
264                     errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
265                             i, cc.bytesLength, offsetsLength);
266                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
267                 }
268 
269                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
270                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
271 
272                 s=testCase->getString("errorCode", errorCode);
273                 if(s==UNICODE_STRING("invalid", 7)) {
274                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
275                 } else if(s==UNICODE_STRING("illegal", 7)) {
276                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
277                 } else if(s==UNICODE_STRING("truncated", 9)) {
278                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
279                 } else {
280                     cc.outErrorCode=U_ZERO_ERROR;
281                 }
282 
283                 s=testCase->getString("callback", errorCode);
284                 cc.setSub=0; // default: no subchar
285 
286                 if((index=s.indexOf((UChar)0))>0) {
287                     // read NUL-separated subchar first, if any
288                     // copy the subchar from Latin-1 characters
289                     // start after the NUL
290                     p=s.getTerminatedBuffer();
291                     length=index+1;
292                     p+=length;
293                     length=s.length()-length;
294                     if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) {
295                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
296                     } else {
297                         int32_t j;
298 
299                         for(j=0; j<length; ++j) {
300                             cc.subchar[j]=(char)p[j];
301                         }
302                         // NUL-terminate the subchar
303                         cc.subchar[j]=0;
304                         cc.setSub=1;
305                     }
306 
307                     // remove the NUL and subchar from s
308                     s.truncate(index);
309                 } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
310                     // read a substitution string, separated by an equal sign
311                     p=s.getBuffer()+index+1;
312                     length=s.length()-(index+1);
313                     if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) {
314                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
315                     } else {
316                         u_memcpy(cc.subString, p, length);
317                         // NUL-terminate the subString
318                         cc.subString[length]=0;
319                         cc.setSub=-1;
320                     }
321 
322                     // remove the equal sign and subString from s
323                     s.truncate(index);
324                 }
325 
326                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
327                 cc.cbopt=cbopt;
328                 switch(cbopt[0]) {
329                 case SUB_CB:
330                     callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
331                     break;
332                 case SKIP_CB:
333                     callback=UCNV_FROM_U_CALLBACK_SKIP;
334                     break;
335                 case STOP_CB:
336                     callback=UCNV_FROM_U_CALLBACK_STOP;
337                     break;
338                 case ESC_CB:
339                     callback=UCNV_FROM_U_CALLBACK_ESCAPE;
340                     break;
341                 default:
342                     callback=NULL;
343                     break;
344                 }
345                 option=callback==NULL ? cbopt : cbopt+1;
346                 if(*option==0) {
347                     option=NULL;
348                 }
349 
350                 invalidUChars=testCase->getString("invalidUChars", errorCode);
351                 cc.invalidUChars=invalidUChars.getBuffer();
352                 cc.invalidLength=invalidUChars.length();
353 
354                 if(U_FAILURE(errorCode)) {
355                     errln("error parsing conversion/fromUnicode test case %d - %s",
356                             i, u_errorName(errorCode));
357                     errorCode=U_ZERO_ERROR;
358                 } else {
359                     logln("TestFromUnicode[%d] %s", i, charset);
360                     FromUnicodeCase(cc, callback, option);
361                 }
362             }
363             delete testData;
364         }
365         delete dataModule;
366     }
367     else {
368         dataerrln("Could not load test conversion data");
369     }
370 }
371 
372 static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
373 
374 void
TestGetUnicodeSet()375 ConversionTest::TestGetUnicodeSet() {
376     char charset[100];
377     UnicodeString s, map, mapnot;
378     int32_t which;
379 
380     ParsePosition pos;
381     UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
382     UnicodeSet *cnvSetPtr = &cnvSet;
383     LocalUConverterPointer cnv;
384 
385     TestDataModule *dataModule;
386     TestData *testData;
387     const DataMap *testCase;
388     UErrorCode errorCode;
389     int32_t i;
390 
391     errorCode=U_ZERO_ERROR;
392     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
393     if(U_SUCCESS(errorCode)) {
394         testData=dataModule->createTestData("getUnicodeSet", errorCode);
395         if(U_SUCCESS(errorCode)) {
396             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
397                 if(U_FAILURE(errorCode)) {
398                     errln("error retrieving conversion/getUnicodeSet test case %d - %s",
399                             i, u_errorName(errorCode));
400                     errorCode=U_ZERO_ERROR;
401                     continue;
402                 }
403 
404                 s=testCase->getString("charset", errorCode);
405                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
406 
407                 // BEGIN android-added
408                 // To save space, Android does not build full ISO-2022-CN tables.
409                 // We skip the TestGetKeywordValuesForLocale for counting available collations.
410                 if (strlen(charset) >= 8 &&
411                     strncmp(charset+4, "2022-CN", 4) == 0) {
412                     continue;
413                 }
414                 // END android-added
415 
416                 map=testCase->getString("map", errorCode);
417                 mapnot=testCase->getString("mapnot", errorCode);
418 
419                 which=testCase->getInt28("which", errorCode);
420 
421                 if(U_FAILURE(errorCode)) {
422                     errln("error parsing conversion/getUnicodeSet test case %d - %s",
423                             i, u_errorName(errorCode));
424                     errorCode=U_ZERO_ERROR;
425                     continue;
426                 }
427 
428                 // test this test case
429                 mapSet.clear();
430                 mapnotSet.clear();
431 
432                 pos.setIndex(0);
433                 mapSet.applyPattern(map, pos, 0, NULL, errorCode);
434                 if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
435                     errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
436                           "    error index %d  index %d  U+%04x",
437                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
438                     errorCode=U_ZERO_ERROR;
439                     continue;
440                 }
441 
442                 pos.setIndex(0);
443                 mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
444                 if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
445                     errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
446                           "    error index %d  index %d  U+%04x",
447                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
448                     errorCode=U_ZERO_ERROR;
449                     continue;
450                 }
451 
452                 logln("TestGetUnicodeSet[%d] %s", i, charset);
453 
454                 cnv.adoptInstead(cnv_open(charset, errorCode));
455                 if(U_FAILURE(errorCode)) {
456                     errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
457                             charset, i, u_errorName(errorCode));
458                     errorCode=U_ZERO_ERROR;
459                     continue;
460                 }
461 
462                 ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode);
463 
464                 if(U_FAILURE(errorCode)) {
465                     errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
466                             charset, i, u_errorName(errorCode));
467                     errorCode=U_ZERO_ERROR;
468                     continue;
469                 }
470 
471                 // are there items that must be in cnvSet but are not?
472                 (diffSet=mapSet).removeAll(cnvSet);
473                 if(!diffSet.isEmpty()) {
474                     diffSet.toPattern(s, TRUE);
475                     if(s.length()>100) {
476                         s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
477                     }
478                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
479                             charset, i);
480                     errln(s);
481                 }
482 
483                 // are there items that must not be in cnvSet but are?
484                 (diffSet=mapnotSet).retainAll(cnvSet);
485                 if(!diffSet.isEmpty()) {
486                     diffSet.toPattern(s, TRUE);
487                     if(s.length()>100) {
488                         s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
489                     }
490                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
491                             charset, i);
492                     errln(s);
493                 }
494             }
495             delete testData;
496         }
497         delete dataModule;
498     }
499     else {
500         dataerrln("Could not load test conversion data");
501     }
502 }
503 
504 U_CDECL_BEGIN
505 static void U_CALLCONV
getUnicodeSetCallback(const void * context,UConverterFromUnicodeArgs *,const UChar *,int32_t,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * pErrorCode)506 getUnicodeSetCallback(const void *context,
507                       UConverterFromUnicodeArgs * /*fromUArgs*/,
508                       const UChar* /*codeUnits*/,
509                       int32_t /*length*/,
510                       UChar32 codePoint,
511                       UConverterCallbackReason reason,
512                       UErrorCode *pErrorCode) {
513     if(reason<=UCNV_IRREGULAR) {
514         ((UnicodeSet *)context)->remove(codePoint);  // the converter cannot convert this code point
515         *pErrorCode=U_ZERO_ERROR;                    // skip
516     }  // else ignore the reset, close and clone calls.
517 }
518 U_CDECL_END
519 
520 // Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
521 void
TestGetUnicodeSet2()522 ConversionTest::TestGetUnicodeSet2() {
523     // Build a string with all code points.
524     UChar32 cpLimit;
525     int32_t s0Length;
526     if(quick) {
527         cpLimit=s0Length=0x10000;  // BMP only
528     } else {
529         cpLimit=0x110000;
530         s0Length=0x10000+0x200000;  // BMP + surrogate pairs
531     }
532     UChar *s0=new UChar[s0Length];
533     if(s0==NULL) {
534         return;
535     }
536     UChar *s=s0;
537     UChar32 c;
538     UChar c2;
539     // low BMP
540     for(c=0; c<=0xd7ff; ++c) {
541         *s++=(UChar)c;
542     }
543     // trail surrogates
544     for(c=0xdc00; c<=0xdfff; ++c) {
545         *s++=(UChar)c;
546     }
547     // lead surrogates
548     // (after trails so that there is not even one surrogate pair in between)
549     for(c=0xd800; c<=0xdbff; ++c) {
550         *s++=(UChar)c;
551     }
552     // high BMP
553     for(c=0xe000; c<=0xffff; ++c) {
554         *s++=(UChar)c;
555     }
556     // supplementary code points = surrogate pairs
557     if(cpLimit==0x110000) {
558         for(c=0xd800; c<=0xdbff; ++c) {
559             for(c2=0xdc00; c2<=0xdfff; ++c2) {
560                 *s++=(UChar)c;
561                 *s++=c2;
562             }
563         }
564     }
565 
566     static const char *const cnvNames[]={
567         "UTF-8",
568         "UTF-7",
569         "UTF-16",
570         "US-ASCII",
571         "ISO-8859-1",
572         "windows-1252",
573         "Shift-JIS",
574         "ibm-1390",  // EBCDIC_STATEFUL table
575         "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
576         "HZ",
577         "ISO-2022-JP",
578         "JIS7",
579         "ISO-2022-CN",
580         "ISO-2022-CN-EXT",
581         "LMBCS"
582     };
583     LocalUConverterPointer cnv;
584     char buffer[1024];
585     int32_t i;
586     for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
587         UErrorCode errorCode=U_ZERO_ERROR;
588         cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
589         if(U_FAILURE(errorCode)) {
590             errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
591             continue;
592         }
593         UnicodeSet expected;
594         ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
595         if(U_FAILURE(errorCode)) {
596             errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
597             continue;
598         }
599         UConverterUnicodeSet which;
600         for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
601             if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
602                 ucnv_setFallback(cnv.getAlias(), TRUE);
603             }
604             expected.add(0, cpLimit-1);
605             s=s0;
606             UBool flush;
607             do {
608                 char *t=buffer;
609                 flush=(UBool)(s==s0+s0Length);
610                 ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
611                 if(U_FAILURE(errorCode)) {
612                     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
613                         errorCode=U_ZERO_ERROR;
614                         continue;
615                     } else {
616                         break;  // unexpected error, should not occur
617                     }
618                 }
619             } while(!flush);
620             UnicodeSet set;
621             ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode);
622             if(cpLimit<0x110000) {
623                 set.remove(cpLimit, 0x10ffff);
624             }
625             if(which==UCNV_ROUNDTRIP_SET) {
626                 // ignore PUA code points because they will be converted even if they
627                 // are fallbacks and when other fallbacks are turned off,
628                 // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
629                 expected.remove(0xe000, 0xf8ff);
630                 expected.remove(0xf0000, 0xffffd);
631                 expected.remove(0x100000, 0x10fffd);
632                 set.remove(0xe000, 0xf8ff);
633                 set.remove(0xf0000, 0xffffd);
634                 set.remove(0x100000, 0x10fffd);
635             }
636             if(set!=expected) {
637                 // First try to see if we have different sets because ucnv_getUnicodeSet()
638                 // added strings: The above conversion method does not tell us what strings might be convertible.
639                 // Remove strings from the set and compare again.
640                 // Unfortunately, there are no good, direct set methods for finding out whether there are strings
641                 // in the set, nor for enumerating or removing just them.
642                 // Intersect all code points with the set. The intersection will not contain strings.
643                 UnicodeSet temp(0, 0x10ffff);
644                 temp.retainAll(set);
645                 set=temp;
646             }
647             if(set!=expected) {
648                 UnicodeSet diffSet;
649                 UnicodeString out;
650 
651                 // are there items that must be in the set but are not?
652                 (diffSet=expected).removeAll(set);
653                 if(!diffSet.isEmpty()) {
654                     diffSet.toPattern(out, TRUE);
655                     if(out.length()>100) {
656                         out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
657                     }
658                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
659                             cnvNames[i], which);
660                     errln(out);
661                 }
662 
663                 // are there items that must not be in the set but are?
664                 (diffSet=set).removeAll(expected);
665                 if(!diffSet.isEmpty()) {
666                     diffSet.toPattern(out, TRUE);
667                     if(out.length()>100) {
668                         out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
669                     }
670                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
671                             cnvNames[i], which);
672                     errln(out);
673                 }
674             }
675         }
676     }
677 
678     delete [] s0;
679 }
680 
681 // Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping
682 // If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated
683 void
TestDefaultIgnorableCallback()684 ConversionTest::TestDefaultIgnorableCallback() {
685     UErrorCode status = U_ZERO_ERROR;
686     const char *cnv_name = "euc-jp-2007";
687     const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
688     const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]";
689 
690     UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status);
691     if (U_FAILURE(status)) {
692         dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
693         return;
694     }
695 
696     UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status);
697     if (U_FAILURE(status)) {
698         dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
699         return;
700     }
701 
702     UConverter *cnv = cnv_open(cnv_name, status);
703     if (U_FAILURE(status)) {
704         dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
705         return;
706     }
707 
708     // set callback for the converter
709     ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
710 
711     UChar32 input[1];
712     char output[10];
713     int32_t outputLength;
714 
715     // test default ignorables are ignored
716     int size = set_ignorable->size();
717     for (int i = 0; i < size; i++) {
718         status = U_ZERO_ERROR;
719         outputLength= 0;
720 
721         input[0] = set_ignorable->charAt(i);
722 
723         outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
724         if (U_FAILURE(status) || outputLength != 0) {
725             errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
726         }
727     }
728 
729     // test non-ignorables are not ignored
730     size = set_not_ignorable->size();
731     for (int i = 0; i < size; i++) {
732         status = U_ZERO_ERROR;
733         outputLength= 0;
734 
735         input[0] = set_not_ignorable->charAt(i);
736 
737         if (input[0] == 0) {
738             continue;
739         }
740 
741         outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
742         if (U_FAILURE(status) || outputLength <= 0) {
743             errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
744         }
745     }
746 
747     ucnv_close(cnv);
748     delete set_not_ignorable;
749     delete set_ignorable;
750 }
751 
752 // open testdata or ICU data converter ------------------------------------- ***
753 
754 UConverter *
cnv_open(const char * name,UErrorCode & errorCode)755 ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
756     if(name!=NULL && *name=='+') {
757         // Converter names that start with '+' are ignored in ICU4J tests.
758         ++name;
759     }
760     if(name!=NULL && *name=='*') {
761         /* loadTestData(): set the data directory */
762         return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
763     } else {
764         return ucnv_open(name, &errorCode);
765     }
766 }
767 
768 // output helpers ---------------------------------------------------------- ***
769 
/**
 * Maps a small number to a lowercase hexadecimal digit character.
 * Values 0..9 yield '0'..'9'; larger values continue upward from 'a'
 * (10 yields 'a', 15 yields 'f').
 */
static inline char
hexDigit(uint8_t digit) {
    if(digit>9) {
        return (char)('a'+(digit-10));
    }
    return (char)('0'+digit);
}
774 
775 static char *
printBytes(const uint8_t * bytes,int32_t length,char * out)776 printBytes(const uint8_t *bytes, int32_t length, char *out) {
777     uint8_t b;
778 
779     if(length>0) {
780         b=*bytes++;
781         --length;
782         *out++=hexDigit((uint8_t)(b>>4));
783         *out++=hexDigit((uint8_t)(b&0xf));
784     }
785 
786     while(length>0) {
787         b=*bytes++;
788         --length;
789         *out++=' ';
790         *out++=hexDigit((uint8_t)(b>>4));
791         *out++=hexDigit((uint8_t)(b&0xf));
792     }
793     *out++=0;
794     return out;
795 }
796 
797 static char *
printUnicode(const UChar * unicode,int32_t length,char * out)798 printUnicode(const UChar *unicode, int32_t length, char *out) {
799     UChar32 c;
800     int32_t i;
801 
802     for(i=0; i<length;) {
803         if(i>0) {
804             *out++=' ';
805         }
806         U16_NEXT(unicode, i, length, c);
807         // write 4..6 digits
808         if(c>=0x100000) {
809             *out++='1';
810         }
811         if(c>=0x10000) {
812             *out++=hexDigit((uint8_t)((c>>16)&0xf));
813         }
814         *out++=hexDigit((uint8_t)((c>>12)&0xf));
815         *out++=hexDigit((uint8_t)((c>>8)&0xf));
816         *out++=hexDigit((uint8_t)((c>>4)&0xf));
817         *out++=hexDigit((uint8_t)(c&0xf));
818     }
819     *out++=0;
820     return out;
821 }
822 
/**
 * Writes each offset as exactly two characters, separated by single spaces,
 * followed by a terminating NUL:
 *   below -9  -> "-x"
 *   -9..-1    -> '-' and the digit
 *   0..99     -> the decimal value, padded with a leading space
 *   above 99  -> "xx"
 *
 * @param offsets offsets to print; NULL is treated like an empty array
 * @param length  number of offsets
 * @param out     destination; the caller must provide enough room
 * @return pointer just past the terminating NUL
 */
static char *
printOffsets(const int32_t *offsets, int32_t length, char *out) {
    const int32_t count= offsets==NULL ? 0 : length;

    for(int32_t index=0; index<count; ++index) {
        if(index!=0) {
            *out++=' ';
        }

        const int32_t offset=offsets[index];
        char first, second;

        if(offset>99) {
            first='x';
            second='x';
        } else if(offset>=0) {
            const int32_t tens=offset/10;
            first= tens==0 ? ' ' : (char)('0'+tens);
            second=(char)('0'+offset%10);
        } else if(offset>=-9) {
            first='-';
            second=(char)('0'-offset);
        } else /* offset<-9 */ {
            first='-';
            second='x';
        }

        *out++=first;
        *out++=second;
    }
    *out++=0;
    return out;
}
855 
856 // toUnicode test worker functions ----------------------------------------- ***
857 
/**
 * Converts cc.bytes (cc.bytesLength bytes) to UTF-16 into result, driving the
 * converter in one of several ways selected by step:
 *
 *   step==0  one bulk ucnv_toUnicode() call over the whole input and output,
 *            flushing according to cc.finalFlush and filling resultOffsets
 *   step>0   repeated ucnv_toUnicode() calls where the source and target
 *            windows are each enlarged by at most step units, one window at
 *            a time; resultOffsets is not used
 *   step<0   ucnv_getNextUChar() only (step==-1) or alternating with
 *            ucnv_toUnicode() (see the comment in that branch)
 *
 * Besides the converter's own errors, *pErrorCode is set to
 * U_INTERNAL_PROGRAM_ERROR when the converter violates one of the pointer or
 * error-code invariants checked below.
 *
 * @param cc             test case supplying bytes, bytesLength and finalFlush
 * @param cnv            converter to exercise
 * @param result         output buffer for the UTF-16 text
 * @param resultCapacity capacity of result (and of resultOffsets) in units
 * @param resultOffsets  receives offsets for the bulk conversion; ignored
 *                       for every other step value
 * @param step           conversion mode, see above
 * @param pErrorCode     in/out ICU error code
 * @return the number of UChars written to result
 */
static int32_t
stepToUnicode(ConversionCase &cc, UConverter *cnv,
              UChar *result, int32_t resultCapacity,
              int32_t *resultOffsets, /* also resultCapacity */
              int32_t step,
              UErrorCode *pErrorCode) {
    const char *source, *sourceLimit, *bytesLimit;
    UChar *target, *targetLimit, *resultLimit;
    UBool flush;

    source=(const char *)cc.bytes;
    target=result;
    bytesLimit=source+cc.bytesLength;
    resultLimit=result+resultCapacity;

    if(step>=0) {
        // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
        // move only one buffer (in vs. out) at a time to be extra mean
        // step==0 performs bulk conversion and generates offsets

        // initialize the partial limits for the loop
        if(step==0) {
            // use the entire buffers
            sourceLimit=bytesLimit;
            targetLimit=resultLimit;
            flush=cc.finalFlush;
        } else {
            // start with empty partial buffers
            sourceLimit=source;
            targetLimit=target;
            flush=FALSE;

            // output offsets only for bulk conversion
            resultOffsets=NULL;
        }

        for(;;) {
            // resetting the opposite conversion direction must not affect this one
            ucnv_resetFromUnicode(cnv);

            // convert
            ucnv_toUnicode(cnv,
                &target, targetLimit,
                &source, sourceLimit,
                resultOffsets,
                flush, pErrorCode);

            // check pointers and errors
            if(source>sourceLimit || target>targetLimit) {
                // the converter moved a pointer beyond the limit it was given
                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                break;
            } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                if(target!=targetLimit) {
                    // buffer overflow must only be set when the target is filled
                    *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                    break;
                } else if(targetLimit==resultLimit) {
                    // not just a partial overflow
                    break;
                }

                // the partial target is filled, set a new limit, reset the error and continue
                targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
                *pErrorCode=U_ZERO_ERROR;
            } else if(U_FAILURE(*pErrorCode)) {
                // some other error occurred, done
                break;
            } else {
                if(source!=sourceLimit) {
                    // when no error occurs, then the input must be consumed
                    *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                    break;
                }

                if(sourceLimit==bytesLimit) {
                    // we are done
                    break;
                }

                // the partial conversion succeeded, set a new limit and continue
                sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
                // flush only once the source window reaches the real end of input
                flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
            }
        }
    } else /* step<0 */ {
        /*
         * step==-1: call only ucnv_getNextUChar()
         * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
         *   if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
         *   else give it at most (-step-2)/2 bytes
         *
         * The alternation works by toggling the parity of step: the odd
         * branch increments it and the even branch decrements it.
         */
        UChar32 c;

        // end the loop by getting an index out of bounds error
        for(;;) {
            // resetting the opposite conversion direction must not affect this one
            ucnv_resetFromUnicode(cnv);

            // convert
            if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
                sourceLimit=source; // use sourceLimit not as a real limit
                                    // but to remember the pre-getNextUChar source pointer
                c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);

                // check pointers and errors
                if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
                    // this is the expected way out of the loop,
                    // but only if all of the input has been consumed
                    if(source!=bytesLimit) {
                        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                    } else {
                        *pErrorCode=U_ZERO_ERROR;
                    }
                    break;
                } else if(U_FAILURE(*pErrorCode)) {
                    break;
                }
                // source may not move if c is from previous overflow

                // append c as UTF-16, reporting an overflow ourselves if result is full
                if(target==resultLimit) {
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
                if(c<=0xffff) {
                    *target++=(UChar)c;
                } else {
                    *target++=U16_LEAD(c);
                    if(target==resultLimit) {
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        break;
                    }
                    *target++=U16_TRAIL(c);
                }

                // alternate between -n-1 and -n but leave -1 alone
                if(step<-1) {
                    ++step;
                }
            } else /* step is even */ {
                // allow only one UChar output
                targetLimit=target<resultLimit ? target+1 : resultLimit;

                // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
                // and never output offsets
                if(step==-2) {
                    sourceLimit=bytesLimit;
                } else {
                    sourceLimit=source+(-step-2)/2;
                    if(sourceLimit>bytesLimit) {
                        sourceLimit=bytesLimit;
                    }
                }

                ucnv_toUnicode(cnv,
                    &target, targetLimit,
                    &source, sourceLimit,
                    NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);

                // check pointers and errors
                if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                    if(target!=targetLimit) {
                        // buffer overflow must only be set when the target is filled
                        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                        break;
                    } else if(targetLimit==resultLimit) {
                        // not just a partial overflow
                        break;
                    }

                    // the partial target is filled, set a new limit and continue
                    *pErrorCode=U_ZERO_ERROR;
                } else if(U_FAILURE(*pErrorCode)) {
                    // some other error occurred, done
                    break;
                } else {
                    if(source!=sourceLimit) {
                        // when no error occurs, then the input must be consumed
                        *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
                        break;
                    }

                    // we are done (flush==TRUE) but we continue, to get the index out of bounds error above
                }

                // make step odd so that the next iteration calls ucnv_getNextUChar()
                --step;
            }
        }
    }

    return (int32_t)(target-result);
}
1047 
1048 UBool
ToUnicodeCase(ConversionCase & cc,UConverterToUCallback callback,const char * option)1049 ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
1050     // open the converter
1051     IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
1052     LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
1053     if(errorCode.isFailure()) {
1054         errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1055                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
1056         errorCode.reset();
1057         return FALSE;
1058     }
1059 
1060     // set the callback
1061     if(callback!=NULL) {
1062         ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode);
1063         if(U_FAILURE(errorCode)) {
1064             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
1065                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1066             return FALSE;
1067         }
1068     }
1069 
1070     int32_t resultOffsets[256];
1071     UChar result[256];
1072     int32_t resultLength;
1073     UBool ok;
1074 
1075     static const struct {
1076         int32_t step;
1077         const char *name;
1078     } steps[]={
1079         { 0, "bulk" }, // must be first for offsets to be checked
1080         { 1, "step=1" },
1081         { 3, "step=3" },
1082         { 7, "step=7" },
1083         { -1, "getNext" },
1084         { -2, "toU(bulk)+getNext" },
1085         { -3, "getNext+toU(bulk)" },
1086         { -4, "toU(1)+getNext" },
1087         { -5, "getNext+toU(1)" },
1088         { -12, "toU(5)+getNext" },
1089         { -13, "getNext+toU(5)" },
1090     };
1091     int32_t i, step;
1092 
1093     ok=TRUE;
1094     for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1095         step=steps[i].step;
1096         if(step<0 && !cc.finalFlush) {
1097             // skip ucnv_getNextUChar() if !finalFlush because
1098             // ucnv_getNextUChar() always implies flush
1099             continue;
1100         }
1101         if(step!=0) {
1102             // bulk test is first, then offsets are not checked any more
1103             cc.offsets=NULL;
1104         }
1105         else {
1106             memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1107         }
1108         memset(result, -1, UPRV_LENGTHOF(result));
1109         errorCode.reset();
1110         resultLength=stepToUnicode(cc, cnv.getAlias(),
1111                                 result, UPRV_LENGTHOF(result),
1112                                 step==0 ? resultOffsets : NULL,
1113                                 step, errorCode);
1114         ok=checkToUnicode(
1115                 cc, cnv.getAlias(), steps[i].name,
1116                 result, resultLength,
1117                 cc.offsets!=NULL ? resultOffsets : NULL,
1118                 errorCode);
1119         if(errorCode.isFailure() || !cc.finalFlush) {
1120             // reset if an error occurred or we did not flush
1121             // otherwise do nothing to make sure that flushing resets
1122             ucnv_resetToUnicode(cnv.getAlias());
1123         }
1124         if (cc.offsets != NULL && resultOffsets[resultLength] != -1) {
1125             errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1126                 cc.caseNr, cc.charset, resultLength);
1127         }
1128         if (result[resultLength] != (UChar)-1) {
1129             errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
1130                 cc.caseNr, cc.charset, resultLength);
1131         }
1132     }
1133 
1134     // not a real loop, just a convenience for breaking out of the block
1135     while(ok && cc.finalFlush) {
1136         // test ucnv_toUChars()
1137         memset(result, 0, sizeof(result));
1138 
1139         errorCode.reset();
1140         resultLength=ucnv_toUChars(cnv.getAlias(),
1141                         result, UPRV_LENGTHOF(result),
1142                         (const char *)cc.bytes, cc.bytesLength,
1143                         errorCode);
1144         ok=checkToUnicode(
1145                 cc, cnv.getAlias(), "toUChars",
1146                 result, resultLength,
1147                 NULL,
1148                 errorCode);
1149         if(!ok) {
1150             break;
1151         }
1152 
1153         // test preflighting
1154         // keep the correct result for simple checking
1155         errorCode.reset();
1156         resultLength=ucnv_toUChars(cnv.getAlias(),
1157                         NULL, 0,
1158                         (const char *)cc.bytes, cc.bytesLength,
1159                         errorCode);
1160         if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING || errorCode.get()==U_BUFFER_OVERFLOW_ERROR) {
1161             errorCode.reset();
1162         }
1163         ok=checkToUnicode(
1164                 cc, cnv.getAlias(), "preflight toUChars",
1165                 result, resultLength,
1166                 NULL,
1167                 errorCode);
1168         break;
1169     }
1170 
1171     errorCode.reset();  // all errors have already been reported
1172     return ok;
1173 }
1174 
1175 UBool
checkToUnicode(ConversionCase & cc,UConverter * cnv,const char * name,const UChar * result,int32_t resultLength,const int32_t * resultOffsets,UErrorCode resultErrorCode)1176 ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1177                                const UChar *result, int32_t resultLength,
1178                                const int32_t *resultOffsets,
1179                                UErrorCode resultErrorCode) {
1180     char resultInvalidChars[8];
1181     int8_t resultInvalidLength;
1182     UErrorCode errorCode;
1183 
1184     const char *msg;
1185 
1186     // reset the message; NULL will mean "ok"
1187     msg=NULL;
1188 
1189     errorCode=U_ZERO_ERROR;
1190     resultInvalidLength=sizeof(resultInvalidChars);
1191     ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
1192     if(U_FAILURE(errorCode)) {
1193         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
1194                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1195         return FALSE;
1196     }
1197 
1198     // check everything that might have gone wrong
1199     if(cc.unicodeLength!=resultLength) {
1200         msg="wrong result length";
1201     } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
1202         msg="wrong result string";
1203     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) {
1204         msg="wrong offsets";
1205     } else if(cc.outErrorCode!=resultErrorCode) {
1206         msg="wrong error code";
1207     } else if(cc.invalidLength!=resultInvalidLength) {
1208         msg="wrong length of last invalid input";
1209     } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
1210         msg="wrong last invalid input";
1211     }
1212 
1213     if(msg==NULL) {
1214         return TRUE;
1215     } else {
1216         char buffer[2000]; // one buffer for all strings
1217         char *s, *bytesString, *unicodeString, *resultString,
1218             *offsetsString, *resultOffsetsString,
1219             *invalidCharsString, *resultInvalidCharsString;
1220 
1221         bytesString=s=buffer;
1222         s=printBytes(cc.bytes, cc.bytesLength, bytesString);
1223         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
1224         s=printUnicode(result, resultLength, resultString=s);
1225         s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
1226         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1227         s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
1228         s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
1229 
1230         if((s-buffer)>(int32_t)sizeof(buffer)) {
1231             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
1232                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1233             exit(1);
1234         }
1235 
1236         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1237               "  bytes <%s>[%d]\n"
1238               " expected <%s>[%d]\n"
1239               "  result  <%s>[%d]\n"
1240               " offsets         <%s>\n"
1241               "  result offsets <%s>\n"
1242               " error code expected %s got %s\n"
1243               "  invalidChars expected <%s> got <%s>\n",
1244               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1245               bytesString, cc.bytesLength,
1246               unicodeString, cc.unicodeLength,
1247               resultString, resultLength,
1248               offsetsString,
1249               resultOffsetsString,
1250               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1251               invalidCharsString, resultInvalidCharsString);
1252 
1253         return FALSE;
1254     }
1255 }
1256 
1257 // fromUnicode test worker functions --------------------------------------- ***
1258 
1259 static int32_t
stepFromUTF8(ConversionCase & cc,UConverter * utf8Cnv,UConverter * cnv,char * result,int32_t resultCapacity,int32_t step,UErrorCode * pErrorCode)1260 stepFromUTF8(ConversionCase &cc,
1261              UConverter *utf8Cnv, UConverter *cnv,
1262              char *result, int32_t resultCapacity,
1263              int32_t step,
1264              UErrorCode *pErrorCode) {
1265     const char *source, *sourceLimit, *utf8Limit;
1266     UChar pivotBuffer[32];
1267     UChar *pivotSource, *pivotTarget, *pivotLimit;
1268     char *target, *targetLimit, *resultLimit;
1269     UBool flush;
1270 
1271     source=cc.utf8;
1272     pivotSource=pivotTarget=pivotBuffer;
1273     target=result;
1274     utf8Limit=source+cc.utf8Length;
1275     resultLimit=result+resultCapacity;
1276 
1277     // call ucnv_convertEx() with in/out buffers no larger than (step) at a time
1278     // move only one buffer (in vs. out) at a time to be extra mean
1279     // step==0 performs bulk conversion
1280 
1281     // initialize the partial limits for the loop
1282     if(step==0) {
1283         // use the entire buffers
1284         sourceLimit=utf8Limit;
1285         targetLimit=resultLimit;
1286         flush=cc.finalFlush;
1287 
1288         pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
1289     } else {
1290         // start with empty partial buffers
1291         sourceLimit=source;
1292         targetLimit=target;
1293         flush=FALSE;
1294 
1295         // empty pivot is not allowed, make it of length step
1296         pivotLimit=pivotBuffer+step;
1297     }
1298 
1299     for(;;) {
1300         // resetting the opposite conversion direction must not affect this one
1301         ucnv_resetFromUnicode(utf8Cnv);
1302         ucnv_resetToUnicode(cnv);
1303 
1304         // convert
1305         ucnv_convertEx(cnv, utf8Cnv,
1306             &target, targetLimit,
1307             &source, sourceLimit,
1308             pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
1309             FALSE, flush, pErrorCode);
1310 
1311         // check pointers and errors
1312         if(source>sourceLimit || target>targetLimit) {
1313             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1314             break;
1315         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1316             if(target!=targetLimit) {
1317                 // buffer overflow must only be set when the target is filled
1318                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1319                 break;
1320             } else if(targetLimit==resultLimit) {
1321                 // not just a partial overflow
1322                 break;
1323             }
1324 
1325             // the partial target is filled, set a new limit, reset the error and continue
1326             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1327             *pErrorCode=U_ZERO_ERROR;
1328         } else if(U_FAILURE(*pErrorCode)) {
1329             if(pivotSource==pivotBuffer) {
1330                 // toUnicode error, should not occur
1331                 // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1332                 break;
1333             } else {
1334                 // fromUnicode error
1335                 // some other error occurred, done
1336                 break;
1337             }
1338         } else {
1339             if(source!=sourceLimit) {
1340                 // when no error occurs, then the input must be consumed
1341                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1342                 break;
1343             }
1344 
1345             if(sourceLimit==utf8Limit) {
1346                 // we are done
1347                 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
1348                     // ucnv_convertEx() warns about not terminating the output
1349                     // but ucnv_fromUnicode() does not and so
1350                     // checkFromUnicode() does not expect it
1351                     *pErrorCode=U_ZERO_ERROR;
1352                 }
1353                 break;
1354             }
1355 
1356             // the partial conversion succeeded, set a new limit and continue
1357             sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
1358             flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
1359         }
1360     }
1361 
1362     return (int32_t)(target-result);
1363 }
1364 
1365 static int32_t
stepFromUnicode(ConversionCase & cc,UConverter * cnv,char * result,int32_t resultCapacity,int32_t * resultOffsets,int32_t step,UErrorCode * pErrorCode)1366 stepFromUnicode(ConversionCase &cc, UConverter *cnv,
1367                 char *result, int32_t resultCapacity,
1368                 int32_t *resultOffsets, /* also resultCapacity */
1369                 int32_t step,
1370                 UErrorCode *pErrorCode) {
1371     const UChar *source, *sourceLimit, *unicodeLimit;
1372     char *target, *targetLimit, *resultLimit;
1373     UBool flush;
1374 
1375     source=cc.unicode;
1376     target=result;
1377     unicodeLimit=source+cc.unicodeLength;
1378     resultLimit=result+resultCapacity;
1379 
1380     // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
1381     // move only one buffer (in vs. out) at a time to be extra mean
1382     // step==0 performs bulk conversion and generates offsets
1383 
1384     // initialize the partial limits for the loop
1385     if(step==0) {
1386         // use the entire buffers
1387         sourceLimit=unicodeLimit;
1388         targetLimit=resultLimit;
1389         flush=cc.finalFlush;
1390     } else {
1391         // start with empty partial buffers
1392         sourceLimit=source;
1393         targetLimit=target;
1394         flush=FALSE;
1395 
1396         // output offsets only for bulk conversion
1397         resultOffsets=NULL;
1398     }
1399 
1400     for(;;) {
1401         // resetting the opposite conversion direction must not affect this one
1402         ucnv_resetToUnicode(cnv);
1403 
1404         // convert
1405         ucnv_fromUnicode(cnv,
1406             &target, targetLimit,
1407             &source, sourceLimit,
1408             resultOffsets,
1409             flush, pErrorCode);
1410 
1411         // check pointers and errors
1412         if(source>sourceLimit || target>targetLimit) {
1413             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1414             break;
1415         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1416             if(target!=targetLimit) {
1417                 // buffer overflow must only be set when the target is filled
1418                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1419                 break;
1420             } else if(targetLimit==resultLimit) {
1421                 // not just a partial overflow
1422                 break;
1423             }
1424 
1425             // the partial target is filled, set a new limit, reset the error and continue
1426             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1427             *pErrorCode=U_ZERO_ERROR;
1428         } else if(U_FAILURE(*pErrorCode)) {
1429             // some other error occurred, done
1430             break;
1431         } else {
1432             if(source!=sourceLimit) {
1433                 // when no error occurs, then the input must be consumed
1434                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1435                 break;
1436             }
1437 
1438             if(sourceLimit==unicodeLimit) {
1439                 // we are done
1440                 break;
1441             }
1442 
1443             // the partial conversion succeeded, set a new limit and continue
1444             sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
1445             flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
1446         }
1447     }
1448 
1449     return (int32_t)(target-result);
1450 }
1451 
1452 UBool
FromUnicodeCase(ConversionCase & cc,UConverterFromUCallback callback,const char * option)1453 ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
1454     UConverter *cnv;
1455     UErrorCode errorCode;
1456 
1457     // open the converter
1458     errorCode=U_ZERO_ERROR;
1459     cnv=cnv_open(cc.charset, errorCode);
1460     if(U_FAILURE(errorCode)) {
1461         errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1462                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1463         return FALSE;
1464     }
1465     ucnv_resetToUnicode(utf8Cnv);
1466 
1467     // set the callback
1468     if(callback!=NULL) {
1469         ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
1470         if(U_FAILURE(errorCode)) {
1471             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
1472                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1473             ucnv_close(cnv);
1474             return FALSE;
1475         }
1476     }
1477 
1478     // set the fallbacks flag
1479     // TODO change with Jitterbug 2401, then add a similar call for toUnicode too
1480     ucnv_setFallback(cnv, cc.fallbacks);
1481 
1482     // set the subchar
1483     int32_t length;
1484 
1485     if(cc.setSub>0) {
1486         length=(int32_t)strlen(cc.subchar);
1487         ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
1488         if(U_FAILURE(errorCode)) {
1489             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
1490                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1491             ucnv_close(cnv);
1492             return FALSE;
1493         }
1494     } else if(cc.setSub<0) {
1495         ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
1496         if(U_FAILURE(errorCode)) {
1497             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
1498                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1499             ucnv_close(cnv);
1500             return FALSE;
1501         }
1502     }
1503 
1504     // convert unicode to utf8
1505     char utf8[256];
1506     cc.utf8=utf8;
1507     u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
1508                 cc.unicode, cc.unicodeLength,
1509                 &errorCode);
1510     if(U_FAILURE(errorCode)) {
1511         // skip UTF-8 testing of a string with an unpaired surrogate,
1512         // or of one that's too long
1513         // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1514         cc.utf8Length=-1;
1515     }
1516 
1517     int32_t resultOffsets[256];
1518     char result[256];
1519     int32_t resultLength;
1520     UBool ok;
1521 
1522     static const struct {
1523         int32_t step;
1524         const char *name, *utf8Name;
1525     } steps[]={
1526         { 0, "bulk",   "utf8" }, // must be first for offsets to be checked
1527         { 1, "step=1", "utf8 step=1" },
1528         { 3, "step=3", "utf8 step=3" },
1529         { 7, "step=7", "utf8 step=7" }
1530     };
1531     int32_t i, step;
1532 
1533     ok=TRUE;
1534     for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1535         step=steps[i].step;
1536         memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1537         memset(result, -1, UPRV_LENGTHOF(result));
1538         errorCode=U_ZERO_ERROR;
1539         resultLength=stepFromUnicode(cc, cnv,
1540                                 result, UPRV_LENGTHOF(result),
1541                                 step==0 ? resultOffsets : NULL,
1542                                 step, &errorCode);
1543         ok=checkFromUnicode(
1544                 cc, cnv, steps[i].name,
1545                 (uint8_t *)result, resultLength,
1546                 cc.offsets!=NULL ? resultOffsets : NULL,
1547                 errorCode);
1548         if(U_FAILURE(errorCode) || !cc.finalFlush) {
1549             // reset if an error occurred or we did not flush
1550             // otherwise do nothing to make sure that flushing resets
1551             ucnv_resetFromUnicode(cnv);
1552         }
1553         if (resultOffsets[resultLength] != -1) {
1554             errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1555                 cc.caseNr, cc.charset, resultLength);
1556         }
1557         if (result[resultLength] != (char)-1) {
1558             errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
1559                 cc.caseNr, cc.charset, resultLength);
1560         }
1561 
1562         // bulk test is first, then offsets are not checked any more
1563         cc.offsets=NULL;
1564 
1565         // test direct conversion from UTF-8
1566         if(cc.utf8Length>=0) {
1567             errorCode=U_ZERO_ERROR;
1568             resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
1569                                     result, UPRV_LENGTHOF(result),
1570                                     step, &errorCode);
1571             ok=checkFromUnicode(
1572                     cc, cnv, steps[i].utf8Name,
1573                     (uint8_t *)result, resultLength,
1574                     NULL,
1575                     errorCode);
1576             if(U_FAILURE(errorCode) || !cc.finalFlush) {
1577                 // reset if an error occurred or we did not flush
1578                 // otherwise do nothing to make sure that flushing resets
1579                 ucnv_resetToUnicode(utf8Cnv);
1580                 ucnv_resetFromUnicode(cnv);
1581             }
1582         }
1583     }
1584 
1585     // not a real loop, just a convenience for breaking out of the block
1586     while(ok && cc.finalFlush) {
1587         // test ucnv_fromUChars()
1588         memset(result, 0, sizeof(result));
1589 
1590         errorCode=U_ZERO_ERROR;
1591         resultLength=ucnv_fromUChars(cnv,
1592                         result, UPRV_LENGTHOF(result),
1593                         cc.unicode, cc.unicodeLength,
1594                         &errorCode);
1595         ok=checkFromUnicode(
1596                 cc, cnv, "fromUChars",
1597                 (uint8_t *)result, resultLength,
1598                 NULL,
1599                 errorCode);
1600         if(!ok) {
1601             break;
1602         }
1603 
1604         // test preflighting
1605         // keep the correct result for simple checking
1606         errorCode=U_ZERO_ERROR;
1607         resultLength=ucnv_fromUChars(cnv,
1608                         NULL, 0,
1609                         cc.unicode, cc.unicodeLength,
1610                         &errorCode);
1611         if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) {
1612             errorCode=U_ZERO_ERROR;
1613         }
1614         ok=checkFromUnicode(
1615                 cc, cnv, "preflight fromUChars",
1616                 (uint8_t *)result, resultLength,
1617                 NULL,
1618                 errorCode);
1619         break;
1620     }
1621 
1622     ucnv_close(cnv);
1623     return ok;
1624 }
1625 
1626 UBool
checkFromUnicode(ConversionCase & cc,UConverter * cnv,const char * name,const uint8_t * result,int32_t resultLength,const int32_t * resultOffsets,UErrorCode resultErrorCode)1627 ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1628                                  const uint8_t *result, int32_t resultLength,
1629                                  const int32_t *resultOffsets,
1630                                  UErrorCode resultErrorCode) {
1631     UChar resultInvalidUChars[8];
1632     int8_t resultInvalidLength;
1633     UErrorCode errorCode;
1634 
1635     const char *msg;
1636 
1637     // reset the message; NULL will mean "ok"
1638     msg=NULL;
1639 
1640     errorCode=U_ZERO_ERROR;
1641     resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
1642     ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
1643     if(U_FAILURE(errorCode)) {
1644         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
1645                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1646         return FALSE;
1647     }
1648 
1649     // check everything that might have gone wrong
1650     if(cc.bytesLength!=resultLength) {
1651         msg="wrong result length";
1652     } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
1653         msg="wrong result string";
1654     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) {
1655         msg="wrong offsets";
1656     } else if(cc.outErrorCode!=resultErrorCode) {
1657         msg="wrong error code";
1658     } else if(cc.invalidLength!=resultInvalidLength) {
1659         msg="wrong length of last invalid input";
1660     } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
1661         msg="wrong last invalid input";
1662     }
1663 
1664     if(msg==NULL) {
1665         return TRUE;
1666     } else {
1667         char buffer[2000]; // one buffer for all strings
1668         char *s, *unicodeString, *bytesString, *resultString,
1669             *offsetsString, *resultOffsetsString,
1670             *invalidCharsString, *resultInvalidUCharsString;
1671 
1672         unicodeString=s=buffer;
1673         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
1674         s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
1675         s=printBytes(result, resultLength, resultString=s);
1676         s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
1677         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1678         s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
1679         s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
1680 
1681         if((s-buffer)>(int32_t)sizeof(buffer)) {
1682             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
1683                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1684             exit(1);
1685         }
1686 
1687         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1688               "  unicode <%s>[%d]\n"
1689               " expected <%s>[%d]\n"
1690               "  result  <%s>[%d]\n"
1691               " offsets         <%s>\n"
1692               "  result offsets <%s>\n"
1693               " error code expected %s got %s\n"
1694               "  invalidChars expected <%s> got <%s>\n",
1695               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1696               unicodeString, cc.unicodeLength,
1697               bytesString, cc.bytesLength,
1698               resultString, resultLength,
1699               offsetsString,
1700               resultOffsetsString,
1701               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1702               invalidCharsString, resultInvalidUCharsString);
1703 
1704         return FALSE;
1705     }
1706 }
1707 
1708 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
1709