// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
9 
10 
11 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/unistr.h"
15 #include "unicode/putil.h"
16 #include "unicode/uniset.h"
17 
18 #include "intltest.h"
19 #include "csdetest.h"
20 
21 #include "xmlparser.h"
22 
23 #include <memory>
24 #include <stdlib.h>
25 #include <string.h>
26 
27 #ifdef DEBUG_DETECT
28 #include <stdio.h>
29 #endif
30 
31 
32 #define CH_SPACE 0x0020
33 #define CH_SLASH 0x002F
34 
// Non-fatal assertion. When the condition is false it reports the source
// location and the text of the failed expression through errln(), then lets
// the enclosing function continue. Must be used where errln() is in scope.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d: %s", __FILE__, __LINE__, #x); \
    } \
} UPRV_BLOCK_MACRO_END
40 
// Fatal status check. If the given error code is a failure it is reported
// through errcheckln() together with the source location, and the enclosing
// function returns immediately, so it is only usable in functions returning
// void. The argument is evaluated exactly once.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    const UErrorCode testAssertStatus_ = (errcode); \
    if (U_FAILURE(testAssertStatus_)) { \
        errcheckln(testAssertStatus_, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(testAssertStatus_)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
47 
48 
49 //---------------------------------------------------------------------------
50 //
51 //  Test class boilerplate
52 //
53 //---------------------------------------------------------------------------
CharsetDetectionTest()54 CharsetDetectionTest::CharsetDetectionTest()
55 {
56 }
57 
58 
~CharsetDetectionTest()59 CharsetDetectionTest::~CharsetDetectionTest()
60 {
61 }
62 
63 
64 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)65 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
66 {
67     if (exec) logln("TestSuite CharsetDetectionTest: ");
68     switch (index) {
69        case 0: name = "ConstructionTest";
70             if (exec) ConstructionTest();
71             break;
72 
73        case 1: name = "UTF8Test";
74             if (exec) UTF8Test();
75             break;
76 
77        case 2: name = "UTF16Test";
78             if (exec) UTF16Test();
79             break;
80 
81        case 3: name = "C1BytesTest";
82             if (exec) C1BytesTest();
83             break;
84 
85        case 4: name = "InputFilterTest";
86             if (exec) InputFilterTest();
87             break;
88 
89        case 5: name = "DetectionTest";
90             if (exec) DetectionTest();
91             break;
92 #if !UCONFIG_NO_LEGACY_CONVERSION
93        case 6: name = "IBM424Test";
94             if (exec) IBM424Test();
95             break;
96 
97        case 7: name = "IBM420Test";
98             if (exec) IBM420Test();
99             break;
100 #else
101        case 6:
102        case 7: name = "skip"; break;
103 #endif
104        case 8: name = "Ticket6394Test";
105             if (exec) Ticket6394Test();
106             break;
107 
108        case 9: name = "Ticket6954Test";
109             if (exec) Ticket6954Test();
110             break;
111 
112        case 10: name = "Ticket21823Test";
113             if (exec) Ticket21823Test();
114             break;
115 
116         default: name = "";
117             break; //needed to end loop
118     }
119 }
120 
// Splits src at every occurrence of ch.
//
//   src    - the string to split
//   ch     - the separator code unit
//   splits - receives the number of pieces (separator count + 1)
//
// Returns an array of `splits` strings allocated with new[]; the caller
// owns it and must release it with delete[].
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // First pass: one piece, plus one more for every separator found.
    splits = 1;
    for (int32_t pos = src.indexOf(ch, 0); pos >= 0; pos = src.indexOf(ch, pos + 1)) {
        ++splits;
    }

    UnicodeString *pieces = new UnicodeString[splits];

    // Second pass: copy out the text between consecutive separators.
    int32_t pieceStart = 0;
    int32_t pieceIndex = 0;
    for (int32_t sep = src.indexOf(ch, pieceStart); sep >= 0; sep = src.indexOf(ch, pieceStart)) {
        src.extractBetween(pieceStart, sep, pieces[pieceIndex]);
        ++pieceIndex;
        pieceStart = sep + 1;
    }

    // Whatever follows the last separator (possibly empty) is the final piece.
    src.extractBetween(pieceStart, src.length(), pieces[pieceIndex]);

    return pieces;
}
145 
extractBytes(const UnicodeString & source,const char * codepage,int32_t & length)146 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
147 {
148     int32_t sLength = source.length();
149     char *bytes = NULL;
150 
151     length = source.extract(0, sLength, NULL, codepage);
152 
153     if (length > 0) {
154         bytes = new char[length + 1];
155         source.extract(0, sLength, bytes, codepage);
156     }
157 
158     return bytes;
159 }
160 
checkEncoding(const UnicodeString & testString,const UnicodeString & encoding,const UnicodeString & id)161 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
162 {
163     int32_t splits = 0;
164     int32_t testLength = testString.length();
165     std::unique_ptr<UnicodeString []> eSplit(split(encoding, CH_SLASH, splits));
166     UErrorCode status = U_ZERO_ERROR;
167     int32_t cpLength = eSplit[0].length();
168     char codepage[64];
169 
170     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
171     codepage[cpLength] = '\0';
172 
173     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
174 
175     int32_t byteLength = 0;
176     std::unique_ptr<char []> bytes(extractBytes(testString, codepage, byteLength));
177 
178     if (! bytes) {
179 #if !UCONFIG_NO_LEGACY_CONVERSION
180         dataerrln("Can't open a " + encoding + " converter for " + id);
181 #endif
182         return;
183     }
184 
185     ucsdet_setText(csd.getAlias(), bytes.get(), byteLength, &status);
186 
187     int32_t matchCount = 0;
188     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
189 
190 
191     UnicodeString name(ucsdet_getName(matches[0], &status));
192     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
193     UChar *decoded = NULL;
194     int32_t dLength = 0;
195 
196     if (matchCount == 0) {
197         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
198         return;
199     }
200 
201     if (name.compare(eSplit[0]) != 0) {
202         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
203 
204 #ifdef DEBUG_DETECT
205         for (int32_t m = 0; m < matchCount; m += 1) {
206             const char *name = ucsdet_getName(matches[m], &status);
207             const char *lang = ucsdet_getLanguage(matches[m], &status);
208             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
209 
210             printf("%s (%s) %d\n", name, lang, confidence);
211         }
212 #endif
213         return;
214     }
215 
216     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
217         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
218         return;
219     }
220 
221     decoded = new UChar[testLength];
222     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
223 
224     if (testString.compare(decoded, dLength) != 0) {
225         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yield the original string.");
226 
227 #ifdef DEBUG_DETECT
228         for(int32_t i = 0; i < testLength; i += 1) {
229             if(testString[i] != decoded[i]) {
230                 printf("Strings differ at byte %d\n", i);
231                 break;
232             }
233         }
234 #endif
235 
236     }
237 
238     delete[] decoded;
239 }
240 
getPath(char buffer[2048],const char * filename)241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242     UErrorCode status = U_ZERO_ERROR;
243     const char *testDataDirectory = IntlTest::getSourceTestData(status);
244 
245     if (U_FAILURE(status)) {
246         errln("ERROR: getPath() failed - %s", u_errorName(status));
247         return NULL;
248     }
249 
250     strcpy(buffer, testDataDirectory);
251     strcat(buffer, filename);
252     return buffer;
253 }
254 
ConstructionTest()255 void CharsetDetectionTest::ConstructionTest()
256 {
257     IcuTestErrorCode status(*this, "ConstructionTest");
258     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260     int32_t count = uenum_count(e.getAlias(), status);
261 
262 #ifdef DEBUG_DETECT
263     printf("There are %d recognizers.\n", count);
264 #endif
265 
266     for(int32_t i = 0; i < count; i += 1) {
267         int32_t length;
268         const char *name = uenum_next(e.getAlias(), &length, status);
269 
270         if(name == NULL || length <= 0) {
271             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272         }
273 
274 #ifdef DEBUG_DETECT
275         printf("%s\n", name);
276 #endif
277     }
278 
279     const char* defDisabled[] = {
280         "IBM420_rtl", "IBM420_ltr",
281         "IBM424_rtl", "IBM424_ltr",
282         0
283     };
284 
285     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
286     const char *activeName = NULL;
287 
288     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
289         // the charset must be included in all list
290         UBool found = false;
291 
292         const char *name = NULL;
293         uenum_reset(e.getAlias(), status);
294         while ((name = uenum_next(e.getAlias(), NULL, status))) {
295             if (strcmp(activeName, name) == 0) {
296                 found = true;
297                 break;
298             }
299         }
300 
301         if (!found) {
302             errln(UnicodeString(activeName) + " is not included in the all charset list.");
303         }
304 
305         // some charsets are disabled by default
306         found = false;
307         for (int32_t i = 0; defDisabled[i] != 0; i++) {
308             if (strcmp(activeName, defDisabled[i]) == 0) {
309                 found = true;
310                 break;
311             }
312         }
313         if (found) {
314             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
315         }
316     }
317 }
318 
UTF8Test()319 void CharsetDetectionTest::UTF8Test()
320 {
321     UErrorCode status = U_ZERO_ERROR;
322     UnicodeString ss = "This is a string with some non-ascii characters that will "
323                        "be converted to UTF-8, then shoved through the detection process.  "
324                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
325                        "Sure would be nice if our source could contain Unicode directly!";
326     UnicodeString s = ss.unescape();
327     int32_t byteLength = 0, sLength = s.length();
328     char *bytes = extractBytes(s, "UTF-8", byteLength);
329     UCharsetDetector *csd = ucsdet_open(&status);
330     const UCharsetMatch *match;
331     UChar *detected = new UChar[sLength];
332 
333     ucsdet_setText(csd, bytes, byteLength, &status);
334     match = ucsdet_detect(csd, &status);
335 
336     if (match == NULL) {
337         errln("Detection failure for UTF-8: got no matches.");
338         goto bail;
339     }
340 
341     ucsdet_getUChars(match, detected, sLength, &status);
342 
343     if (s.compare(detected, sLength) != 0) {
344         errln("Round-trip test failed!");
345     }
346 
347     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348 
349 bail:
350     delete[] detected;
351     delete[] bytes;
352     ucsdet_close(csd);
353 }
354 
UTF16Test()355 void CharsetDetectionTest::UTF16Test()
356 {
357     UErrorCode status = U_ZERO_ERROR;
358     /* Notice the BOM on the start of this string */
359     UChar chars[] = {
360         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364         0x064a, 0x062a, 0x0000};
365     UnicodeString s(chars);
366     int32_t beLength = 0, leLength = 0;
367     std::unique_ptr<char []>beBytes(extractBytes(s, "UTF-16BE", beLength));
368     std::unique_ptr<char []>leBytes(extractBytes(s, "UTF-16LE", leLength));
369     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
370     const UCharsetMatch *match;
371     const char *name;
372     int32_t conf;
373 
374     ucsdet_setText(csd.getAlias(), beBytes.get(), beLength, &status);
375     match = ucsdet_detect(csd.getAlias(), &status);
376 
377     if (match == NULL) {
378         errln("Encoding detection failure for UTF-16BE: got no matches.");
379     } else {
380 
381         name  = ucsdet_getName(match, &status);
382         conf  = ucsdet_getConfidence(match, &status);
383 
384         if (strcmp(name, "UTF-16BE") != 0) {
385             errln("Encoding detection failure for UTF-16BE: got %s", name);
386         } else if (conf != 100) {
387             errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
388         }
389     }
390 
391     ucsdet_setText(csd.getAlias(), leBytes.get(), leLength, &status);
392     match = ucsdet_detect(csd.getAlias(), &status);
393 
394     if (match == NULL) {
395         errln("Encoding detection failure for UTF-16LE: got no matches.");
396         return;
397     }
398 
399     name  = ucsdet_getName(match, &status);
400     conf = ucsdet_getConfidence(match, &status);
401 
402     if (strcmp(name, "UTF-16LE") != 0) {
403         errln("Encoding detection failure for UTF-16LE: got %s", name);
404         return;
405     }
406 
407     if (conf != 100) {
408         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
409     }
410 }
411 
InputFilterTest()412 void CharsetDetectionTest::InputFilterTest()
413 {
414     UErrorCode status = U_ZERO_ERROR;
415     UnicodeString s(u"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>");
416     int32_t byteLength = 0;
417     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
418     UCharsetDetector *csd = ucsdet_open(&status);
419     const UCharsetMatch *match;
420     const char *lang, *name;
421 
422     ucsdet_enableInputFilter(csd, true);
423 
424     if (!ucsdet_isInputFilterEnabled(csd)) {
425         errln("ucsdet_enableInputFilter(csd, true) did not enable input filter!");
426     }
427 
428 
429     ucsdet_setText(csd, bytes, byteLength, &status);
430     match = ucsdet_detect(csd, &status);
431 
432     if (match == NULL) {
433         errln("Turning on the input filter resulted in no matches.");
434         goto turn_off;
435     }
436 
437     name = ucsdet_getName(match, &status);
438 
439     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
440         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
441     } else {
442         lang = ucsdet_getLanguage(match, &status);
443 
444         if (lang == NULL || strcmp(lang, "fr") != 0) {
445             errln("Input filter did not strip markup!");
446         }
447     }
448 
449 turn_off:
450     ucsdet_enableInputFilter(csd, false);
451     ucsdet_setText(csd, bytes, byteLength, &status);
452     match = ucsdet_detect(csd, &status);
453 
454     if (match == NULL) {
455         errln("Turning off the input filter resulted in no matches.");
456         goto bail;
457     }
458 
459     name = ucsdet_getName(match, &status);
460 
461     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
462         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
463     } else {
464         lang = ucsdet_getLanguage(match, &status);
465 
466         if (lang == NULL || strcmp(lang, "en") != 0) {
467             errln("Unfiltered input did not detect as English!");
468         }
469     }
470 
471 bail:
472     delete[] bytes;
473     ucsdet_close(csd);
474 }
475 
C1BytesTest()476 void CharsetDetectionTest::C1BytesTest()
477 {
478 #if !UCONFIG_NO_LEGACY_CONVERSION
479     UErrorCode status = U_ZERO_ERROR;
480     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
481     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
482     UnicodeString sWindows  = ssWindows.unescape();
483     int32_t lISO = 0, lWindows = 0;
484     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
485     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
486     UCharsetDetector *csd = ucsdet_open(&status);
487     const UCharsetMatch *match;
488     const char *name;
489 
490     ucsdet_setText(csd, bWindows, lWindows, &status);
491     match = ucsdet_detect(csd, &status);
492 
493     if (match == NULL) {
494         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
495         goto bail;
496     }
497 
498     name  = ucsdet_getName(match, &status);
499 
500     if (strcmp(name, "windows-1252") != 0) {
501         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
502     }
503 
504     ucsdet_setText(csd, bISO, lISO, &status);
505     match = ucsdet_detect(csd, &status);
506 
507     if (match == NULL) {
508         errln("English text without C1 bytes got no matches.");
509         goto bail;
510     }
511 
512     name  = ucsdet_getName(match, &status);
513 
514     if (strcmp(name, "ISO-8859-1") != 0) {
515         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
516     }
517 
518 bail:
519     delete[] bWindows;
520     delete[] bISO;
521 
522     ucsdet_close(csd);
523 #endif
524 }
525 
DetectionTest()526 void CharsetDetectionTest::DetectionTest()
527 {
528 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
529     UErrorCode status = U_ZERO_ERROR;
530     char path[2048];
531     const char *testFilePath = getPath(path, "csdetest.xml");
532 
533     if (testFilePath == NULL) {
534         return; /* Couldn't get path: error message already output. */
535     }
536 
537     UXMLParser  *parser = UXMLParser::createParser(status);
538     if (U_FAILURE(status)) {
539         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
540         return;
541     }
542 
543     UXMLElement *root   = parser->parseFile(testFilePath, status);
544     if (!assertSuccess( "parseFile",status)) return;
545 
546     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
547     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
548     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
549 
550     const UXMLElement *testCase;
551     int32_t tc = 0;
552 
553     while((testCase = root->nextChildElement(tc)) != NULL) {
554         if (testCase->getTagName().compare(test_case) == 0) {
555             const UnicodeString *id = testCase->getAttribute(id_attr);
556             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
557             const UnicodeString  text = testCase->getText(true);
558             int32_t encodingCount;
559             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
560 
561             for(int32_t e = 0; e < encodingCount; e += 1) {
562                 checkEncoding(text, encodingList[e], *id);
563             }
564 
565             delete[] encodingList;
566         }
567     }
568 
569     delete root;
570     delete parser;
571 #endif
572 }
573 
IBM424Test()574 void CharsetDetectionTest::IBM424Test()
575 {
576 #if !UCONFIG_ONLY_HTML_CONVERSION
577     UErrorCode status = U_ZERO_ERROR;
578 
579     static const UChar chars[] = {
580             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
581             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
582             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
583             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
584             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
585             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
586             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
587             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
588             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
589             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
590             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
591             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
592             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
593             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
594             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
595             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
596             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
597     };
598 
599     static const UChar chars_reverse[] = {
600             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
601             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
602             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
603             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
604             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
605             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
606             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
607             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
608             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
609             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
610             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
611             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
612             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
613             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
614             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
615             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
616             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
617             0x0000
618     };
619 
620     int32_t bLength = 0, brLength = 0;
621 
622     UnicodeString s1(chars);
623     UnicodeString s2(chars_reverse);
624 
625     char *bytes = extractBytes(s1, "IBM424", bLength);
626     char *bytes_r = extractBytes(s2, "IBM424", brLength);
627 
628     UCharsetDetector *csd = ucsdet_open(&status);
629 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status);
630 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status);
631 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status);
632 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status);
633     if (U_FAILURE(status)) {
634         errln("Error opening charset detector. - %s", u_errorName(status));
635     }
636     const UCharsetMatch *match;
637     const char *name;
638 
639     ucsdet_setText(csd, bytes, bLength, &status);
640     match = ucsdet_detect(csd, &status);
641 
642     if (match == NULL) {
643         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
644         goto bail;
645     }
646 
647     name  = ucsdet_getName(match, &status);
648     if (strcmp(name, "IBM424_rtl") != 0) {
649         errln("Encoding detection failure for IBM424_rtl: got %s", name);
650     }
651 
652     ucsdet_setText(csd, bytes_r, brLength, &status);
653     match = ucsdet_detect(csd, &status);
654 
655     if (match == NULL) {
656         errln("Encoding detection failure for IBM424_ltr: got no matches.");
657         goto bail;
658     }
659 
660     name  = ucsdet_getName(match, &status);
661     if (strcmp(name, "IBM424_ltr") != 0) {
662         errln("Encoding detection failure for IBM424_ltr: got %s", name);
663     }
664 
665 bail:
666     delete[] bytes;
667     delete[] bytes_r;
668     ucsdet_close(csd);
669 #endif
670 }
671 
IBM420Test()672 void CharsetDetectionTest::IBM420Test()
673 {
674 #if !UCONFIG_ONLY_HTML_CONVERSION
675     UErrorCode status = U_ZERO_ERROR;
676 
677     static const UChar chars[] = {
678         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
679         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
680         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
681         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
682         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
683         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
684         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
685         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
686         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
687         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
688         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
689         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
690         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
691         0x0000
692     };
693     static const UChar chars_reverse[] = {
694         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
695         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
696         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
697         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
698         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
699         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
700         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
701         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
702         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
703         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
704         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
705         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
706         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
707         0x0000,
708     };
709 
710     int32_t bLength = 0, brLength = 0;
711 
712     UnicodeString s1(chars);
713     UnicodeString s2(chars_reverse);
714 
715     char *bytes = extractBytes(s1, "IBM420", bLength);
716     char *bytes_r = extractBytes(s2, "IBM420", brLength);
717 
718     UCharsetDetector *csd = ucsdet_open(&status);
719     if (U_FAILURE(status)) {
720         errln("Error opening charset detector. - %s", u_errorName(status));
721     }
722 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status);
723 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status);
724 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status);
725 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status);
726     const UCharsetMatch *match;
727     const char *name;
728 
729     ucsdet_setText(csd, bytes, bLength, &status);
730     match = ucsdet_detect(csd, &status);
731 
732     if (match == NULL) {
733         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
734         goto bail;
735     }
736 
737     name  = ucsdet_getName(match, &status);
738     if (strcmp(name, "IBM420_rtl") != 0) {
739         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
740     }
741 
742     ucsdet_setText(csd, bytes_r, brLength, &status);
743     match = ucsdet_detect(csd, &status);
744 
745     if (match == NULL) {
746         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
747         goto bail;
748     }
749 
750     name  = ucsdet_getName(match, &status);
751     if (strcmp(name, "IBM420_ltr") != 0) {
752         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
753     }
754 
755 bail:
756     delete[] bytes;
757     delete[] bytes_r;
758     ucsdet_close(csd);
759 #endif
760 }
761 
762 
Ticket6394Test()763 void CharsetDetectionTest::Ticket6394Test() {
764 #if !UCONFIG_NO_CONVERSION
765     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
766                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
767                              "encodings more than once.  The hop through UnicodeString is for platforms "
768                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
769     char latin1Text[sizeof(charText)];
770     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
771 
772     UErrorCode status = U_ZERO_ERROR;
773     UCharsetDetector *csd = ucsdet_open(&status);
774     ucsdet_setText(csd, latin1Text, -1, &status);
775     if (U_FAILURE(status)) {
776         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
777         return;
778     }
779 
780     int32_t matchCount = 0;
781     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
782     if (U_FAILURE(status)) {
783         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
784         return;
785     }
786 
787     UnicodeSet  setOfCharsetNames;    // UnicodeSets can hold strings.
788     int32_t i;
789     for (i=0; i<matchCount; i++) {
790         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
791         if (U_FAILURE(status)) {
792             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
793             status = U_ZERO_ERROR;
794         }
795         if (setOfCharsetNames.contains(charSetName)) {
796             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
797             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
798         }
799         setOfCharsetNames.add(charSetName);
800     }
801     ucsdet_close(csd);
802 #endif
803 }
804 
805 
806 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
807 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
808 //               Charset Recognizer objects, and could be overwritten.
Ticket6954Test()809 void CharsetDetectionTest::Ticket6954Test() {
810 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
811     UErrorCode status = U_ZERO_ERROR;
812     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
813     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
814                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
815     UnicodeString sWindows  = ssWindows.unescape();
816     int32_t lISO = 0, lWindows = 0;
817     std::unique_ptr<char[]> bISO(extractBytes(sISO, "ISO-8859-1", lISO));
818     std::unique_ptr<char[]> bWindows(extractBytes(sWindows, "windows-1252", lWindows));
819 
820     // First do a plain vanilla detect of 1252 text
821 
822     LocalUCharsetDetectorPointer csd1(ucsdet_open(&status));
823     ucsdet_setText(csd1.getAlias(), bWindows.get(), lWindows, &status);
824     const UCharsetMatch *match1 = ucsdet_detect(csd1.getAlias(), &status);
825     const char *name1 = ucsdet_getName(match1, &status);
826     TEST_ASSERT_SUCCESS(status);
827     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
828 
829     // Next, using a completely separate detector, detect some 8859-1 text
830 
831     LocalUCharsetDetectorPointer csd2(ucsdet_open(&status));
832     ucsdet_setText(csd2.getAlias(), bISO.get(), lISO, &status);
833     const UCharsetMatch *match2 = ucsdet_detect(csd2.getAlias(), &status);
834     const char *name2 = ucsdet_getName(match2, &status);
835     TEST_ASSERT_SUCCESS(status);
836     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
837 
838     // Recheck the 1252 results from the first detector, which should not have been
839     //  altered by the use of a different detector.
840 
841     name1 = ucsdet_getName(match1, &status);
842     TEST_ASSERT_SUCCESS(status);
843     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
844 #endif
845 }
846 
847 
848 // Ticket 21823 - Issue with Charset Detector for ill-formed input strings.
849 //                Its fix involves returning a failure based error code
850 //                (U_INVALID_CHAR_FOUND) incase no charsets appear to match the input data.
Ticket21823Test()851 void CharsetDetectionTest::Ticket21823Test() {
852     UErrorCode status = U_ZERO_ERROR;
853     std::string str = "\x80";
854     UCharsetDetector* csd = ucsdet_open(&status);
855 
856     ucsdet_setText(csd, str.data(), str.length(), &status);
857     const UCharsetMatch* match = ucsdet_detect(csd, &status);
858 
859     if (match == NULL) {
860         TEST_ASSERT(U_FAILURE(status));
861     }
862 
863     ucsdet_close(csd);
864 }
865