• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 **********************************************************************
 *   Copyright (C) 2005-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
7 
8 
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/uniset.h"
15 
16 #include "intltest.h"
17 #include "csdetest.h"
18 
19 #include "xmlparser.h"
20 
21 #include <stdlib.h>
22 #include <string.h>
23 
24 #ifdef DEBUG_DETECT
25 #include <stdio.h>
26 #endif
27 
// Number of elements in a true array (not a pointer).  The argument is
// parenthesized so that an expression argument expands as intended.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw, uninitialized storage for `count` objects of `type`, obtained from
// malloc(); no constructors are run.  Release it with DELETE_ARRAY.
// The whole expansion is parenthesized so it can be used inside larger expressions.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Delimiters used when splitting the encoding lists / "charset/language" pairs.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a test failure (file and line) when `x` is false; execution continues.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports a failure and returns from the enclosing (void) function when
// `errcode` holds a failure status.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
42 
43 
//---------------------------------------------------------------------------
//
//  Test class boilerplate
//
//---------------------------------------------------------------------------
CharsetDetectionTest()49 CharsetDetectionTest::CharsetDetectionTest()
50 {
51 }
52 
53 
~CharsetDetectionTest()54 CharsetDetectionTest::~CharsetDetectionTest()
55 {
56 }
57 
58 
59 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61 {
62     if (exec) logln("TestSuite CharsetDetectionTest: ");
63     switch (index) {
64        case 0: name = "ConstructionTest";
65             if (exec) ConstructionTest();
66             break;
67 
68        case 1: name = "UTF8Test";
69             if (exec) UTF8Test();
70             break;
71 
72        case 2: name = "UTF16Test";
73             if (exec) UTF16Test();
74             break;
75 
76        case 3: name = "C1BytesTest";
77             if (exec) C1BytesTest();
78             break;
79 
80        case 4: name = "InputFilterTest";
81             if (exec) InputFilterTest();
82             break;
83 
84        case 5: name = "DetectionTest";
85             if (exec) DetectionTest();
86             break;
87 #if !UCONFIG_NO_LEGACY_CONVERSION
88        case 6: name = "IBM424Test";
89             if (exec) IBM424Test();
90             break;
91 
92        case 7: name = "IBM420Test";
93             if (exec) IBM420Test();
94             break;
95 #else
96        case 6:
97        case 7: name = "skip"; break;
98 #endif
99        case 8: name = "Ticket6394Test";
100             if (exec) Ticket6394Test();
101             break;
102 
103        case 9: name = "Ticket6954Test";
104             if (exec) Ticket6954Test();
105             break;
106 
107         default: name = "";
108             break; //needed to end loop
109     }
110 }
111 
/**
 * Splits `src` at every occurrence of `ch`.
 *
 * @param src    the string to split.
 * @param ch     the delimiter; it is not included in any piece.
 * @param splits receives the number of pieces (delimiter count + 1, so at
 *               least 1).
 * @return an array of `splits` strings allocated with new[]; the caller
 *         releases it with delete[].
 */
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    int32_t searchFrom = 0;
    int32_t hit;

    // First pass: count the pieces.
    splits = 1;
    while ((hit = src.indexOf(ch, searchFrom)) >= 0) {
        splits += 1;
        searchFrom = hit + 1;
    }

    UnicodeString *pieces = new UnicodeString[splits];

    // Second pass: copy out the text between consecutive delimiters.
    int32_t pieceStart = 0;
    int32_t index = 0;

    while ((hit = src.indexOf(ch, pieceStart)) >= 0) {
        src.extractBetween(pieceStart, hit, pieces[index]);
        index += 1;
        pieceStart = hit + 1;
    }

    // Whatever follows the last delimiter (possibly empty) is the final piece.
    src.extractBetween(pieceStart, src.length(), pieces[index]);

    return pieces;
}
136 
extractBytes(const UnicodeString & source,const char * codepage,int32_t & length)137 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138 {
139     int32_t sLength = source.length();
140     char *bytes = NULL;
141 
142     length = source.extract(0, sLength, NULL, codepage);
143 
144     if (length > 0) {
145         bytes = NEW_ARRAY(char, length + 1);
146         source.extract(0, sLength, bytes, codepage);
147     }
148 
149     return bytes;
150 }
151 
freeBytes(char * bytes)152 static void freeBytes(char *bytes)
153 {
154     DELETE_ARRAY(bytes);
155 }
156 
checkEncoding(const UnicodeString & testString,const UnicodeString & encoding,const UnicodeString & id)157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158 {
159     int32_t splits = 0;
160     int32_t testLength = testString.length();
161     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162     UErrorCode status = U_ZERO_ERROR;
163     int32_t cpLength = eSplit[0].length();
164     char codepage[64];
165 
166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167     codepage[cpLength] = '\0';
168 
169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170 
171     int32_t byteLength = 0;
172     char *bytes = extractBytes(testString, codepage, byteLength);
173 
174     if (bytes == NULL) {
175 #if !UCONFIG_NO_LEGACY_CONVERSION
176         dataerrln("Can't open a " + encoding + " converter for " + id);
177 #endif
178         return;
179     }
180 
181     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
182 
183     int32_t matchCount = 0;
184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185 
186 
187     UnicodeString name(ucsdet_getName(matches[0], &status));
188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189     UChar *decoded = NULL;
190     int32_t dLength = 0;
191 
192     if (matchCount == 0) {
193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194         goto bail;
195     }
196 
197     if (name.compare(eSplit[0]) != 0) {
198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199 
200 #ifdef DEBUG_DETECT
201         for (int32_t m = 0; m < matchCount; m += 1) {
202             const char *name = ucsdet_getName(matches[m], &status);
203             const char *lang = ucsdet_getLanguage(matches[m], &status);
204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205 
206             printf("%s (%s) %d\n", name, lang, confidence);
207         }
208 #endif
209         goto bail;
210     }
211 
212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214         goto bail;
215     }
216 
217     decoded = NEW_ARRAY(UChar, testLength);
218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219 
220     if (testString.compare(decoded, dLength) != 0) {
221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222 
223 #ifdef DEBUG_DETECT
224         for(int32_t i = 0; i < testLength; i += 1) {
225             if(testString[i] != decoded[i]) {
226                 printf("Strings differ at byte %d\n", i);
227                 break;
228             }
229         }
230 #endif
231 
232     }
233 
234     DELETE_ARRAY(decoded);
235 
236 bail:
237     freeBytes(bytes);
238     delete[] eSplit;
239 }
240 
getPath(char buffer[2048],const char * filename)241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242     UErrorCode status = U_ZERO_ERROR;
243     const char *testDataDirectory = IntlTest::getSourceTestData(status);
244 
245     if (U_FAILURE(status)) {
246         errln("ERROR: getPath() failed - %s", u_errorName(status));
247         return NULL;
248     }
249 
250     strcpy(buffer, testDataDirectory);
251     strcat(buffer, filename);
252     return buffer;
253 }
254 
ConstructionTest()255 void CharsetDetectionTest::ConstructionTest()
256 {
257     IcuTestErrorCode status(*this, "ConstructionTest");
258     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260     int32_t count = uenum_count(e.getAlias(), status);
261 
262 #ifdef DEBUG_DETECT
263     printf("There are %d recognizers.\n", count);
264 #endif
265 
266     for(int32_t i = 0; i < count; i += 1) {
267         int32_t length;
268         const char *name = uenum_next(e.getAlias(), &length, status);
269 
270         if(name == NULL || length <= 0) {
271             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272         }
273 
274 #ifdef DEBUG_DETECT
275         printf("%s\n", name);
276 #endif
277     }
278 
279     const char* defDisabled[] = {
280         "IBM420_rtl", "IBM420_ltr",
281         "IBM424_rtl", "IBM424_ltr",
282         0
283     };
284 
285     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
286     const char *activeName = NULL;
287 
288     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
289         // the charset must be included in all list
290         UBool found = FALSE;
291 
292         const char *name = NULL;
293         uenum_reset(e.getAlias(), status);
294         while ((name = uenum_next(e.getAlias(), NULL, status))) {
295             if (strcmp(activeName, name) == 0) {
296                 found = TRUE;
297                 break;
298             }
299         }
300 
301         if (!found) {
302             errln(UnicodeString(activeName) + " is not included in the all charset list.");
303         }
304 
305         // some charsets are disabled by default
306         found = FALSE;
307         for (int32_t i = 0; defDisabled[i] != 0; i++) {
308             if (strcmp(activeName, defDisabled[i]) == 0) {
309                 found = TRUE;
310                 break;
311             }
312         }
313         if (found) {
314             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
315         }
316     }
317 }
318 
UTF8Test()319 void CharsetDetectionTest::UTF8Test()
320 {
321     UErrorCode status = U_ZERO_ERROR;
322     UnicodeString ss = "This is a string with some non-ascii characters that will "
323                        "be converted to UTF-8, then shoved through the detection process.  "
324                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
325                        "Sure would be nice if our source could contain Unicode directly!";
326     UnicodeString s = ss.unescape();
327     int32_t byteLength = 0, sLength = s.length();
328     char *bytes = extractBytes(s, "UTF-8", byteLength);
329     UCharsetDetector *csd = ucsdet_open(&status);
330     const UCharsetMatch *match;
331     UChar *detected = NEW_ARRAY(UChar, sLength);
332 
333     ucsdet_setText(csd, bytes, byteLength, &status);
334     match = ucsdet_detect(csd, &status);
335 
336     if (match == NULL) {
337         errln("Detection failure for UTF-8: got no matches.");
338         goto bail;
339     }
340 
341     ucsdet_getUChars(match, detected, sLength, &status);
342 
343     if (s.compare(detected, sLength) != 0) {
344         errln("Round-trip test failed!");
345     }
346 
347     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348 
349 bail:
350     DELETE_ARRAY(detected);
351     freeBytes(bytes);
352     ucsdet_close(csd);
353 }
354 
UTF16Test()355 void CharsetDetectionTest::UTF16Test()
356 {
357     UErrorCode status = U_ZERO_ERROR;
358     /* Notice the BOM on the start of this string */
359     UChar chars[] = {
360         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364         0x064a, 0x062a, 0x0000};
365     UnicodeString s(chars);
366     int32_t beLength = 0, leLength = 0;
367     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
368     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
369     UCharsetDetector *csd = ucsdet_open(&status);
370     const UCharsetMatch *match;
371     const char *name;
372     int32_t conf;
373 
374     ucsdet_setText(csd, beBytes, beLength, &status);
375     match = ucsdet_detect(csd, &status);
376 
377     if (match == NULL) {
378         errln("Encoding detection failure for UTF-16BE: got no matches.");
379         goto try_le;
380     }
381 
382     name  = ucsdet_getName(match, &status);
383     conf  = ucsdet_getConfidence(match, &status);
384 
385     if (strcmp(name, "UTF-16BE") != 0) {
386         errln("Encoding detection failure for UTF-16BE: got %s", name);
387         goto try_le; // no point in looking at confidence if we got the wrong character set.
388     }
389 
390     if (conf != 100) {
391         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
392     }
393 
394 try_le:
395     ucsdet_setText(csd, leBytes, leLength, &status);
396     match = ucsdet_detect(csd, &status);
397 
398     if (match == NULL) {
399         errln("Encoding detection failure for UTF-16LE: got no matches.");
400         goto bail;
401     }
402 
403     name  = ucsdet_getName(match, &status);
404     conf = ucsdet_getConfidence(match, &status);
405 
406 
407     if (strcmp(name, "UTF-16LE") != 0) {
408         errln("Enconding detection failure for UTF-16LE: got %s", name);
409         goto bail; // no point in looking at confidence if we got the wrong character set.
410     }
411 
412     if (conf != 100) {
413         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
414     }
415 
416 bail:
417     freeBytes(leBytes);
418     freeBytes(beBytes);
419     ucsdet_close(csd);
420 }
421 
InputFilterTest()422 void CharsetDetectionTest::InputFilterTest()
423 {
424     UErrorCode status = U_ZERO_ERROR;
425     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
426     UnicodeString s  = ss.unescape();
427     int32_t byteLength = 0;
428     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
429     UCharsetDetector *csd = ucsdet_open(&status);
430     const UCharsetMatch *match;
431     const char *lang, *name;
432 
433     ucsdet_enableInputFilter(csd, TRUE);
434 
435     if (!ucsdet_isInputFilterEnabled(csd)) {
436         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
437     }
438 
439 
440     ucsdet_setText(csd, bytes, byteLength, &status);
441     match = ucsdet_detect(csd, &status);
442 
443     if (match == NULL) {
444         errln("Turning on the input filter resulted in no matches.");
445         goto turn_off;
446     }
447 
448     name = ucsdet_getName(match, &status);
449 
450     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
451         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
452     } else {
453         lang = ucsdet_getLanguage(match, &status);
454 
455         if (lang == NULL || strcmp(lang, "fr") != 0) {
456             errln("Input filter did not strip markup!");
457         }
458     }
459 
460 turn_off:
461     ucsdet_enableInputFilter(csd, FALSE);
462     ucsdet_setText(csd, bytes, byteLength, &status);
463     match = ucsdet_detect(csd, &status);
464 
465     if (match == NULL) {
466         errln("Turning off the input filter resulted in no matches.");
467         goto bail;
468     }
469 
470     name = ucsdet_getName(match, &status);
471 
472     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
473         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
474     } else {
475         lang = ucsdet_getLanguage(match, &status);
476 
477         if (lang == NULL || strcmp(lang, "en") != 0) {
478             errln("Unfiltered input did not detect as English!");
479         }
480     }
481 
482 bail:
483     freeBytes(bytes);
484     ucsdet_close(csd);
485 }
486 
C1BytesTest()487 void CharsetDetectionTest::C1BytesTest()
488 {
489 #if !UCONFIG_NO_LEGACY_CONVERSION
490     UErrorCode status = U_ZERO_ERROR;
491     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
492     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
493     UnicodeString sWindows  = ssWindows.unescape();
494     int32_t lISO = 0, lWindows = 0;
495     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
496     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
497     UCharsetDetector *csd = ucsdet_open(&status);
498     const UCharsetMatch *match;
499     const char *name;
500 
501     ucsdet_setText(csd, bWindows, lWindows, &status);
502     match = ucsdet_detect(csd, &status);
503 
504     if (match == NULL) {
505         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
506         goto bail;
507     }
508 
509     name  = ucsdet_getName(match, &status);
510 
511     if (strcmp(name, "windows-1252") != 0) {
512         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
513     }
514 
515     ucsdet_setText(csd, bISO, lISO, &status);
516     match = ucsdet_detect(csd, &status);
517 
518     if (match == NULL) {
519         errln("English text without C1 bytes got no matches.");
520         goto bail;
521     }
522 
523     name  = ucsdet_getName(match, &status);
524 
525     if (strcmp(name, "ISO-8859-1") != 0) {
526         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
527     }
528 
529 bail:
530     freeBytes(bWindows);
531     freeBytes(bISO);
532 
533     ucsdet_close(csd);
534 #endif
535 }
536 
DetectionTest()537 void CharsetDetectionTest::DetectionTest()
538 {
539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
540     UErrorCode status = U_ZERO_ERROR;
541     char path[2048];
542     const char *testFilePath = getPath(path, "csdetest.xml");
543 
544     if (testFilePath == NULL) {
545         return; /* Couldn't get path: error message already output. */
546     }
547 
548     UXMLParser  *parser = UXMLParser::createParser(status);
549     if (U_FAILURE(status)) {
550         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
551         return;
552     }
553 
554     UXMLElement *root   = parser->parseFile(testFilePath, status);
555     if (!assertSuccess( "parseFile",status)) return;
556 
557     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
558     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
559     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
560 
561     const UXMLElement *testCase;
562     int32_t tc = 0;
563 
564     while((testCase = root->nextChildElement(tc)) != NULL) {
565         if (testCase->getTagName().compare(test_case) == 0) {
566             const UnicodeString *id = testCase->getAttribute(id_attr);
567             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
568             const UnicodeString  text = testCase->getText(TRUE);
569             int32_t encodingCount;
570             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
571 
572             for(int32_t e = 0; e < encodingCount; e += 1) {
573                 checkEncoding(text, encodingList[e], *id);
574             }
575 
576             delete[] encodingList;
577         }
578     }
579 
580     delete root;
581     delete parser;
582 #endif
583 }
584 
IBM424Test()585 void CharsetDetectionTest::IBM424Test()
586 {
587     UErrorCode status = U_ZERO_ERROR;
588 
589     static const UChar chars[] = {
590             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
591             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
592             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
593             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
594             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
595             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
596             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
597             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
598             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
599             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
600             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
601             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
602             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
603             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
604             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
605             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
606             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
607     };
608 
609     static const UChar chars_reverse[] = {
610             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
611             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
612             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
613             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
614             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
615             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
616             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
617             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
618             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
619             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
620             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
621             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
622             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
623             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
624             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
625             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
626             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
627             0x0000
628     };
629 
630     int32_t bLength = 0, brLength = 0;
631 
632     UnicodeString s1(chars);
633     UnicodeString s2(chars_reverse);
634 
635     char *bytes = extractBytes(s1, "IBM424", bLength);
636     char *bytes_r = extractBytes(s2, "IBM424", brLength);
637 
638     UCharsetDetector *csd = ucsdet_open(&status);
639 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
640 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
641 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
642 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
643     if (U_FAILURE(status)) {
644         errln("Error opening charset detector. - %s", u_errorName(status));
645     }
646     const UCharsetMatch *match;
647     const char *name;
648 
649     ucsdet_setText(csd, bytes, bLength, &status);
650     match = ucsdet_detect(csd, &status);
651 
652     if (match == NULL) {
653         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
654         goto bail;
655     }
656 
657     name  = ucsdet_getName(match, &status);
658     if (strcmp(name, "IBM424_rtl") != 0) {
659         errln("Encoding detection failure for IBM424_rtl: got %s", name);
660     }
661 
662     ucsdet_setText(csd, bytes_r, brLength, &status);
663     match = ucsdet_detect(csd, &status);
664 
665     if (match == NULL) {
666         errln("Encoding detection failure for IBM424_ltr: got no matches.");
667         goto bail;
668     }
669 
670     name  = ucsdet_getName(match, &status);
671     if (strcmp(name, "IBM424_ltr") != 0) {
672         errln("Encoding detection failure for IBM424_ltr: got %s", name);
673     }
674 
675 bail:
676     freeBytes(bytes);
677     freeBytes(bytes_r);
678     ucsdet_close(csd);
679 }
680 
IBM420Test()681 void CharsetDetectionTest::IBM420Test()
682 {
683     UErrorCode status = U_ZERO_ERROR;
684 
685     static const UChar chars[] = {
686         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
687         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
688         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
689         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
690         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
691         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
692         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
693         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
694         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
695         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
696         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
697         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
698         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
699         0x0000
700     };
701     static const UChar chars_reverse[] = {
702         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
703         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
704         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
705         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
706         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
707         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
708         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
709         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
710         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
711         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
712         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
713         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
714         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
715         0x0000,
716     };
717 
718     int32_t bLength = 0, brLength = 0;
719 
720     UnicodeString s1(chars);
721     UnicodeString s2(chars_reverse);
722 
723     char *bytes = extractBytes(s1, "IBM420", bLength);
724     char *bytes_r = extractBytes(s2, "IBM420", brLength);
725 
726     UCharsetDetector *csd = ucsdet_open(&status);
727     if (U_FAILURE(status)) {
728         errln("Error opening charset detector. - %s", u_errorName(status));
729     }
730 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
731 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
732 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
733 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
734     const UCharsetMatch *match;
735     const char *name;
736 
737     ucsdet_setText(csd, bytes, bLength, &status);
738     match = ucsdet_detect(csd, &status);
739 
740     if (match == NULL) {
741         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
742         goto bail;
743     }
744 
745     name  = ucsdet_getName(match, &status);
746     if (strcmp(name, "IBM420_rtl") != 0) {
747         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
748     }
749 
750     ucsdet_setText(csd, bytes_r, brLength, &status);
751     match = ucsdet_detect(csd, &status);
752 
753     if (match == NULL) {
754         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
755         goto bail;
756     }
757 
758     name  = ucsdet_getName(match, &status);
759     if (strcmp(name, "IBM420_ltr") != 0) {
760         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
761     }
762 
763 bail:
764     freeBytes(bytes);
765     freeBytes(bytes_r);
766     ucsdet_close(csd);
767 }
768 
769 
Ticket6394Test()770 void CharsetDetectionTest::Ticket6394Test() {
771 #if !UCONFIG_NO_CONVERSION
772     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
773                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
774                              "encodings more than once.  The hop through UnicodeString is for platforms "
775                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
776     char latin1Text[sizeof(charText)];
777     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
778 
779     UErrorCode status = U_ZERO_ERROR;
780     UCharsetDetector *csd = ucsdet_open(&status);
781     ucsdet_setText(csd, latin1Text, -1, &status);
782     if (U_FAILURE(status)) {
783         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
784         return;
785     }
786 
787     int32_t matchCount = 0;
788     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
789     if (U_FAILURE(status)) {
790         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
791         return;
792     }
793 
794     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
795     int32_t i;
796     for (i=0; i<matchCount; i++) {
797         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
798         if (U_FAILURE(status)) {
799             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
800             status = U_ZERO_ERROR;
801         }
802         if (setOfCharsetNames.contains(charSetName)) {
803             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
804             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
805         }
806         setOfCharsetNames.add(charSetName);
807     }
808     ucsdet_close(csd);
809 #endif
810 }
811 
812 
813 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
814 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
815 //               Charset Recognizer objects, and could be overwritten.
Ticket6954Test()816 void CharsetDetectionTest::Ticket6954Test() {
817 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
818     UErrorCode status = U_ZERO_ERROR;
819     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
820     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
821                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
822     UnicodeString sWindows  = ssWindows.unescape();
823     int32_t lISO = 0, lWindows = 0;
824     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
825     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
826 
827     // First do a plain vanilla detect of 1252 text
828 
829     UCharsetDetector *csd1 = ucsdet_open(&status);
830     ucsdet_setText(csd1, bWindows, lWindows, &status);
831     const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
832     const char *name1 = ucsdet_getName(match1, &status);
833     TEST_ASSERT_SUCCESS(status);
834     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
835 
836     // Next, using a completely separate detector, detect some 8859-1 text
837 
838     UCharsetDetector *csd2 = ucsdet_open(&status);
839     ucsdet_setText(csd2, bISO, lISO, &status);
840     const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
841     const char *name2 = ucsdet_getName(match2, &status);
842     TEST_ASSERT_SUCCESS(status);
843     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
844 
845     // Recheck the 1252 results from the first detector, which should not have been
846     //  altered by the use of a different detector.
847 
848     name1 = ucsdet_getName(match1, &status);
849     TEST_ASSERT_SUCCESS(status);
850     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
851 
852     ucsdet_close(csd1);
853     ucsdet_close(csd2);
854     freeBytes(bISO);
855     freeBytes(bWindows);
856 #endif
857 }
858