1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10
11 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/unistr.h"
15 #include "unicode/putil.h"
16 #include "unicode/uniset.h"
17
18 #include "intltest.h"
19 #include "csdetest.h"
20
21 #include "xmlparser.h"
22
23 #include <memory>
24 #include <stdlib.h>
25 #include <string.h>
26
27 #ifdef DEBUG_DETECT
28 #include <stdio.h>
29 #endif
30
31
// Separator code points used when splitting the encoding lists of the test data.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a failure (file and line) when the condition x does not hold.
// Execution of the calling test continues afterwards.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (x) { \
        /* condition holds: nothing to report */ \
    } else { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END

// Reports a failure (file, line and error name) when errcode is a failure code,
// and then returns from the enclosing function.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, \
                   "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
47
48
49 //---------------------------------------------------------------------------
50 //
51 // Test class boilerplate
52 //
53 //---------------------------------------------------------------------------
CharsetDetectionTest()54 CharsetDetectionTest::CharsetDetectionTest()
55 {
56 }
57
58
~CharsetDetectionTest()59 CharsetDetectionTest::~CharsetDetectionTest()
60 {
61 }
62
63
64
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)65 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
66 {
67 if (exec) logln("TestSuite CharsetDetectionTest: ");
68 switch (index) {
69 case 0: name = "ConstructionTest";
70 if (exec) ConstructionTest();
71 break;
72
73 case 1: name = "UTF8Test";
74 if (exec) UTF8Test();
75 break;
76
77 case 2: name = "UTF16Test";
78 if (exec) UTF16Test();
79 break;
80
81 case 3: name = "C1BytesTest";
82 if (exec) C1BytesTest();
83 break;
84
85 case 4: name = "InputFilterTest";
86 if (exec) InputFilterTest();
87 break;
88
89 case 5: name = "DetectionTest";
90 if (exec) DetectionTest();
91 break;
92 #if !UCONFIG_NO_LEGACY_CONVERSION
93 case 6: name = "IBM424Test";
94 if (exec) IBM424Test();
95 break;
96
97 case 7: name = "IBM420Test";
98 if (exec) IBM420Test();
99 break;
100 #else
101 case 6:
102 case 7: name = "skip"; break;
103 #endif
104 case 8: name = "Ticket6394Test";
105 if (exec) Ticket6394Test();
106 break;
107
108 case 9: name = "Ticket6954Test";
109 if (exec) Ticket6954Test();
110 break;
111
112 case 10: name = "Ticket21823Test";
113 if (exec) Ticket21823Test();
114 break;
115
116 default: name = "";
117 break; //needed to end loop
118 }
119 }
120
// Splits src at every occurrence of ch.
//
// @param src     string to split
// @param ch      separator code unit
// @param splits  receives the number of pieces (always at least 1)
// @return        array of `splits` strings allocated with new[]; the caller
//                must release it with delete[]
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // There is always one more piece than there are separators.
    int32_t pieceCount = 1;
    for (int32_t pos = src.indexOf(ch, 0); pos >= 0; pos = src.indexOf(ch, pos + 1)) {
        ++pieceCount;
    }
    splits = pieceCount;

    UnicodeString *pieces = new UnicodeString[pieceCount];

    // Every piece but the last one is terminated by a separator.
    int32_t begin = 0;
    for (int32_t i = 0; i < pieceCount - 1; ++i) {
        const int32_t separator = src.indexOf(ch, begin);
        src.extractBetween(begin, separator, pieces[i]);
        begin = separator + 1;
    }

    // The last piece runs to the end of the string.
    src.extractBetween(begin, src.length(), pieces[pieceCount - 1]);

    return pieces;
}
145
extractBytes(const UnicodeString & source,const char * codepage,int32_t & length)146 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
147 {
148 int32_t sLength = source.length();
149 char *bytes = NULL;
150
151 length = source.extract(0, sLength, NULL, codepage);
152
153 if (length > 0) {
154 bytes = new char[length + 1];
155 source.extract(0, sLength, bytes, codepage);
156 }
157
158 return bytes;
159 }
160
checkEncoding(const UnicodeString & testString,const UnicodeString & encoding,const UnicodeString & id)161 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
162 {
163 int32_t splits = 0;
164 int32_t testLength = testString.length();
165 std::unique_ptr<UnicodeString []> eSplit(split(encoding, CH_SLASH, splits));
166 UErrorCode status = U_ZERO_ERROR;
167 int32_t cpLength = eSplit[0].length();
168 char codepage[64];
169
170 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
171 codepage[cpLength] = '\0';
172
173 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
174
175 int32_t byteLength = 0;
176 std::unique_ptr<char []> bytes(extractBytes(testString, codepage, byteLength));
177
178 if (! bytes) {
179 #if !UCONFIG_NO_LEGACY_CONVERSION
180 dataerrln("Can't open a " + encoding + " converter for " + id);
181 #endif
182 return;
183 }
184
185 ucsdet_setText(csd.getAlias(), bytes.get(), byteLength, &status);
186
187 int32_t matchCount = 0;
188 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
189
190
191 UnicodeString name(ucsdet_getName(matches[0], &status));
192 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
193 UChar *decoded = NULL;
194 int32_t dLength = 0;
195
196 if (matchCount == 0) {
197 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
198 return;
199 }
200
201 if (name.compare(eSplit[0]) != 0) {
202 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
203
204 #ifdef DEBUG_DETECT
205 for (int32_t m = 0; m < matchCount; m += 1) {
206 const char *name = ucsdet_getName(matches[m], &status);
207 const char *lang = ucsdet_getLanguage(matches[m], &status);
208 int32_t confidence = ucsdet_getConfidence(matches[m], &status);
209
210 printf("%s (%s) %d\n", name, lang, confidence);
211 }
212 #endif
213 return;
214 }
215
216 if (splits > 1 && lang.compare(eSplit[1]) != 0) {
217 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
218 return;
219 }
220
221 decoded = new UChar[testLength];
222 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
223
224 if (testString.compare(decoded, dLength) != 0) {
225 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yield the original string.");
226
227 #ifdef DEBUG_DETECT
228 for(int32_t i = 0; i < testLength; i += 1) {
229 if(testString[i] != decoded[i]) {
230 printf("Strings differ at byte %d\n", i);
231 break;
232 }
233 }
234 #endif
235
236 }
237
238 delete[] decoded;
239 }
240
getPath(char buffer[2048],const char * filename)241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242 UErrorCode status = U_ZERO_ERROR;
243 const char *testDataDirectory = IntlTest::getSourceTestData(status);
244
245 if (U_FAILURE(status)) {
246 errln("ERROR: getPath() failed - %s", u_errorName(status));
247 return NULL;
248 }
249
250 strcpy(buffer, testDataDirectory);
251 strcat(buffer, filename);
252 return buffer;
253 }
254
ConstructionTest()255 void CharsetDetectionTest::ConstructionTest()
256 {
257 IcuTestErrorCode status(*this, "ConstructionTest");
258 LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260 int32_t count = uenum_count(e.getAlias(), status);
261
262 #ifdef DEBUG_DETECT
263 printf("There are %d recognizers.\n", count);
264 #endif
265
266 for(int32_t i = 0; i < count; i += 1) {
267 int32_t length;
268 const char *name = uenum_next(e.getAlias(), &length, status);
269
270 if(name == NULL || length <= 0) {
271 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272 }
273
274 #ifdef DEBUG_DETECT
275 printf("%s\n", name);
276 #endif
277 }
278
279 const char* defDisabled[] = {
280 "IBM420_rtl", "IBM420_ltr",
281 "IBM424_rtl", "IBM424_ltr",
282 0
283 };
284
285 LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
286 const char *activeName = NULL;
287
288 while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
289 // the charset must be included in all list
290 UBool found = false;
291
292 const char *name = NULL;
293 uenum_reset(e.getAlias(), status);
294 while ((name = uenum_next(e.getAlias(), NULL, status))) {
295 if (strcmp(activeName, name) == 0) {
296 found = true;
297 break;
298 }
299 }
300
301 if (!found) {
302 errln(UnicodeString(activeName) + " is not included in the all charset list.");
303 }
304
305 // some charsets are disabled by default
306 found = false;
307 for (int32_t i = 0; defDisabled[i] != 0; i++) {
308 if (strcmp(activeName, defDisabled[i]) == 0) {
309 found = true;
310 break;
311 }
312 }
313 if (found) {
314 errln(UnicodeString(activeName) + " should not be included in the default charset list.");
315 }
316 }
317 }
318
UTF8Test()319 void CharsetDetectionTest::UTF8Test()
320 {
321 UErrorCode status = U_ZERO_ERROR;
322 UnicodeString ss = "This is a string with some non-ascii characters that will "
323 "be converted to UTF-8, then shoved through the detection process. "
324 "\\u0391\\u0392\\u0393\\u0394\\u0395"
325 "Sure would be nice if our source could contain Unicode directly!";
326 UnicodeString s = ss.unescape();
327 int32_t byteLength = 0, sLength = s.length();
328 char *bytes = extractBytes(s, "UTF-8", byteLength);
329 UCharsetDetector *csd = ucsdet_open(&status);
330 const UCharsetMatch *match;
331 UChar *detected = new UChar[sLength];
332
333 ucsdet_setText(csd, bytes, byteLength, &status);
334 match = ucsdet_detect(csd, &status);
335
336 if (match == NULL) {
337 errln("Detection failure for UTF-8: got no matches.");
338 goto bail;
339 }
340
341 ucsdet_getUChars(match, detected, sLength, &status);
342
343 if (s.compare(detected, sLength) != 0) {
344 errln("Round-trip test failed!");
345 }
346
347 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348
349 bail:
350 delete[] detected;
351 delete[] bytes;
352 ucsdet_close(csd);
353 }
354
UTF16Test()355 void CharsetDetectionTest::UTF16Test()
356 {
357 UErrorCode status = U_ZERO_ERROR;
358 /* Notice the BOM on the start of this string */
359 UChar chars[] = {
360 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364 0x064a, 0x062a, 0x0000};
365 UnicodeString s(chars);
366 int32_t beLength = 0, leLength = 0;
367 std::unique_ptr<char []>beBytes(extractBytes(s, "UTF-16BE", beLength));
368 std::unique_ptr<char []>leBytes(extractBytes(s, "UTF-16LE", leLength));
369 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
370 const UCharsetMatch *match;
371 const char *name;
372 int32_t conf;
373
374 ucsdet_setText(csd.getAlias(), beBytes.get(), beLength, &status);
375 match = ucsdet_detect(csd.getAlias(), &status);
376
377 if (match == NULL) {
378 errln("Encoding detection failure for UTF-16BE: got no matches.");
379 } else {
380
381 name = ucsdet_getName(match, &status);
382 conf = ucsdet_getConfidence(match, &status);
383
384 if (strcmp(name, "UTF-16BE") != 0) {
385 errln("Encoding detection failure for UTF-16BE: got %s", name);
386 } else if (conf != 100) {
387 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
388 }
389 }
390
391 ucsdet_setText(csd.getAlias(), leBytes.get(), leLength, &status);
392 match = ucsdet_detect(csd.getAlias(), &status);
393
394 if (match == NULL) {
395 errln("Encoding detection failure for UTF-16LE: got no matches.");
396 return;
397 }
398
399 name = ucsdet_getName(match, &status);
400 conf = ucsdet_getConfidence(match, &status);
401
402 if (strcmp(name, "UTF-16LE") != 0) {
403 errln("Encoding detection failure for UTF-16LE: got %s", name);
404 return;
405 }
406
407 if (conf != 100) {
408 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
409 }
410 }
411
InputFilterTest()412 void CharsetDetectionTest::InputFilterTest()
413 {
414 UErrorCode status = U_ZERO_ERROR;
415 UnicodeString s(u"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>");
416 int32_t byteLength = 0;
417 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
418 UCharsetDetector *csd = ucsdet_open(&status);
419 const UCharsetMatch *match;
420 const char *lang, *name;
421
422 ucsdet_enableInputFilter(csd, true);
423
424 if (!ucsdet_isInputFilterEnabled(csd)) {
425 errln("ucsdet_enableInputFilter(csd, true) did not enable input filter!");
426 }
427
428
429 ucsdet_setText(csd, bytes, byteLength, &status);
430 match = ucsdet_detect(csd, &status);
431
432 if (match == NULL) {
433 errln("Turning on the input filter resulted in no matches.");
434 goto turn_off;
435 }
436
437 name = ucsdet_getName(match, &status);
438
439 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
440 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
441 } else {
442 lang = ucsdet_getLanguage(match, &status);
443
444 if (lang == NULL || strcmp(lang, "fr") != 0) {
445 errln("Input filter did not strip markup!");
446 }
447 }
448
449 turn_off:
450 ucsdet_enableInputFilter(csd, false);
451 ucsdet_setText(csd, bytes, byteLength, &status);
452 match = ucsdet_detect(csd, &status);
453
454 if (match == NULL) {
455 errln("Turning off the input filter resulted in no matches.");
456 goto bail;
457 }
458
459 name = ucsdet_getName(match, &status);
460
461 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
462 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
463 } else {
464 lang = ucsdet_getLanguage(match, &status);
465
466 if (lang == NULL || strcmp(lang, "en") != 0) {
467 errln("Unfiltered input did not detect as English!");
468 }
469 }
470
471 bail:
472 delete[] bytes;
473 ucsdet_close(csd);
474 }
475
C1BytesTest()476 void CharsetDetectionTest::C1BytesTest()
477 {
478 #if !UCONFIG_NO_LEGACY_CONVERSION
479 UErrorCode status = U_ZERO_ERROR;
480 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
481 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
482 UnicodeString sWindows = ssWindows.unescape();
483 int32_t lISO = 0, lWindows = 0;
484 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
485 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
486 UCharsetDetector *csd = ucsdet_open(&status);
487 const UCharsetMatch *match;
488 const char *name;
489
490 ucsdet_setText(csd, bWindows, lWindows, &status);
491 match = ucsdet_detect(csd, &status);
492
493 if (match == NULL) {
494 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
495 goto bail;
496 }
497
498 name = ucsdet_getName(match, &status);
499
500 if (strcmp(name, "windows-1252") != 0) {
501 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
502 }
503
504 ucsdet_setText(csd, bISO, lISO, &status);
505 match = ucsdet_detect(csd, &status);
506
507 if (match == NULL) {
508 errln("English text without C1 bytes got no matches.");
509 goto bail;
510 }
511
512 name = ucsdet_getName(match, &status);
513
514 if (strcmp(name, "ISO-8859-1") != 0) {
515 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
516 }
517
518 bail:
519 delete[] bWindows;
520 delete[] bISO;
521
522 ucsdet_close(csd);
523 #endif
524 }
525
DetectionTest()526 void CharsetDetectionTest::DetectionTest()
527 {
528 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
529 UErrorCode status = U_ZERO_ERROR;
530 char path[2048];
531 const char *testFilePath = getPath(path, "csdetest.xml");
532
533 if (testFilePath == NULL) {
534 return; /* Couldn't get path: error message already output. */
535 }
536
537 UXMLParser *parser = UXMLParser::createParser(status);
538 if (U_FAILURE(status)) {
539 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
540 return;
541 }
542
543 UXMLElement *root = parser->parseFile(testFilePath, status);
544 if (!assertSuccess( "parseFile",status)) return;
545
546 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
547 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
548 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
549
550 const UXMLElement *testCase;
551 int32_t tc = 0;
552
553 while((testCase = root->nextChildElement(tc)) != NULL) {
554 if (testCase->getTagName().compare(test_case) == 0) {
555 const UnicodeString *id = testCase->getAttribute(id_attr);
556 const UnicodeString *encodings = testCase->getAttribute(enc_attr);
557 const UnicodeString text = testCase->getText(true);
558 int32_t encodingCount;
559 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
560
561 for(int32_t e = 0; e < encodingCount; e += 1) {
562 checkEncoding(text, encodingList[e], *id);
563 }
564
565 delete[] encodingList;
566 }
567 }
568
569 delete root;
570 delete parser;
571 #endif
572 }
573
IBM424Test()574 void CharsetDetectionTest::IBM424Test()
575 {
576 #if !UCONFIG_ONLY_HTML_CONVERSION
577 UErrorCode status = U_ZERO_ERROR;
578
579 static const UChar chars[] = {
580 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
581 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
582 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
583 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
584 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
585 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
586 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
587 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
588 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
589 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
590 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
591 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
592 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
593 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
594 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
595 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
596 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
597 };
598
599 static const UChar chars_reverse[] = {
600 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
601 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
602 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
603 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
604 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
605 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
606 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
607 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
608 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
609 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
610 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
611 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
612 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
613 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
614 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
615 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
616 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
617 0x0000
618 };
619
620 int32_t bLength = 0, brLength = 0;
621
622 UnicodeString s1(chars);
623 UnicodeString s2(chars_reverse);
624
625 char *bytes = extractBytes(s1, "IBM424", bLength);
626 char *bytes_r = extractBytes(s2, "IBM424", brLength);
627
628 UCharsetDetector *csd = ucsdet_open(&status);
629 ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status);
630 ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status);
631 ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status);
632 ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status);
633 if (U_FAILURE(status)) {
634 errln("Error opening charset detector. - %s", u_errorName(status));
635 }
636 const UCharsetMatch *match;
637 const char *name;
638
639 ucsdet_setText(csd, bytes, bLength, &status);
640 match = ucsdet_detect(csd, &status);
641
642 if (match == NULL) {
643 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
644 goto bail;
645 }
646
647 name = ucsdet_getName(match, &status);
648 if (strcmp(name, "IBM424_rtl") != 0) {
649 errln("Encoding detection failure for IBM424_rtl: got %s", name);
650 }
651
652 ucsdet_setText(csd, bytes_r, brLength, &status);
653 match = ucsdet_detect(csd, &status);
654
655 if (match == NULL) {
656 errln("Encoding detection failure for IBM424_ltr: got no matches.");
657 goto bail;
658 }
659
660 name = ucsdet_getName(match, &status);
661 if (strcmp(name, "IBM424_ltr") != 0) {
662 errln("Encoding detection failure for IBM424_ltr: got %s", name);
663 }
664
665 bail:
666 delete[] bytes;
667 delete[] bytes_r;
668 ucsdet_close(csd);
669 #endif
670 }
671
IBM420Test()672 void CharsetDetectionTest::IBM420Test()
673 {
674 #if !UCONFIG_ONLY_HTML_CONVERSION
675 UErrorCode status = U_ZERO_ERROR;
676
677 static const UChar chars[] = {
678 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
679 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
680 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
681 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
682 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
683 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
684 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
685 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
686 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
687 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
688 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
689 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
690 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
691 0x0000
692 };
693 static const UChar chars_reverse[] = {
694 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
695 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
696 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
697 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
698 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
699 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
700 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
701 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
702 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
703 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
704 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
705 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
706 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
707 0x0000,
708 };
709
710 int32_t bLength = 0, brLength = 0;
711
712 UnicodeString s1(chars);
713 UnicodeString s2(chars_reverse);
714
715 char *bytes = extractBytes(s1, "IBM420", bLength);
716 char *bytes_r = extractBytes(s2, "IBM420", brLength);
717
718 UCharsetDetector *csd = ucsdet_open(&status);
719 if (U_FAILURE(status)) {
720 errln("Error opening charset detector. - %s", u_errorName(status));
721 }
722 ucsdet_setDetectableCharset(csd, "IBM424_rtl", true, &status);
723 ucsdet_setDetectableCharset(csd, "IBM424_ltr", true, &status);
724 ucsdet_setDetectableCharset(csd, "IBM420_rtl", true, &status);
725 ucsdet_setDetectableCharset(csd, "IBM420_ltr", true, &status);
726 const UCharsetMatch *match;
727 const char *name;
728
729 ucsdet_setText(csd, bytes, bLength, &status);
730 match = ucsdet_detect(csd, &status);
731
732 if (match == NULL) {
733 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
734 goto bail;
735 }
736
737 name = ucsdet_getName(match, &status);
738 if (strcmp(name, "IBM420_rtl") != 0) {
739 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
740 }
741
742 ucsdet_setText(csd, bytes_r, brLength, &status);
743 match = ucsdet_detect(csd, &status);
744
745 if (match == NULL) {
746 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
747 goto bail;
748 }
749
750 name = ucsdet_getName(match, &status);
751 if (strcmp(name, "IBM420_ltr") != 0) {
752 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
753 }
754
755 bail:
756 delete[] bytes;
757 delete[] bytes_r;
758 ucsdet_close(csd);
759 #endif
760 }
761
762
Ticket6394Test()763 void CharsetDetectionTest::Ticket6394Test() {
764 #if !UCONFIG_NO_CONVERSION
765 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
766 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
767 "encodings more than once. The hop through UnicodeString is for platforms "
768 "where this char * string is be EBCDIC and needs conversion to Latin1.";
769 char latin1Text[sizeof(charText)];
770 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
771
772 UErrorCode status = U_ZERO_ERROR;
773 UCharsetDetector *csd = ucsdet_open(&status);
774 ucsdet_setText(csd, latin1Text, -1, &status);
775 if (U_FAILURE(status)) {
776 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
777 return;
778 }
779
780 int32_t matchCount = 0;
781 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
782 if (U_FAILURE(status)) {
783 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
784 return;
785 }
786
787 UnicodeSet setOfCharsetNames; // UnicodeSets can hold strings.
788 int32_t i;
789 for (i=0; i<matchCount; i++) {
790 UnicodeString charSetName(ucsdet_getName(matches[i], &status));
791 if (U_FAILURE(status)) {
792 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
793 status = U_ZERO_ERROR;
794 }
795 if (setOfCharsetNames.contains(charSetName)) {
796 errln("Fail at file %s, line %d ", __FILE__, __LINE__);
797 errln(UnicodeString(" Duplicate charset name = ") + charSetName);
798 }
799 setOfCharsetNames.add(charSetName);
800 }
801 ucsdet_close(csd);
802 #endif
803 }
804
805
806 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
807 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
808 // Charset Recognizer objects, and could be overwritten.
Ticket6954Test()809 void CharsetDetectionTest::Ticket6954Test() {
810 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
811 UErrorCode status = U_ZERO_ERROR;
812 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
813 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
814 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
815 UnicodeString sWindows = ssWindows.unescape();
816 int32_t lISO = 0, lWindows = 0;
817 std::unique_ptr<char[]> bISO(extractBytes(sISO, "ISO-8859-1", lISO));
818 std::unique_ptr<char[]> bWindows(extractBytes(sWindows, "windows-1252", lWindows));
819
820 // First do a plain vanilla detect of 1252 text
821
822 LocalUCharsetDetectorPointer csd1(ucsdet_open(&status));
823 ucsdet_setText(csd1.getAlias(), bWindows.get(), lWindows, &status);
824 const UCharsetMatch *match1 = ucsdet_detect(csd1.getAlias(), &status);
825 const char *name1 = ucsdet_getName(match1, &status);
826 TEST_ASSERT_SUCCESS(status);
827 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
828
829 // Next, using a completely separate detector, detect some 8859-1 text
830
831 LocalUCharsetDetectorPointer csd2(ucsdet_open(&status));
832 ucsdet_setText(csd2.getAlias(), bISO.get(), lISO, &status);
833 const UCharsetMatch *match2 = ucsdet_detect(csd2.getAlias(), &status);
834 const char *name2 = ucsdet_getName(match2, &status);
835 TEST_ASSERT_SUCCESS(status);
836 TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
837
838 // Recheck the 1252 results from the first detector, which should not have been
839 // altered by the use of a different detector.
840
841 name1 = ucsdet_getName(match1, &status);
842 TEST_ASSERT_SUCCESS(status);
843 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
844 #endif
845 }
846
847
// Ticket 21823 - Issue with Charset Detector for ill-formed input strings.
//                Its fix involves returning a failure error code
//                (U_INVALID_CHAR_FOUND) in case no charsets appear to match the input data.
Ticket21823Test()851 void CharsetDetectionTest::Ticket21823Test() {
852 UErrorCode status = U_ZERO_ERROR;
853 std::string str = "\x80";
854 UCharsetDetector* csd = ucsdet_open(&status);
855
856 ucsdet_setText(csd, str.data(), str.length(), &status);
857 const UCharsetMatch* match = ucsdet_detect(csd, &status);
858
859 if (match == NULL) {
860 TEST_ASSERT(U_FAILURE(status));
861 }
862
863 ucsdet_close(csd);
864 }
865