1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10
11 #include "unicode/utypes.h"
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/unistr.h"
15 #include "unicode/putil.h"
16 #include "unicode/uniset.h"
17
18 #include "intltest.h"
19 #include "csdetest.h"
20
21 #include "xmlparser.h"
22
23 #include <memory>
24 #include <stdlib.h>
25 #include <string.h>
26
27 #ifdef DEBUG_DETECT
28 #include <stdio.h>
29 #endif
30
31
// Delimiter code points handed to split(): U+0020 SPACE separates the entries
// of an "encodings" attribute list, U+002F SOLIDUS separates the parts of a
// single entry (checkEncoding() treats part 0 as the charset name and, when
// present, part 1 as the expected language).
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F
34
// Soft assertion: when the condition x is false, reports a failure through
// errln() with the current file and line. Execution continues afterwards.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
} UPRV_BLOCK_MACRO_END
40
// Hard assertion on a UErrorCode: when errcode is a failure, reports it through
// errcheckln() (file, line and error name) and returns from the enclosing
// function. The bare "return;" means it can only be used in functions that
// return void.
#define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END
47
48
49 //---------------------------------------------------------------------------
50 //
51 // Test class boilerplate
52 //
53 //---------------------------------------------------------------------------
CharsetDetectionTest()54 CharsetDetectionTest::CharsetDetectionTest()
55 {
56 }
57
58
~CharsetDetectionTest()59 CharsetDetectionTest::~CharsetDetectionTest()
60 {
61 }
62
63
64
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)65 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
66 {
67 if (exec) logln("TestSuite CharsetDetectionTest: ");
68 switch (index) {
69 case 0: name = "ConstructionTest";
70 if (exec) ConstructionTest();
71 break;
72
73 case 1: name = "UTF8Test";
74 if (exec) UTF8Test();
75 break;
76
77 case 2: name = "UTF16Test";
78 if (exec) UTF16Test();
79 break;
80
81 case 3: name = "C1BytesTest";
82 if (exec) C1BytesTest();
83 break;
84
85 case 4: name = "InputFilterTest";
86 if (exec) InputFilterTest();
87 break;
88
89 case 5: name = "DetectionTest";
90 if (exec) DetectionTest();
91 break;
92 #if !UCONFIG_NO_LEGACY_CONVERSION
93 case 6: name = "IBM424Test";
94 if (exec) IBM424Test();
95 break;
96
97 case 7: name = "IBM420Test";
98 if (exec) IBM420Test();
99 break;
100 #else
101 case 6:
102 case 7: name = "skip"; break;
103 #endif
104 case 8: name = "Ticket6394Test";
105 if (exec) Ticket6394Test();
106 break;
107
108 case 9: name = "Ticket6954Test";
109 if (exec) Ticket6954Test();
110 break;
111
112 default: name = "";
113 break; //needed to end loop
114 }
115 }
116
// Splits src at every occurrence of ch.
//
//   src    - string to split
//   ch     - delimiter code unit
//   splits - receives the number of pieces (delimiter count + 1, so always >= 1)
//
// Returns an array of `splits` strings allocated with new[]; the caller must
// release it with delete[]. Empty pieces are kept.
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // Pass 1: one piece to start with, plus one more per delimiter found.
    int32_t pieceCount = 1;
    for (int32_t pos = src.indexOf(ch, 0); pos >= 0; pos = src.indexOf(ch, pos + 1)) {
        ++pieceCount;
    }
    splits = pieceCount;

    UnicodeString *pieces = new UnicodeString[pieceCount];

    // Pass 2: copy out the text between consecutive delimiters.
    int32_t pieceStart = 0;
    int32_t filled = 0;
    for (int32_t delim = src.indexOf(ch, pieceStart);
         delim >= 0;
         delim = src.indexOf(ch, pieceStart)) {
        src.extractBetween(pieceStart, delim, pieces[filled]);
        ++filled;
        pieceStart = delim + 1;
    }

    // Whatever follows the last delimiter (or the whole string if there was none).
    src.extractBetween(pieceStart, src.length(), pieces[filled]);

    return pieces;
}
141
extractBytes(const UnicodeString & source,const char * codepage,int32_t & length)142 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
143 {
144 int32_t sLength = source.length();
145 char *bytes = NULL;
146
147 length = source.extract(0, sLength, NULL, codepage);
148
149 if (length > 0) {
150 bytes = new char[length + 1];
151 source.extract(0, sLength, bytes, codepage);
152 }
153
154 return bytes;
155 }
156
checkEncoding(const UnicodeString & testString,const UnicodeString & encoding,const UnicodeString & id)157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158 {
159 int32_t splits = 0;
160 int32_t testLength = testString.length();
161 std::unique_ptr<UnicodeString []> eSplit(split(encoding, CH_SLASH, splits));
162 UErrorCode status = U_ZERO_ERROR;
163 int32_t cpLength = eSplit[0].length();
164 char codepage[64];
165
166 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167 codepage[cpLength] = '\0';
168
169 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170
171 int32_t byteLength = 0;
172 std::unique_ptr<char []> bytes(extractBytes(testString, codepage, byteLength));
173
174 if (! bytes) {
175 #if !UCONFIG_NO_LEGACY_CONVERSION
176 dataerrln("Can't open a " + encoding + " converter for " + id);
177 #endif
178 return;
179 }
180
181 ucsdet_setText(csd.getAlias(), bytes.get(), byteLength, &status);
182
183 int32_t matchCount = 0;
184 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185
186
187 UnicodeString name(ucsdet_getName(matches[0], &status));
188 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189 UChar *decoded = NULL;
190 int32_t dLength = 0;
191
192 if (matchCount == 0) {
193 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194 return;
195 }
196
197 if (name.compare(eSplit[0]) != 0) {
198 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199
200 #ifdef DEBUG_DETECT
201 for (int32_t m = 0; m < matchCount; m += 1) {
202 const char *name = ucsdet_getName(matches[m], &status);
203 const char *lang = ucsdet_getLanguage(matches[m], &status);
204 int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205
206 printf("%s (%s) %d\n", name, lang, confidence);
207 }
208 #endif
209 return;
210 }
211
212 if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214 return;
215 }
216
217 decoded = new UChar[testLength];
218 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219
220 if (testString.compare(decoded, dLength) != 0) {
221 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222
223 #ifdef DEBUG_DETECT
224 for(int32_t i = 0; i < testLength; i += 1) {
225 if(testString[i] != decoded[i]) {
226 printf("Strings differ at byte %d\n", i);
227 break;
228 }
229 }
230 #endif
231
232 }
233
234 delete[] decoded;
235 }
236
getPath(char buffer[2048],const char * filename)237 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
238 UErrorCode status = U_ZERO_ERROR;
239 const char *testDataDirectory = IntlTest::getSourceTestData(status);
240
241 if (U_FAILURE(status)) {
242 errln("ERROR: getPath() failed - %s", u_errorName(status));
243 return NULL;
244 }
245
246 strcpy(buffer, testDataDirectory);
247 strcat(buffer, filename);
248 return buffer;
249 }
250
ConstructionTest()251 void CharsetDetectionTest::ConstructionTest()
252 {
253 IcuTestErrorCode status(*this, "ConstructionTest");
254 LocalUCharsetDetectorPointer csd(ucsdet_open(status));
255 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
256 int32_t count = uenum_count(e.getAlias(), status);
257
258 #ifdef DEBUG_DETECT
259 printf("There are %d recognizers.\n", count);
260 #endif
261
262 for(int32_t i = 0; i < count; i += 1) {
263 int32_t length;
264 const char *name = uenum_next(e.getAlias(), &length, status);
265
266 if(name == NULL || length <= 0) {
267 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
268 }
269
270 #ifdef DEBUG_DETECT
271 printf("%s\n", name);
272 #endif
273 }
274
275 const char* defDisabled[] = {
276 "IBM420_rtl", "IBM420_ltr",
277 "IBM424_rtl", "IBM424_ltr",
278 0
279 };
280
281 LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
282 const char *activeName = NULL;
283
284 while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
285 // the charset must be included in all list
286 UBool found = FALSE;
287
288 const char *name = NULL;
289 uenum_reset(e.getAlias(), status);
290 while ((name = uenum_next(e.getAlias(), NULL, status))) {
291 if (strcmp(activeName, name) == 0) {
292 found = TRUE;
293 break;
294 }
295 }
296
297 if (!found) {
298 errln(UnicodeString(activeName) + " is not included in the all charset list.");
299 }
300
301 // some charsets are disabled by default
302 found = FALSE;
303 for (int32_t i = 0; defDisabled[i] != 0; i++) {
304 if (strcmp(activeName, defDisabled[i]) == 0) {
305 found = TRUE;
306 break;
307 }
308 }
309 if (found) {
310 errln(UnicodeString(activeName) + " should not be included in the default charset list.");
311 }
312 }
313 }
314
UTF8Test()315 void CharsetDetectionTest::UTF8Test()
316 {
317 UErrorCode status = U_ZERO_ERROR;
318 UnicodeString ss = "This is a string with some non-ascii characters that will "
319 "be converted to UTF-8, then shoved through the detection process. "
320 "\\u0391\\u0392\\u0393\\u0394\\u0395"
321 "Sure would be nice if our source could contain Unicode directly!";
322 UnicodeString s = ss.unescape();
323 int32_t byteLength = 0, sLength = s.length();
324 char *bytes = extractBytes(s, "UTF-8", byteLength);
325 UCharsetDetector *csd = ucsdet_open(&status);
326 const UCharsetMatch *match;
327 UChar *detected = new UChar[sLength];
328
329 ucsdet_setText(csd, bytes, byteLength, &status);
330 match = ucsdet_detect(csd, &status);
331
332 if (match == NULL) {
333 errln("Detection failure for UTF-8: got no matches.");
334 goto bail;
335 }
336
337 ucsdet_getUChars(match, detected, sLength, &status);
338
339 if (s.compare(detected, sLength) != 0) {
340 errln("Round-trip test failed!");
341 }
342
343 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
344
345 bail:
346 delete[] detected;
347 delete[] bytes;
348 ucsdet_close(csd);
349 }
350
UTF16Test()351 void CharsetDetectionTest::UTF16Test()
352 {
353 UErrorCode status = U_ZERO_ERROR;
354 /* Notice the BOM on the start of this string */
355 UChar chars[] = {
356 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
357 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
358 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
359 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
360 0x064a, 0x062a, 0x0000};
361 UnicodeString s(chars);
362 int32_t beLength = 0, leLength = 0;
363 std::unique_ptr<char []>beBytes(extractBytes(s, "UTF-16BE", beLength));
364 std::unique_ptr<char []>leBytes(extractBytes(s, "UTF-16LE", leLength));
365 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
366 const UCharsetMatch *match;
367 const char *name;
368 int32_t conf;
369
370 ucsdet_setText(csd.getAlias(), beBytes.get(), beLength, &status);
371 match = ucsdet_detect(csd.getAlias(), &status);
372
373 if (match == NULL) {
374 errln("Encoding detection failure for UTF-16BE: got no matches.");
375 } else {
376
377 name = ucsdet_getName(match, &status);
378 conf = ucsdet_getConfidence(match, &status);
379
380 if (strcmp(name, "UTF-16BE") != 0) {
381 errln("Encoding detection failure for UTF-16BE: got %s", name);
382 } else if (conf != 100) {
383 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
384 }
385 }
386
387 ucsdet_setText(csd.getAlias(), leBytes.get(), leLength, &status);
388 match = ucsdet_detect(csd.getAlias(), &status);
389
390 if (match == NULL) {
391 errln("Encoding detection failure for UTF-16LE: got no matches.");
392 return;
393 }
394
395 name = ucsdet_getName(match, &status);
396 conf = ucsdet_getConfidence(match, &status);
397
398 if (strcmp(name, "UTF-16LE") != 0) {
399 errln("Enconding detection failure for UTF-16LE: got %s", name);
400 return;
401 }
402
403 if (conf != 100) {
404 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
405 }
406 }
407
InputFilterTest()408 void CharsetDetectionTest::InputFilterTest()
409 {
410 UErrorCode status = U_ZERO_ERROR;
411 UnicodeString s(u"<a> <lot> <of> <English> <inside> <the> <markup> Un très petit peu de Français. <to> <confuse> <the> <detector>");
412 int32_t byteLength = 0;
413 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
414 UCharsetDetector *csd = ucsdet_open(&status);
415 const UCharsetMatch *match;
416 const char *lang, *name;
417
418 ucsdet_enableInputFilter(csd, TRUE);
419
420 if (!ucsdet_isInputFilterEnabled(csd)) {
421 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
422 }
423
424
425 ucsdet_setText(csd, bytes, byteLength, &status);
426 match = ucsdet_detect(csd, &status);
427
428 if (match == NULL) {
429 errln("Turning on the input filter resulted in no matches.");
430 goto turn_off;
431 }
432
433 name = ucsdet_getName(match, &status);
434
435 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
436 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
437 } else {
438 lang = ucsdet_getLanguage(match, &status);
439
440 if (lang == NULL || strcmp(lang, "fr") != 0) {
441 errln("Input filter did not strip markup!");
442 }
443 }
444
445 turn_off:
446 ucsdet_enableInputFilter(csd, FALSE);
447 ucsdet_setText(csd, bytes, byteLength, &status);
448 match = ucsdet_detect(csd, &status);
449
450 if (match == NULL) {
451 errln("Turning off the input filter resulted in no matches.");
452 goto bail;
453 }
454
455 name = ucsdet_getName(match, &status);
456
457 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
458 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
459 } else {
460 lang = ucsdet_getLanguage(match, &status);
461
462 if (lang == NULL || strcmp(lang, "en") != 0) {
463 errln("Unfiltered input did not detect as English!");
464 }
465 }
466
467 bail:
468 delete[] bytes;
469 ucsdet_close(csd);
470 }
471
C1BytesTest()472 void CharsetDetectionTest::C1BytesTest()
473 {
474 #if !UCONFIG_NO_LEGACY_CONVERSION
475 UErrorCode status = U_ZERO_ERROR;
476 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
477 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
478 UnicodeString sWindows = ssWindows.unescape();
479 int32_t lISO = 0, lWindows = 0;
480 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
481 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
482 UCharsetDetector *csd = ucsdet_open(&status);
483 const UCharsetMatch *match;
484 const char *name;
485
486 ucsdet_setText(csd, bWindows, lWindows, &status);
487 match = ucsdet_detect(csd, &status);
488
489 if (match == NULL) {
490 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
491 goto bail;
492 }
493
494 name = ucsdet_getName(match, &status);
495
496 if (strcmp(name, "windows-1252") != 0) {
497 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
498 }
499
500 ucsdet_setText(csd, bISO, lISO, &status);
501 match = ucsdet_detect(csd, &status);
502
503 if (match == NULL) {
504 errln("English text without C1 bytes got no matches.");
505 goto bail;
506 }
507
508 name = ucsdet_getName(match, &status);
509
510 if (strcmp(name, "ISO-8859-1") != 0) {
511 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
512 }
513
514 bail:
515 delete[] bWindows;
516 delete[] bISO;
517
518 ucsdet_close(csd);
519 #endif
520 }
521
DetectionTest()522 void CharsetDetectionTest::DetectionTest()
523 {
524 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
525 UErrorCode status = U_ZERO_ERROR;
526 char path[2048];
527 const char *testFilePath = getPath(path, "csdetest.xml");
528
529 if (testFilePath == NULL) {
530 return; /* Couldn't get path: error message already output. */
531 }
532
533 UXMLParser *parser = UXMLParser::createParser(status);
534 if (U_FAILURE(status)) {
535 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
536 return;
537 }
538
539 UXMLElement *root = parser->parseFile(testFilePath, status);
540 if (!assertSuccess( "parseFile",status)) return;
541
542 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
543 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
544 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
545
546 const UXMLElement *testCase;
547 int32_t tc = 0;
548
549 while((testCase = root->nextChildElement(tc)) != NULL) {
550 if (testCase->getTagName().compare(test_case) == 0) {
551 const UnicodeString *id = testCase->getAttribute(id_attr);
552 const UnicodeString *encodings = testCase->getAttribute(enc_attr);
553 const UnicodeString text = testCase->getText(TRUE);
554 int32_t encodingCount;
555 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
556
557 for(int32_t e = 0; e < encodingCount; e += 1) {
558 checkEncoding(text, encodingList[e], *id);
559 }
560
561 delete[] encodingList;
562 }
563 }
564
565 delete root;
566 delete parser;
567 #endif
568 }
569
IBM424Test()570 void CharsetDetectionTest::IBM424Test()
571 {
572 #if !UCONFIG_ONLY_HTML_CONVERSION
573 UErrorCode status = U_ZERO_ERROR;
574
575 static const UChar chars[] = {
576 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
577 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
578 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
579 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
580 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
581 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
582 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
583 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
584 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
585 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
586 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
587 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
588 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
589 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
590 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
591 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
592 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
593 };
594
595 static const UChar chars_reverse[] = {
596 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
597 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
598 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
599 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
600 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
601 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
602 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
603 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
604 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
605 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
606 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
607 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
608 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
609 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
610 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
611 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
612 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
613 0x0000
614 };
615
616 int32_t bLength = 0, brLength = 0;
617
618 UnicodeString s1(chars);
619 UnicodeString s2(chars_reverse);
620
621 char *bytes = extractBytes(s1, "IBM424", bLength);
622 char *bytes_r = extractBytes(s2, "IBM424", brLength);
623
624 UCharsetDetector *csd = ucsdet_open(&status);
625 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
626 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
627 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
628 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
629 if (U_FAILURE(status)) {
630 errln("Error opening charset detector. - %s", u_errorName(status));
631 }
632 const UCharsetMatch *match;
633 const char *name;
634
635 ucsdet_setText(csd, bytes, bLength, &status);
636 match = ucsdet_detect(csd, &status);
637
638 if (match == NULL) {
639 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
640 goto bail;
641 }
642
643 name = ucsdet_getName(match, &status);
644 if (strcmp(name, "IBM424_rtl") != 0) {
645 errln("Encoding detection failure for IBM424_rtl: got %s", name);
646 }
647
648 ucsdet_setText(csd, bytes_r, brLength, &status);
649 match = ucsdet_detect(csd, &status);
650
651 if (match == NULL) {
652 errln("Encoding detection failure for IBM424_ltr: got no matches.");
653 goto bail;
654 }
655
656 name = ucsdet_getName(match, &status);
657 if (strcmp(name, "IBM424_ltr") != 0) {
658 errln("Encoding detection failure for IBM424_ltr: got %s", name);
659 }
660
661 bail:
662 delete[] bytes;
663 delete[] bytes_r;
664 ucsdet_close(csd);
665 #endif
666 }
667
IBM420Test()668 void CharsetDetectionTest::IBM420Test()
669 {
670 #if !UCONFIG_ONLY_HTML_CONVERSION
671 UErrorCode status = U_ZERO_ERROR;
672
673 static const UChar chars[] = {
674 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
675 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
676 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
677 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
678 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
679 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
680 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
681 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
682 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
683 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
684 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
685 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
686 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
687 0x0000
688 };
689 static const UChar chars_reverse[] = {
690 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
691 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
692 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
693 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
694 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
695 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
696 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
697 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
698 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
699 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
700 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
701 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
702 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
703 0x0000,
704 };
705
706 int32_t bLength = 0, brLength = 0;
707
708 UnicodeString s1(chars);
709 UnicodeString s2(chars_reverse);
710
711 char *bytes = extractBytes(s1, "IBM420", bLength);
712 char *bytes_r = extractBytes(s2, "IBM420", brLength);
713
714 UCharsetDetector *csd = ucsdet_open(&status);
715 if (U_FAILURE(status)) {
716 errln("Error opening charset detector. - %s", u_errorName(status));
717 }
718 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
719 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
720 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
721 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
722 const UCharsetMatch *match;
723 const char *name;
724
725 ucsdet_setText(csd, bytes, bLength, &status);
726 match = ucsdet_detect(csd, &status);
727
728 if (match == NULL) {
729 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
730 goto bail;
731 }
732
733 name = ucsdet_getName(match, &status);
734 if (strcmp(name, "IBM420_rtl") != 0) {
735 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
736 }
737
738 ucsdet_setText(csd, bytes_r, brLength, &status);
739 match = ucsdet_detect(csd, &status);
740
741 if (match == NULL) {
742 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
743 goto bail;
744 }
745
746 name = ucsdet_getName(match, &status);
747 if (strcmp(name, "IBM420_ltr") != 0) {
748 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
749 }
750
751 bail:
752 delete[] bytes;
753 delete[] bytes_r;
754 ucsdet_close(csd);
755 #endif
756 }
757
758
Ticket6394Test()759 void CharsetDetectionTest::Ticket6394Test() {
760 #if !UCONFIG_NO_CONVERSION
761 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
762 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
763 "encodings more than once. The hop through UnicodeString is for platforms "
764 "where this char * string is be EBCDIC and needs conversion to Latin1.";
765 char latin1Text[sizeof(charText)];
766 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
767
768 UErrorCode status = U_ZERO_ERROR;
769 UCharsetDetector *csd = ucsdet_open(&status);
770 ucsdet_setText(csd, latin1Text, -1, &status);
771 if (U_FAILURE(status)) {
772 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
773 return;
774 }
775
776 int32_t matchCount = 0;
777 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
778 if (U_FAILURE(status)) {
779 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
780 return;
781 }
782
783 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
784 int32_t i;
785 for (i=0; i<matchCount; i++) {
786 UnicodeString charSetName(ucsdet_getName(matches[i], &status));
787 if (U_FAILURE(status)) {
788 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
789 status = U_ZERO_ERROR;
790 }
791 if (setOfCharsetNames.contains(charSetName)) {
792 errln("Fail at file %s, line %d ", __FILE__, __LINE__);
793 errln(UnicodeString(" Duplicate charset name = ") + charSetName);
794 }
795 setOfCharsetNames.add(charSetName);
796 }
797 ucsdet_close(csd);
798 #endif
799 }
800
801
802 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
803 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
804 // Charset Recognizer objects, and could be overwritten.
Ticket6954Test()805 void CharsetDetectionTest::Ticket6954Test() {
806 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
807 UErrorCode status = U_ZERO_ERROR;
808 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
809 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
810 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
811 UnicodeString sWindows = ssWindows.unescape();
812 int32_t lISO = 0, lWindows = 0;
813 std::unique_ptr<char[]> bISO(extractBytes(sISO, "ISO-8859-1", lISO));
814 std::unique_ptr<char[]> bWindows(extractBytes(sWindows, "windows-1252", lWindows));
815
816 // First do a plain vanilla detect of 1252 text
817
818 LocalUCharsetDetectorPointer csd1(ucsdet_open(&status));
819 ucsdet_setText(csd1.getAlias(), bWindows.get(), lWindows, &status);
820 const UCharsetMatch *match1 = ucsdet_detect(csd1.getAlias(), &status);
821 const char *name1 = ucsdet_getName(match1, &status);
822 TEST_ASSERT_SUCCESS(status);
823 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
824
825 // Next, using a completely separate detector, detect some 8859-1 text
826
827 LocalUCharsetDetectorPointer csd2(ucsdet_open(&status));
828 ucsdet_setText(csd2.getAlias(), bISO.get(), lISO, &status);
829 const UCharsetMatch *match2 = ucsdet_detect(csd2.getAlias(), &status);
830 const char *name2 = ucsdet_getName(match2, &status);
831 TEST_ASSERT_SUCCESS(status);
832 TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
833
834 // Recheck the 1252 results from the first detector, which should not have been
835 // altered by the use of a different detector.
836
837 name1 = ucsdet_getName(match1, &status);
838 TEST_ASSERT_SUCCESS(status);
839 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
840 #endif
841 }
842