1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8
9 #include "unicode/utypes.h"
10 #include "unicode/ucsdet.h"
11 #include "unicode/ucnv.h"
12 #include "unicode/unistr.h"
13 #include "unicode/putil.h"
14 #include "unicode/uniset.h"
15
16 #include "intltest.h"
17 #include "csdetest.h"
18
19 #include "xmlparser.h"
20
21 #include <stdlib.h>
22 #include <string.h>
23
24 #ifdef DEBUG_DETECT
25 #include <stdio.h>
26 #endif
27
// Number of elements in a statically sized array. The argument and the whole
// expansion are parenthesized so the macro is safe inside larger expressions.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocation helpers built directly on malloc()/free() (the uprv_ variants are
// deliberately commented out). NEW_ARRAY is fully parenthesized so that the cast
// applies to the malloc() result even when the macro is followed by a postfix
// operator or used as a sizeof operand.
#define NEW_ARRAY(type,count) ((type *) /*uprv_*/malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Code points used as separators when splitting encoding lists.
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Reports a test failure (with file and line) when the condition is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Reports a failure and RETURNS from the enclosing function when errcode is a
// failure code. Callers must not hold resources that need manual release.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
42
43
44 //---------------------------------------------------------------------------
45 //
46 // Test class boilerplate
47 //
48 //---------------------------------------------------------------------------
CharsetDetectionTest()49 CharsetDetectionTest::CharsetDetectionTest()
50 {
51 }
52
53
~CharsetDetectionTest()54 CharsetDetectionTest::~CharsetDetectionTest()
55 {
56 }
57
58
59
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
61 {
62 if (exec) logln("TestSuite CharsetDetectionTest: ");
63 switch (index) {
64 case 0: name = "ConstructionTest";
65 if (exec) ConstructionTest();
66 break;
67
68 case 1: name = "UTF8Test";
69 if (exec) UTF8Test();
70 break;
71
72 case 2: name = "UTF16Test";
73 if (exec) UTF16Test();
74 break;
75
76 case 3: name = "C1BytesTest";
77 if (exec) C1BytesTest();
78 break;
79
80 case 4: name = "InputFilterTest";
81 if (exec) InputFilterTest();
82 break;
83
84 case 5: name = "DetectionTest";
85 if (exec) DetectionTest();
86 break;
87 #if !UCONFIG_NO_LEGACY_CONVERSION
88 case 6: name = "IBM424Test";
89 if (exec) IBM424Test();
90 break;
91
92 case 7: name = "IBM420Test";
93 if (exec) IBM420Test();
94 break;
95 #else
96 case 6:
97 case 7: name = "skip"; break;
98 #endif
99 case 8: name = "Ticket6394Test";
100 if (exec) Ticket6394Test();
101 break;
102
103 case 9: name = "Ticket6954Test";
104 if (exec) Ticket6954Test();
105 break;
106
107 default: name = "";
108 break; //needed to end loop
109 }
110 }
111
/**
 * Splits src at every occurrence of ch.
 *
 * @param src     the string to split.
 * @param ch      the delimiter code unit; it is not included in the pieces.
 * @param splits  receives the number of pieces (delimiter count + 1, so always >= 1).
 * @return an array of `splits` strings allocated with new[]; the caller must
 *         release it with delete[].
 */
static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
{
    // Pass 1: count delimiters so the result array can be sized exactly.
    splits = 1;
    for (int32_t hit = src.indexOf(ch, 0); hit >= 0; hit = src.indexOf(ch, hit + 1)) {
        splits += 1;
    }

    UnicodeString *pieces = new UnicodeString[splits];

    // Pass 2: copy out the text between consecutive delimiters.
    int32_t pieceStart = 0;
    int32_t pieceIndex = 0;
    for (int32_t delim = src.indexOf(ch, pieceStart);
         delim >= 0;
         delim = src.indexOf(ch, pieceStart)) {
        src.extractBetween(pieceStart, delim, pieces[pieceIndex]);
        pieceIndex += 1;
        pieceStart = delim + 1;
    }

    // Whatever follows the last delimiter (possibly empty) is the final piece.
    src.extractBetween(pieceStart, src.length(), pieces[pieceIndex]);

    return pieces;
}
136
extractBytes(const UnicodeString & source,const char * codepage,int32_t & length)137 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
138 {
139 int32_t sLength = source.length();
140 char *bytes = NULL;
141
142 length = source.extract(0, sLength, NULL, codepage);
143
144 if (length > 0) {
145 bytes = NEW_ARRAY(char, length + 1);
146 source.extract(0, sLength, bytes, codepage);
147 }
148
149 return bytes;
150 }
151
freeBytes(char * bytes)152 static void freeBytes(char *bytes)
153 {
154 DELETE_ARRAY(bytes);
155 }
156
checkEncoding(const UnicodeString & testString,const UnicodeString & encoding,const UnicodeString & id)157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
158 {
159 int32_t splits = 0;
160 int32_t testLength = testString.length();
161 UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
162 UErrorCode status = U_ZERO_ERROR;
163 int32_t cpLength = eSplit[0].length();
164 char codepage[64];
165
166 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
167 codepage[cpLength] = '\0';
168
169 LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
170
171 int32_t byteLength = 0;
172 char *bytes = extractBytes(testString, codepage, byteLength);
173
174 if (bytes == NULL) {
175 #if !UCONFIG_NO_LEGACY_CONVERSION
176 dataerrln("Can't open a " + encoding + " converter for " + id);
177 #endif
178 return;
179 }
180
181 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
182
183 int32_t matchCount = 0;
184 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
185
186
187 UnicodeString name(ucsdet_getName(matches[0], &status));
188 UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
189 UChar *decoded = NULL;
190 int32_t dLength = 0;
191
192 if (matchCount == 0) {
193 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
194 goto bail;
195 }
196
197 if (name.compare(eSplit[0]) != 0) {
198 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
199
200 #ifdef DEBUG_DETECT
201 for (int32_t m = 0; m < matchCount; m += 1) {
202 const char *name = ucsdet_getName(matches[m], &status);
203 const char *lang = ucsdet_getLanguage(matches[m], &status);
204 int32_t confidence = ucsdet_getConfidence(matches[m], &status);
205
206 printf("%s (%s) %d\n", name, lang, confidence);
207 }
208 #endif
209 goto bail;
210 }
211
212 if (splits > 1 && lang.compare(eSplit[1]) != 0) {
213 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
214 goto bail;
215 }
216
217 decoded = NEW_ARRAY(UChar, testLength);
218 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
219
220 if (testString.compare(decoded, dLength) != 0) {
221 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
222
223 #ifdef DEBUG_DETECT
224 for(int32_t i = 0; i < testLength; i += 1) {
225 if(testString[i] != decoded[i]) {
226 printf("Strings differ at byte %d\n", i);
227 break;
228 }
229 }
230 #endif
231
232 }
233
234 DELETE_ARRAY(decoded);
235
236 bail:
237 freeBytes(bytes);
238 delete[] eSplit;
239 }
240
getPath(char buffer[2048],const char * filename)241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
242 UErrorCode status = U_ZERO_ERROR;
243 const char *testDataDirectory = IntlTest::getSourceTestData(status);
244
245 if (U_FAILURE(status)) {
246 errln("ERROR: getPath() failed - %s", u_errorName(status));
247 return NULL;
248 }
249
250 strcpy(buffer, testDataDirectory);
251 strcat(buffer, filename);
252 return buffer;
253 }
254
ConstructionTest()255 void CharsetDetectionTest::ConstructionTest()
256 {
257 IcuTestErrorCode status(*this, "ConstructionTest");
258 LocalUCharsetDetectorPointer csd(ucsdet_open(status));
259 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
260 int32_t count = uenum_count(e.getAlias(), status);
261
262 #ifdef DEBUG_DETECT
263 printf("There are %d recognizers.\n", count);
264 #endif
265
266 for(int32_t i = 0; i < count; i += 1) {
267 int32_t length;
268 const char *name = uenum_next(e.getAlias(), &length, status);
269
270 if(name == NULL || length <= 0) {
271 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
272 }
273
274 #ifdef DEBUG_DETECT
275 printf("%s\n", name);
276 #endif
277 }
278
279 const char* defDisabled[] = {
280 "IBM420_rtl", "IBM420_ltr",
281 "IBM424_rtl", "IBM424_ltr",
282 0
283 };
284
285 LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
286 const char *activeName = NULL;
287
288 while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
289 // the charset must be included in all list
290 UBool found = FALSE;
291
292 const char *name = NULL;
293 uenum_reset(e.getAlias(), status);
294 while ((name = uenum_next(e.getAlias(), NULL, status))) {
295 if (strcmp(activeName, name) == 0) {
296 found = TRUE;
297 break;
298 }
299 }
300
301 if (!found) {
302 errln(UnicodeString(activeName) + " is not included in the all charset list.");
303 }
304
305 // some charsets are disabled by default
306 found = FALSE;
307 for (int32_t i = 0; defDisabled[i] != 0; i++) {
308 if (strcmp(activeName, defDisabled[i]) == 0) {
309 found = TRUE;
310 break;
311 }
312 }
313 if (found) {
314 errln(UnicodeString(activeName) + " should not be included in the default charset list.");
315 }
316 }
317 }
318
UTF8Test()319 void CharsetDetectionTest::UTF8Test()
320 {
321 UErrorCode status = U_ZERO_ERROR;
322 UnicodeString ss = "This is a string with some non-ascii characters that will "
323 "be converted to UTF-8, then shoved through the detection process. "
324 "\\u0391\\u0392\\u0393\\u0394\\u0395"
325 "Sure would be nice if our source could contain Unicode directly!";
326 UnicodeString s = ss.unescape();
327 int32_t byteLength = 0, sLength = s.length();
328 char *bytes = extractBytes(s, "UTF-8", byteLength);
329 UCharsetDetector *csd = ucsdet_open(&status);
330 const UCharsetMatch *match;
331 UChar *detected = NEW_ARRAY(UChar, sLength);
332
333 ucsdet_setText(csd, bytes, byteLength, &status);
334 match = ucsdet_detect(csd, &status);
335
336 if (match == NULL) {
337 errln("Detection failure for UTF-8: got no matches.");
338 goto bail;
339 }
340
341 ucsdet_getUChars(match, detected, sLength, &status);
342
343 if (s.compare(detected, sLength) != 0) {
344 errln("Round-trip test failed!");
345 }
346
347 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
348
349 bail:
350 DELETE_ARRAY(detected);
351 freeBytes(bytes);
352 ucsdet_close(csd);
353 }
354
UTF16Test()355 void CharsetDetectionTest::UTF16Test()
356 {
357 UErrorCode status = U_ZERO_ERROR;
358 /* Notice the BOM on the start of this string */
359 UChar chars[] = {
360 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
361 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
362 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
363 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
364 0x064a, 0x062a, 0x0000};
365 UnicodeString s(chars);
366 int32_t beLength = 0, leLength = 0;
367 char *beBytes = extractBytes(s, "UTF-16BE", beLength);
368 char *leBytes = extractBytes(s, "UTF-16LE", leLength);
369 UCharsetDetector *csd = ucsdet_open(&status);
370 const UCharsetMatch *match;
371 const char *name;
372 int32_t conf;
373
374 ucsdet_setText(csd, beBytes, beLength, &status);
375 match = ucsdet_detect(csd, &status);
376
377 if (match == NULL) {
378 errln("Encoding detection failure for UTF-16BE: got no matches.");
379 goto try_le;
380 }
381
382 name = ucsdet_getName(match, &status);
383 conf = ucsdet_getConfidence(match, &status);
384
385 if (strcmp(name, "UTF-16BE") != 0) {
386 errln("Encoding detection failure for UTF-16BE: got %s", name);
387 goto try_le; // no point in looking at confidence if we got the wrong character set.
388 }
389
390 if (conf != 100) {
391 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
392 }
393
394 try_le:
395 ucsdet_setText(csd, leBytes, leLength, &status);
396 match = ucsdet_detect(csd, &status);
397
398 if (match == NULL) {
399 errln("Encoding detection failure for UTF-16LE: got no matches.");
400 goto bail;
401 }
402
403 name = ucsdet_getName(match, &status);
404 conf = ucsdet_getConfidence(match, &status);
405
406
407 if (strcmp(name, "UTF-16LE") != 0) {
408 errln("Enconding detection failure for UTF-16LE: got %s", name);
409 goto bail; // no point in looking at confidence if we got the wrong character set.
410 }
411
412 if (conf != 100) {
413 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
414 }
415
416 bail:
417 freeBytes(leBytes);
418 freeBytes(beBytes);
419 ucsdet_close(csd);
420 }
421
InputFilterTest()422 void CharsetDetectionTest::InputFilterTest()
423 {
424 UErrorCode status = U_ZERO_ERROR;
425 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
426 UnicodeString s = ss.unescape();
427 int32_t byteLength = 0;
428 char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
429 UCharsetDetector *csd = ucsdet_open(&status);
430 const UCharsetMatch *match;
431 const char *lang, *name;
432
433 ucsdet_enableInputFilter(csd, TRUE);
434
435 if (!ucsdet_isInputFilterEnabled(csd)) {
436 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
437 }
438
439
440 ucsdet_setText(csd, bytes, byteLength, &status);
441 match = ucsdet_detect(csd, &status);
442
443 if (match == NULL) {
444 errln("Turning on the input filter resulted in no matches.");
445 goto turn_off;
446 }
447
448 name = ucsdet_getName(match, &status);
449
450 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
451 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
452 } else {
453 lang = ucsdet_getLanguage(match, &status);
454
455 if (lang == NULL || strcmp(lang, "fr") != 0) {
456 errln("Input filter did not strip markup!");
457 }
458 }
459
460 turn_off:
461 ucsdet_enableInputFilter(csd, FALSE);
462 ucsdet_setText(csd, bytes, byteLength, &status);
463 match = ucsdet_detect(csd, &status);
464
465 if (match == NULL) {
466 errln("Turning off the input filter resulted in no matches.");
467 goto bail;
468 }
469
470 name = ucsdet_getName(match, &status);
471
472 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
473 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
474 } else {
475 lang = ucsdet_getLanguage(match, &status);
476
477 if (lang == NULL || strcmp(lang, "en") != 0) {
478 errln("Unfiltered input did not detect as English!");
479 }
480 }
481
482 bail:
483 freeBytes(bytes);
484 ucsdet_close(csd);
485 }
486
C1BytesTest()487 void CharsetDetectionTest::C1BytesTest()
488 {
489 #if !UCONFIG_NO_LEGACY_CONVERSION
490 UErrorCode status = U_ZERO_ERROR;
491 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
492 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
493 UnicodeString sWindows = ssWindows.unescape();
494 int32_t lISO = 0, lWindows = 0;
495 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
496 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
497 UCharsetDetector *csd = ucsdet_open(&status);
498 const UCharsetMatch *match;
499 const char *name;
500
501 ucsdet_setText(csd, bWindows, lWindows, &status);
502 match = ucsdet_detect(csd, &status);
503
504 if (match == NULL) {
505 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
506 goto bail;
507 }
508
509 name = ucsdet_getName(match, &status);
510
511 if (strcmp(name, "windows-1252") != 0) {
512 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
513 }
514
515 ucsdet_setText(csd, bISO, lISO, &status);
516 match = ucsdet_detect(csd, &status);
517
518 if (match == NULL) {
519 errln("English text without C1 bytes got no matches.");
520 goto bail;
521 }
522
523 name = ucsdet_getName(match, &status);
524
525 if (strcmp(name, "ISO-8859-1") != 0) {
526 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
527 }
528
529 bail:
530 freeBytes(bWindows);
531 freeBytes(bISO);
532
533 ucsdet_close(csd);
534 #endif
535 }
536
DetectionTest()537 void CharsetDetectionTest::DetectionTest()
538 {
539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
540 UErrorCode status = U_ZERO_ERROR;
541 char path[2048];
542 const char *testFilePath = getPath(path, "csdetest.xml");
543
544 if (testFilePath == NULL) {
545 return; /* Couldn't get path: error message already output. */
546 }
547
548 UXMLParser *parser = UXMLParser::createParser(status);
549 if (U_FAILURE(status)) {
550 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
551 return;
552 }
553
554 UXMLElement *root = parser->parseFile(testFilePath, status);
555 if (!assertSuccess( "parseFile",status)) return;
556
557 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
558 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id");
559 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings");
560
561 const UXMLElement *testCase;
562 int32_t tc = 0;
563
564 while((testCase = root->nextChildElement(tc)) != NULL) {
565 if (testCase->getTagName().compare(test_case) == 0) {
566 const UnicodeString *id = testCase->getAttribute(id_attr);
567 const UnicodeString *encodings = testCase->getAttribute(enc_attr);
568 const UnicodeString text = testCase->getText(TRUE);
569 int32_t encodingCount;
570 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
571
572 for(int32_t e = 0; e < encodingCount; e += 1) {
573 checkEncoding(text, encodingList[e], *id);
574 }
575
576 delete[] encodingList;
577 }
578 }
579
580 delete root;
581 delete parser;
582 #endif
583 }
584
IBM424Test()585 void CharsetDetectionTest::IBM424Test()
586 {
587 UErrorCode status = U_ZERO_ERROR;
588
589 static const UChar chars[] = {
590 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
591 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
592 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
593 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
594 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
595 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
596 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
597 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
598 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
599 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
600 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
601 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
602 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
603 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
604 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
605 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
606 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
607 };
608
609 static const UChar chars_reverse[] = {
610 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
611 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
612 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
613 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
614 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
615 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
616 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
617 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
618 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
619 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
620 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
621 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
622 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
623 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
624 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
625 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
626 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
627 0x0000
628 };
629
630 int32_t bLength = 0, brLength = 0;
631
632 UnicodeString s1(chars);
633 UnicodeString s2(chars_reverse);
634
635 char *bytes = extractBytes(s1, "IBM424", bLength);
636 char *bytes_r = extractBytes(s2, "IBM424", brLength);
637
638 UCharsetDetector *csd = ucsdet_open(&status);
639 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
640 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
641 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
642 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
643 if (U_FAILURE(status)) {
644 errln("Error opening charset detector. - %s", u_errorName(status));
645 }
646 const UCharsetMatch *match;
647 const char *name;
648
649 ucsdet_setText(csd, bytes, bLength, &status);
650 match = ucsdet_detect(csd, &status);
651
652 if (match == NULL) {
653 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
654 goto bail;
655 }
656
657 name = ucsdet_getName(match, &status);
658 if (strcmp(name, "IBM424_rtl") != 0) {
659 errln("Encoding detection failure for IBM424_rtl: got %s", name);
660 }
661
662 ucsdet_setText(csd, bytes_r, brLength, &status);
663 match = ucsdet_detect(csd, &status);
664
665 if (match == NULL) {
666 errln("Encoding detection failure for IBM424_ltr: got no matches.");
667 goto bail;
668 }
669
670 name = ucsdet_getName(match, &status);
671 if (strcmp(name, "IBM424_ltr") != 0) {
672 errln("Encoding detection failure for IBM424_ltr: got %s", name);
673 }
674
675 bail:
676 freeBytes(bytes);
677 freeBytes(bytes_r);
678 ucsdet_close(csd);
679 }
680
IBM420Test()681 void CharsetDetectionTest::IBM420Test()
682 {
683 UErrorCode status = U_ZERO_ERROR;
684
685 static const UChar chars[] = {
686 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
687 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
688 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
689 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
690 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
691 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
692 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
693 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
694 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
695 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
696 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
697 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
698 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
699 0x0000
700 };
701 static const UChar chars_reverse[] = {
702 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
703 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
704 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
705 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
706 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
707 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
708 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
709 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
710 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
711 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
712 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
713 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
714 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
715 0x0000,
716 };
717
718 int32_t bLength = 0, brLength = 0;
719
720 UnicodeString s1(chars);
721 UnicodeString s2(chars_reverse);
722
723 char *bytes = extractBytes(s1, "IBM420", bLength);
724 char *bytes_r = extractBytes(s2, "IBM420", brLength);
725
726 UCharsetDetector *csd = ucsdet_open(&status);
727 if (U_FAILURE(status)) {
728 errln("Error opening charset detector. - %s", u_errorName(status));
729 }
730 ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
731 ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
732 ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
733 ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
734 const UCharsetMatch *match;
735 const char *name;
736
737 ucsdet_setText(csd, bytes, bLength, &status);
738 match = ucsdet_detect(csd, &status);
739
740 if (match == NULL) {
741 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
742 goto bail;
743 }
744
745 name = ucsdet_getName(match, &status);
746 if (strcmp(name, "IBM420_rtl") != 0) {
747 errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
748 }
749
750 ucsdet_setText(csd, bytes_r, brLength, &status);
751 match = ucsdet_detect(csd, &status);
752
753 if (match == NULL) {
754 errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
755 goto bail;
756 }
757
758 name = ucsdet_getName(match, &status);
759 if (strcmp(name, "IBM420_ltr") != 0) {
760 errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
761 }
762
763 bail:
764 freeBytes(bytes);
765 freeBytes(bytes_r);
766 ucsdet_close(csd);
767 }
768
769
Ticket6394Test()770 void CharsetDetectionTest::Ticket6394Test() {
771 #if !UCONFIG_NO_CONVERSION
772 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1."
773 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
774 "encodings more than once. The hop through UnicodeString is for platforms "
775 "where this char * string is be EBCDIC and needs conversion to Latin1.";
776 char latin1Text[sizeof(charText)];
777 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
778
779 UErrorCode status = U_ZERO_ERROR;
780 UCharsetDetector *csd = ucsdet_open(&status);
781 ucsdet_setText(csd, latin1Text, -1, &status);
782 if (U_FAILURE(status)) {
783 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
784 return;
785 }
786
787 int32_t matchCount = 0;
788 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
789 if (U_FAILURE(status)) {
790 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status));
791 return;
792 }
793
794 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
795 int32_t i;
796 for (i=0; i<matchCount; i++) {
797 UnicodeString charSetName(ucsdet_getName(matches[i], &status));
798 if (U_FAILURE(status)) {
799 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i);
800 status = U_ZERO_ERROR;
801 }
802 if (setOfCharsetNames.contains(charSetName)) {
803 errln("Fail at file %s, line %d ", __FILE__, __LINE__);
804 errln(UnicodeString(" Duplicate charset name = ") + charSetName);
805 }
806 setOfCharsetNames.add(charSetName);
807 }
808 ucsdet_close(csd);
809 #endif
810 }
811
812
813 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
814 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
815 // Charset Recognizer objects, and could be overwritten.
Ticket6954Test()816 void CharsetDetectionTest::Ticket6954Test() {
817 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_FORMATTING
818 UErrorCode status = U_ZERO_ERROR;
819 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
820 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
821 "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
822 UnicodeString sWindows = ssWindows.unescape();
823 int32_t lISO = 0, lWindows = 0;
824 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
825 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
826
827 // First do a plain vanilla detect of 1252 text
828
829 UCharsetDetector *csd1 = ucsdet_open(&status);
830 ucsdet_setText(csd1, bWindows, lWindows, &status);
831 const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
832 const char *name1 = ucsdet_getName(match1, &status);
833 TEST_ASSERT_SUCCESS(status);
834 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
835
836 // Next, using a completely separate detector, detect some 8859-1 text
837
838 UCharsetDetector *csd2 = ucsdet_open(&status);
839 ucsdet_setText(csd2, bISO, lISO, &status);
840 const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
841 const char *name2 = ucsdet_getName(match2, &status);
842 TEST_ASSERT_SUCCESS(status);
843 TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
844
845 // Recheck the 1252 results from the first detector, which should not have been
846 // altered by the use of a different detector.
847
848 name1 = ucsdet_getName(match1, &status);
849 TEST_ASSERT_SUCCESS(status);
850 TEST_ASSERT(strcmp(name1, "windows-1252")==0);
851
852 ucsdet_close(csd1);
853 ucsdet_close(csd2);
854 freeBytes(bISO);
855 freeBytes(bWindows);
856 #endif
857 }
858