1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ****************************************************************************
5 * Copyright (c) 2005-2016, International Business Machines Corporation and *
6 * others. All Rights Reserved. *
7 ****************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/ustring.h"
15
16 #include "cintltst.h"
17 #include "cmemory.h"
18
19 #include <stdlib.h>
20 #include <string.h>
21
/*
 * Allocate an uninitialized array of `count` elements of `type` with malloc().
 * The whole expansion is parenthesized so that the result can be used safely
 * inside larger expressions (e.g. NEW_ARRAY(char, n)[0]).
 * Evaluates to NULL when malloc() fails; release with DELETE_ARRAY().
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))

/* Release an array obtained from NEW_ARRAY(); passing NULL is a no-op. */
#define DELETE_ARRAY(array) free(array)
24
25 static void TestConstruction(void);
26 static void TestUTF8(void);
27 static void TestUTF16(void);
28 static void TestC1Bytes(void);
29 static void TestInputFilter(void);
30 static void TestChaining(void);
31 static void TestBufferOverflow(void);
32 static void TestIBM424(void);
33 static void TestIBM420(void);
34
35 void addUCsdetTest(TestNode** root);
36
/*
 * Register every charset-detector test under the "ucsdetst/" path of the
 * test tree rooted at `root`.  The IBM424/IBM420 tests are registered only
 * when legacy conversion is compiled in.
 */
void addUCsdetTest(TestNode** root)
{
    /* Registration table: test function paired with its path in the tree. */
    static const struct {
        void (*fn)(void);
        const char *path;
    } registrations[] = {
        { &TestConstruction,   "ucsdetst/TestConstruction" },
        { &TestUTF8,           "ucsdetst/TestUTF8" },
        { &TestUTF16,          "ucsdetst/TestUTF16" },
        { &TestC1Bytes,        "ucsdetst/TestC1Bytes" },
        { &TestInputFilter,    "ucsdetst/TestInputFilter" },
        { &TestChaining,       "ucsdetst/TestErrorChaining" },
        { &TestBufferOverflow, "ucsdetst/TestBufferOverflow" },
#if !UCONFIG_NO_LEGACY_CONVERSION
        { &TestIBM424,         "ucsdetst/TestIBM424" },
        { &TestIBM420,         "ucsdetst/TestIBM420" },
#endif
    };
    int32_t which;

    /* Entries are added in table order, which matches the listing above. */
    for (which = 0; which < UPRV_LENGTHOF(registrations); ++which) {
        addTest(root, registrations[which].fn, registrations[which].path);
    }
}
51
/*
 * Count how many bytes `cnv` produces for the `length` UChars at `src`.
 *
 * The text is converted repeatedly into a fixed 1024-byte scratch buffer;
 * each pass adds the bytes it produced to the total, and the loop continues
 * only while the converter reports U_BUFFER_OVERFLOW_ERROR.  Any other
 * status (success or a different error) ends the count.
 *
 * Returns the accumulated byte count.
 */
static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
{
    char scratch[1024];
    char *const scratchEnd = scratch + sizeof(scratch);
    const UChar *const srcEnd = src + length;
    int32_t total = 0;
    UErrorCode ec;

    for (;;) {
        char *out = scratch;

        /* Start every pass with a clean status so overflow is detected anew. */
        ec = U_ZERO_ERROR;
        ucnv_fromUnicode(cnv, &out, scratchEnd, &src, srcEnd, NULL, TRUE, &ec);
        total += (int32_t) (out - scratch);

        if (ec != U_BUFFER_OVERFLOW_ERROR) {
            break;
        }
    }

    return total;
}
69
/*
 * Convert `length` UChars starting at `src` into the charset named by
 * `codepage` and return the bytes in a freshly allocated buffer.
 *
 * @param src        UTF-16 text to convert.
 * @param length     number of UChars at `src`.
 * @param codepage   converter name passed to ucnv_open().
 * @param byteLength receives the number of converted bytes (as computed by
 *                   preflight()); set to 0 when allocation fails.
 * @return           buffer of *byteLength bytes followed by one NUL byte, or
 *                   NULL if the allocation failed.  The caller releases it
 *                   with freeBytes().
 *
 * If the converter cannot be opened, preflight() counts 0 bytes and an empty
 * (NUL-only) buffer is returned with *byteLength == 0.
 */
static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *cnv = ucnv_open(codepage, &status);
    int32_t byteCount = preflight(src, length, cnv);
    const UChar *srcLimit = src + length;
    char *bytes = NEW_ARRAY(char, byteCount + 1);
    char *dest = bytes;

    if (bytes == NULL) {
        /* Out of memory: avoid pointer arithmetic on NULL and report nothing. */
        ucnv_close(cnv);
        *byteLength = 0;
        return NULL;
    }

    ucnv_fromUnicode(cnv, &dest, bytes + byteCount + 1, &src, srcLimit, NULL, TRUE, &status);
    ucnv_close(cnv);

    /* The extra byte was reserved by the allocation; make it a terminator
     * so the buffer never ends in uninitialized memory. */
    bytes[byteCount] = 0;

    *byteLength = byteCount;
    return bytes;
}
85
/*
 * Release a byte buffer previously returned by extractBytes().
 * Simply hands the pointer to DELETE_ARRAY().
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
90
TestConstruction(void)91 static void TestConstruction(void)
92 {
93 UErrorCode status = U_ZERO_ERROR;
94 UCharsetDetector *csd = ucsdet_open(&status);
95 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
96 const char *name;
97 int32_t count = uenum_count(e, &status);
98 int32_t i, length;
99
100 for(i = 0; i < count; i += 1) {
101 name = uenum_next(e, &length, &status);
102
103 if(name == NULL || length <= 0) {
104 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
105 }
106 }
107 /* one past the list of all names must return NULL */
108 name = uenum_next(e, &length, &status);
109 if(name != NULL || length != 0 || U_FAILURE(status)) {
110 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
111 }
112
113 uenum_close(e);
114 ucsdet_close(csd);
115 }
116
TestUTF8(void)117 static void TestUTF8(void)
118 {
119 UErrorCode status = U_ZERO_ERROR;
120 static const char ss[] = "This is a string with some non-ascii characters that will "
121 "be converted to UTF-8, then shoved through the detection process. "
122 "\\u0391\\u0392\\u0393\\u0394\\u0395"
123 "Sure would be nice if our source could contain Unicode directly!";
124 int32_t byteLength = 0, sLength = 0, dLength = 0;
125 UChar s[sizeof(ss)];
126 char *bytes;
127 UCharsetDetector *csd = ucsdet_open(&status);
128 const UCharsetMatch *match;
129 UChar detected[sizeof(ss)];
130
131 sLength = u_unescape(ss, s, sizeof(ss));
132 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
133
134 ucsdet_setText(csd, bytes, byteLength, &status);
135 if (U_FAILURE(status)) {
136 log_err("status is %s\n", u_errorName(status));
137 goto bail;
138 }
139
140 match = ucsdet_detect(csd, &status);
141
142 if (match == NULL) {
143 log_err("Detection failure for UTF-8: got no matches.\n");
144 goto bail;
145 }
146
147 dLength = ucsdet_getUChars(match, detected, sLength, &status);
148
149 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
150 log_err("Round-trip test failed!\n");
151 }
152
153 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
154
155 bail:
156 freeBytes(bytes);
157 ucsdet_close(csd);
158 }
159
TestUTF16(void)160 static void TestUTF16(void)
161 {
162 UErrorCode status = U_ZERO_ERROR;
163 /* Notice the BOM on the start of this string */
164 static const UChar chars[] = {
165 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
166 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
167 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
168 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
169 0x064a, 0x062a, 0x0000};
170 int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
171 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
172 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
173 UCharsetDetector *csd = ucsdet_open(&status);
174 const UCharsetMatch *match;
175 const char *name;
176 int32_t conf;
177
178 ucsdet_setText(csd, beBytes, beLength, &status);
179 match = ucsdet_detect(csd, &status);
180
181 if (match == NULL) {
182 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
183 goto try_le;
184 }
185
186 name = ucsdet_getName(match, &status);
187 conf = ucsdet_getConfidence(match, &status);
188
189 if (strcmp(name, "UTF-16BE") != 0) {
190 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
191 }
192
193 if (conf != 100) {
194 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
195 }
196
197 try_le:
198 ucsdet_setText(csd, leBytes, leLength, &status);
199 match = ucsdet_detect(csd, &status);
200
201 if (match == NULL) {
202 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
203 goto bail;
204 }
205
206 name = ucsdet_getName(match, &status);
207 conf = ucsdet_getConfidence(match, &status);
208
209
210 if (strcmp(name, "UTF-16LE") != 0) {
211 log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
212 }
213
214 if (conf != 100) {
215 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
216 }
217
218 bail:
219 freeBytes(leBytes);
220 freeBytes(beBytes);
221 ucsdet_close(csd);
222 }
223
TestC1Bytes(void)224 static void TestC1Bytes(void)
225 {
226 #if !UCONFIG_NO_LEGACY_CONVERSION
227 UErrorCode status = U_ZERO_ERROR;
228 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
229 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
230 int32_t sISOLength = 0, sWindowsLength = 0;
231 UChar sISO[sizeof(ssISO)];
232 UChar sWindows[sizeof(ssWindows)];
233 int32_t lISO = 0, lWindows = 0;
234 char *bISO;
235 char *bWindows;
236 UCharsetDetector *csd = ucsdet_open(&status);
237 const UCharsetMatch *match;
238 const char *name;
239
240 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
241 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
242 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
243 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
244
245 ucsdet_setText(csd, bWindows, lWindows, &status);
246 match = ucsdet_detect(csd, &status);
247
248 if (match == NULL) {
249 log_err("English test with C1 bytes got no matches.\n");
250 goto bail;
251 }
252
253 name = ucsdet_getName(match, &status);
254
255 if (strcmp(name, "windows-1252") != 0) {
256 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
257 }
258
259 ucsdet_setText(csd, bISO, lISO, &status);
260 match = ucsdet_detect(csd, &status);
261
262 if (match == NULL) {
263 log_err("English text without C1 bytes got no matches.\n");
264 goto bail;
265 }
266
267 name = ucsdet_getName(match, &status);
268
269 if (strcmp(name, "ISO-8859-1") != 0) {
270 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
271 }
272
273 bail:
274 freeBytes(bWindows);
275 freeBytes(bISO);
276
277 ucsdet_close(csd);
278 #endif
279 }
280
TestInputFilter(void)281 static void TestInputFilter(void)
282 {
283 UErrorCode status = U_ZERO_ERROR;
284 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
285 int32_t sLength = 0;
286 UChar s[sizeof(ss)];
287 int32_t byteLength = 0;
288 char *bytes;
289 UCharsetDetector *csd = ucsdet_open(&status);
290 const UCharsetMatch *match;
291 const char *lang, *name;
292
293 sLength = u_unescape(ss, s, sizeof(ss));
294 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
295
296 ucsdet_enableInputFilter(csd, TRUE);
297
298 if (!ucsdet_isInputFilterEnabled(csd)) {
299 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
300 }
301
302
303 ucsdet_setText(csd, bytes, byteLength, &status);
304 match = ucsdet_detect(csd, &status);
305
306 if (match == NULL) {
307 log_err("Turning on the input filter resulted in no matches.\n");
308 goto turn_off;
309 }
310
311 name = ucsdet_getName(match, &status);
312
313 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
314 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
315 } else {
316 lang = ucsdet_getLanguage(match, &status);
317
318 if (lang == NULL || strcmp(lang, "fr") != 0) {
319 log_err("Input filter did not strip markup!\n");
320 }
321 }
322
323 turn_off:
324 ucsdet_enableInputFilter(csd, FALSE);
325 ucsdet_setText(csd, bytes, byteLength, &status);
326 match = ucsdet_detect(csd, &status);
327
328 if (match == NULL) {
329 log_err("Turning off the input filter resulted in no matches.\n");
330 goto bail;
331 }
332
333 name = ucsdet_getName(match, &status);
334
335 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
336 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
337 } else {
338 lang = ucsdet_getLanguage(match, &status);
339
340 if (lang == NULL || strcmp(lang, "en") != 0) {
341 log_err("Unfiltered input did not detect as English!\n");
342 }
343 }
344
345 bail:
346 freeBytes(bytes);
347 ucsdet_close(csd);
348 }
349
TestChaining(void)350 static void TestChaining(void) {
351 UErrorCode status = U_USELESS_COLLATOR_ERROR;
352
353 ucsdet_open(&status);
354 ucsdet_setText(NULL, NULL, 0, &status);
355 ucsdet_getName(NULL, &status);
356 ucsdet_getConfidence(NULL, &status);
357 ucsdet_getLanguage(NULL, &status);
358 ucsdet_detect(NULL, &status);
359 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
360 ucsdet_detectAll(NULL, NULL, &status);
361 ucsdet_getUChars(NULL, NULL, 0, &status);
362 ucsdet_getUChars(NULL, NULL, 0, &status);
363 ucsdet_close(NULL);
364
365 /* All of this code should have done nothing. */
366 if (status != U_USELESS_COLLATOR_ERROR) {
367 log_err("Status got changed to %s\n", u_errorName(status));
368 }
369 }
370
TestBufferOverflow(void)371 static void TestBufferOverflow(void) {
372 UErrorCode status = U_ZERO_ERROR;
373 static const char *testStrings[] = {
374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
377 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
378 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
379 "\xa1", /* Could be a single byte shift-jis at the end */
380 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
381 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
382 };
383 static const char *testResults[] = {
384 "windows-1252",
385 "windows-1252",
386 "windows-1252",
387 "windows-1252",
388 "ISO-2022-JP",
389 NULL,
390 NULL,
391 "ISO-8859-1"
392 };
393 int32_t idx = 0;
394 UCharsetDetector *csd = ucsdet_open(&status);
395 const UCharsetMatch *match;
396
397 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
398
399 if (U_FAILURE(status)) {
400 log_err("Couldn't open detector. %s\n", u_errorName(status));
401 goto bail;
402 }
403
404 for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
405 ucsdet_setText(csd, testStrings[idx], -1, &status);
406 match = ucsdet_detect(csd, &status);
407
408 if (match == NULL) {
409 if (testResults[idx] != NULL) {
410 log_err("Unexpectedly got no results at index %d.\n", idx);
411 }
412 else {
413 log_verbose("Got no result as expected at index %d.\n", idx);
414 }
415 continue;
416 }
417
418 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
419 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
420 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
421 goto bail;
422 }
423 }
424
425 bail:
426 ucsdet_close(csd);
427 }
428
TestIBM424(void)429 static void TestIBM424(void)
430 {
431 UErrorCode status = U_ZERO_ERROR;
432
433 static const UChar chars[] = {
434 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
435 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
436 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
437 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
438 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
439 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
440 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
441 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
442 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
443 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
444 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
445 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
446 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
447 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
448 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
449 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
450 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
451 };
452
453 static const UChar chars_reverse[] = {
454 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
455 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
456 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
457 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
458 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
459 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
460 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
461 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
462 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
463 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
464 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
465 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
466 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
467 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
468 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
469 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
470 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
471 0x0000
472 };
473
474 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
475
476 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
477 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
478
479 UCharsetDetector *csd = ucsdet_open(&status);
480 const UCharsetMatch *match;
481 const char *name;
482
483 ucsdet_setText(csd, bytes, bLength, &status);
484 match = ucsdet_detect(csd, &status);
485
486 if (match == NULL) {
487 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
488 goto bail;
489 }
490
491 name = ucsdet_getName(match, &status);
492 if (strcmp(name, "IBM424_rtl") != 0) {
493 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
494 }
495
496 ucsdet_setText(csd, bytes_r, brLength, &status);
497 match = ucsdet_detect(csd, &status);
498
499 if (match == NULL) {
500 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
501 goto bail;
502 }
503
504 name = ucsdet_getName(match, &status);
505 if (strcmp(name, "IBM424_ltr") != 0) {
506 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
507 }
508
509 bail:
510 freeBytes(bytes);
511 freeBytes(bytes_r);
512 ucsdet_close(csd);
513 }
514
TestIBM420(void)515 static void TestIBM420(void)
516 {
517 UErrorCode status = U_ZERO_ERROR;
518
519 static const UChar chars[] = {
520 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
521 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
522 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
523 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
524 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
525 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
526 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
527 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
528 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
529 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
530 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
531 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
532 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
533 0x0000
534 };
535 static const UChar chars_reverse[] = {
536 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
537 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
538 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
539 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
540 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
541 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
542 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
543 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
544 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
545 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
546 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
547 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
548 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
549 0x0000,
550 };
551
552 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
553
554 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
555 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
556
557 UCharsetDetector *csd = ucsdet_open(&status);
558 const UCharsetMatch *match;
559 const char *name;
560
561 ucsdet_setText(csd, bytes, bLength, &status);
562 match = ucsdet_detect(csd, &status);
563
564 if (match == NULL) {
565 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
566 goto bail;
567 }
568
569 name = ucsdet_getName(match, &status);
570 if (strcmp(name, "IBM420_rtl") != 0) {
571 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
572 }
573
574 ucsdet_setText(csd, bytes_r, brLength, &status);
575 match = ucsdet_detect(csd, &status);
576
577 if (match == NULL) {
578 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
579 goto bail;
580 }
581
582 name = ucsdet_getName(match, &status);
583 if (strcmp(name, "IBM420_ltr") != 0) {
584 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
585 }
586
587 bail:
588 freeBytes(bytes);
589 freeBytes(bytes_r);
590 ucsdet_close(csd);
591 }
592