1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ****************************************************************************
5 * Copyright (c) 2005-2016, International Business Machines Corporation and *
6 * others. All Rights Reserved. *
7 ****************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/ustring.h"
15
16 #include "cintltst.h"
17 #include "cmemory.h"
18
19 #include <stdbool.h>
20 #include <stdlib.h>
21 #include <string.h>
22
/*
 * Allocate an uninitialized array of `count` elements of `type` with malloc().
 * The whole expansion is parenthesized so that it behaves as a single operand
 * inside larger expressions (for example when followed by a subscript).
 * Evaluates to NULL when the allocation fails.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))

/* Release an array obtained from NEW_ARRAY; a NULL pointer is accepted. */
#define DELETE_ARRAY(array) free(array)
25
26 static void TestConstruction(void);
27 static void TestUTF8(void);
28 static void TestUTF16(void);
29 static void TestC1Bytes(void);
30 static void TestInputFilter(void);
31 static void TestChaining(void);
32 static void TestBufferOverflow(void);
33 static void TestIBM424(void);
34 static void TestIBM420(void);
35
36 void addUCsdetTest(TestNode** root);
37
/*
 * Add every charset-detector test of this file to the test tree at `root`,
 * each under a "ucsdetst/..." path.
 *
 * The two IBM tests are only added when UCONFIG_NO_LEGACY_CONVERSION is off.
 * Tests are added in table order.
 */
void addUCsdetTest(TestNode** root)
{
    static const struct {
        void (*run)(void);
        const char *path;
    } entries[] = {
        { TestConstruction,   "ucsdetst/TestConstruction" },
        { TestUTF8,           "ucsdetst/TestUTF8" },
        { TestUTF16,          "ucsdetst/TestUTF16" },
        { TestC1Bytes,        "ucsdetst/TestC1Bytes" },
        { TestInputFilter,    "ucsdetst/TestInputFilter" },
        { TestChaining,       "ucsdetst/TestErrorChaining" },
        { TestBufferOverflow, "ucsdetst/TestBufferOverflow" },
#if !UCONFIG_NO_LEGACY_CONVERSION
        { TestIBM424,         "ucsdetst/TestIBM424" },
        { TestIBM420,         "ucsdetst/TestIBM420" },
#endif
    };
    size_t n;

    for (n = 0; n < sizeof(entries) / sizeof(entries[0]); n += 1) {
        addTest(root, entries[n].run, entries[n].path);
    }
}
52
/*
 * Count the bytes produced by converting src[0..length) with `cnv`.
 *
 * The text is converted repeatedly into a fixed 1024-byte scratch buffer
 * whose contents are discarded; only the number of bytes written per pass is
 * accumulated. Passes continue for as long as the converter reports
 * U_BUFFER_OVERFLOW_ERROR. Any other status ends the loop and is not
 * reported to the caller.
 *
 * Returns the accumulated byte count.
 */
static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
{
    char scratch[1024];
    char *const scratchEnd = scratch + sizeof(scratch);
    const UChar *const srcEnd = src + length;
    int32_t total = 0;

    for (;;) {
        /* Each pass starts with a clean status and an empty scratch buffer. */
        UErrorCode status = U_ZERO_ERROR;
        char *out = scratch;

        ucnv_fromUnicode(cnv, &out, scratchEnd, &src, srcEnd, 0, true, &status);
        total += (int32_t) (out - scratch);

        if (status != U_BUFFER_OVERFLOW_ERROR) {
            break;
        }
    }

    return total;
}
70
extractBytes(const UChar * src,int32_t length,const char * codepage,int32_t * byteLength)71 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
72 {
73 UErrorCode status = U_ZERO_ERROR;
74 UConverter *cnv = ucnv_open(codepage, &status);
75 int32_t byteCount = preflight(src, length, cnv);
76 const UChar *srcLimit = src + length;
77 char *bytes = NEW_ARRAY(char, byteCount + 1);
78 char *dest = bytes, *destLimit = bytes + byteCount + 1;
79
80 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, true, &status);
81 ucnv_close(cnv);
82
83 *byteLength = byteCount;
84 return bytes;
85 }
86
/* Release a byte buffer; counterpart of the NEW_ARRAY allocation in extractBytes(). */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
91
TestConstruction(void)92 static void TestConstruction(void)
93 {
94 UErrorCode status = U_ZERO_ERROR;
95 UCharsetDetector *csd = ucsdet_open(&status);
96 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
97 const char *name;
98 int32_t count = uenum_count(e, &status);
99 int32_t i, length;
100
101 for(i = 0; i < count; i += 1) {
102 name = uenum_next(e, &length, &status);
103
104 if(name == NULL || length <= 0) {
105 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
106 }
107 }
108 /* one past the list of all names must return NULL */
109 name = uenum_next(e, &length, &status);
110 if(name != NULL || length != 0 || U_FAILURE(status)) {
111 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
112 }
113
114 uenum_close(e);
115 ucsdet_close(csd);
116 }
117
TestUTF8(void)118 static void TestUTF8(void)
119 {
120 UErrorCode status = U_ZERO_ERROR;
121 static const char ss[] = "This is a string with some non-ascii characters that will "
122 "be converted to UTF-8, then shoved through the detection process. "
123 "\\u0391\\u0392\\u0393\\u0394\\u0395"
124 "Sure would be nice if our source could contain Unicode directly!";
125 int32_t byteLength = 0, sLength = 0, dLength = 0;
126 UChar s[sizeof(ss)];
127 char *bytes;
128 UCharsetDetector *csd = ucsdet_open(&status);
129 const UCharsetMatch *match;
130 UChar detected[sizeof(ss)];
131
132 sLength = u_unescape(ss, s, sizeof(ss));
133 bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
134
135 ucsdet_setText(csd, bytes, byteLength, &status);
136 if (U_FAILURE(status)) {
137 log_err("status is %s\n", u_errorName(status));
138 goto bail;
139 }
140
141 match = ucsdet_detect(csd, &status);
142
143 if (match == NULL) {
144 log_err("Detection failure for UTF-8: got no matches.\n");
145 goto bail;
146 }
147
148 dLength = ucsdet_getUChars(match, detected, sLength, &status);
149
150 if (u_strCompare(detected, dLength, s, sLength, false) != 0) {
151 log_err("Round-trip test failed!\n");
152 }
153
154 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
155
156 bail:
157 freeBytes(bytes);
158 ucsdet_close(csd);
159 }
160
TestUTF16(void)161 static void TestUTF16(void)
162 {
163 UErrorCode status = U_ZERO_ERROR;
164 /* Notice the BOM on the start of this string */
165 static const UChar chars[] = {
166 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
167 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
168 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
169 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
170 0x064a, 0x062a, 0x0000};
171 int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
172 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
173 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
174 UCharsetDetector *csd = ucsdet_open(&status);
175 const UCharsetMatch *match;
176 const char *name;
177 int32_t conf;
178
179 ucsdet_setText(csd, beBytes, beLength, &status);
180 match = ucsdet_detect(csd, &status);
181
182 if (match == NULL) {
183 log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
184 goto try_le;
185 }
186
187 name = ucsdet_getName(match, &status);
188 conf = ucsdet_getConfidence(match, &status);
189
190 if (strcmp(name, "UTF-16BE") != 0) {
191 log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
192 }
193
194 if (conf != 100) {
195 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
196 }
197
198 try_le:
199 ucsdet_setText(csd, leBytes, leLength, &status);
200 match = ucsdet_detect(csd, &status);
201
202 if (match == NULL) {
203 log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
204 goto bail;
205 }
206
207 name = ucsdet_getName(match, &status);
208 conf = ucsdet_getConfidence(match, &status);
209
210
211 if (strcmp(name, "UTF-16LE") != 0) {
212 log_err("Encoding detection failure for UTF-16LE: got %s\n", name);
213 }
214
215 if (conf != 100) {
216 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
217 }
218
219 bail:
220 freeBytes(leBytes);
221 freeBytes(beBytes);
222 ucsdet_close(csd);
223 }
224
TestC1Bytes(void)225 static void TestC1Bytes(void)
226 {
227 #if !UCONFIG_NO_LEGACY_CONVERSION
228 UErrorCode status = U_ZERO_ERROR;
229 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
230 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
231 int32_t sISOLength = 0, sWindowsLength = 0;
232 UChar sISO[sizeof(ssISO)];
233 UChar sWindows[sizeof(ssWindows)];
234 int32_t lISO = 0, lWindows = 0;
235 char *bISO;
236 char *bWindows;
237 UCharsetDetector *csd = ucsdet_open(&status);
238 const UCharsetMatch *match;
239 const char *name;
240
241 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
242 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
243 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
244 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
245
246 ucsdet_setText(csd, bWindows, lWindows, &status);
247 match = ucsdet_detect(csd, &status);
248
249 if (match == NULL) {
250 log_err("English test with C1 bytes got no matches.\n");
251 goto bail;
252 }
253
254 name = ucsdet_getName(match, &status);
255
256 if (strcmp(name, "windows-1252") != 0) {
257 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
258 }
259
260 ucsdet_setText(csd, bISO, lISO, &status);
261 match = ucsdet_detect(csd, &status);
262
263 if (match == NULL) {
264 log_err("English text without C1 bytes got no matches.\n");
265 goto bail;
266 }
267
268 name = ucsdet_getName(match, &status);
269
270 if (strcmp(name, "ISO-8859-1") != 0) {
271 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
272 }
273
274 bail:
275 freeBytes(bWindows);
276 freeBytes(bISO);
277
278 ucsdet_close(csd);
279 #endif
280 }
281
TestInputFilter(void)282 static void TestInputFilter(void)
283 {
284 UErrorCode status = U_ZERO_ERROR;
285 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
286 int32_t sLength = 0;
287 UChar s[sizeof(ss)];
288 int32_t byteLength = 0;
289 char *bytes;
290 UCharsetDetector *csd = ucsdet_open(&status);
291 const UCharsetMatch *match;
292 const char *lang, *name;
293
294 sLength = u_unescape(ss, s, sizeof(ss));
295 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
296
297 ucsdet_enableInputFilter(csd, true);
298
299 if (!ucsdet_isInputFilterEnabled(csd)) {
300 log_err("ucsdet_enableInputFilter(csd, true) did not enable input filter!\n");
301 }
302
303
304 ucsdet_setText(csd, bytes, byteLength, &status);
305 match = ucsdet_detect(csd, &status);
306
307 if (match == NULL) {
308 log_err("Turning on the input filter resulted in no matches.\n");
309 goto turn_off;
310 }
311
312 name = ucsdet_getName(match, &status);
313
314 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
315 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
316 } else {
317 lang = ucsdet_getLanguage(match, &status);
318
319 if (lang == NULL || strcmp(lang, "fr") != 0) {
320 log_err("Input filter did not strip markup!\n");
321 }
322 }
323
324 turn_off:
325 ucsdet_enableInputFilter(csd, false);
326 ucsdet_setText(csd, bytes, byteLength, &status);
327 match = ucsdet_detect(csd, &status);
328
329 if (match == NULL) {
330 log_err("Turning off the input filter resulted in no matches.\n");
331 goto bail;
332 }
333
334 name = ucsdet_getName(match, &status);
335
336 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
337 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
338 } else {
339 lang = ucsdet_getLanguage(match, &status);
340
341 if (lang == NULL || strcmp(lang, "en") != 0) {
342 log_err("Unfiltered input did not detect as English!\n");
343 }
344 }
345
346 bail:
347 freeBytes(bytes);
348 ucsdet_close(csd);
349 }
350
TestChaining(void)351 static void TestChaining(void) {
352 UErrorCode status = U_USELESS_COLLATOR_ERROR;
353
354 ucsdet_open(&status);
355 ucsdet_setText(NULL, NULL, 0, &status);
356 ucsdet_getName(NULL, &status);
357 ucsdet_getConfidence(NULL, &status);
358 ucsdet_getLanguage(NULL, &status);
359 ucsdet_detect(NULL, &status);
360 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
361 ucsdet_detectAll(NULL, NULL, &status);
362 ucsdet_getUChars(NULL, NULL, 0, &status);
363 ucsdet_getUChars(NULL, NULL, 0, &status);
364 ucsdet_close(NULL);
365
366 /* All of this code should have done nothing. */
367 if (status != U_USELESS_COLLATOR_ERROR) {
368 log_err("Status got changed to %s\n", u_errorName(status));
369 }
370 }
371
TestBufferOverflow(void)372 static void TestBufferOverflow(void) {
373 UErrorCode status = U_ZERO_ERROR;
374 static const char *testStrings[] = {
375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
377 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
378 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
379 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
380 "\xa1", /* Could be a single byte shift-jis at the end */
381 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
382 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
383 };
384 static const char *testResults[] = {
385 "windows-1252",
386 "windows-1252",
387 "windows-1252",
388 "windows-1252",
389 "ISO-2022-JP",
390 NULL,
391 NULL,
392 "ISO-8859-1"
393 };
394 int32_t idx = 0;
395 UCharsetDetector *csd = ucsdet_open(&status);
396 const UCharsetMatch *match;
397
398 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
399
400 if (U_FAILURE(status)) {
401 log_err("Couldn't open detector. %s\n", u_errorName(status));
402 goto bail;
403 }
404
405 for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
406 status = U_ZERO_ERROR;
407 ucsdet_setText(csd, testStrings[idx], -1, &status);
408 match = ucsdet_detect(csd, &status);
409
410 if (match == NULL) {
411 if (testResults[idx] != NULL) {
412 log_err("Unexpectedly got no results at index %d.\n", idx);
413 }
414 else {
415 log_verbose("Got no result as expected at index %d.\n", idx);
416 }
417 continue;
418 }
419
420 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
421 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
422 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
423 goto bail;
424 }
425 }
426
427 bail:
428 ucsdet_close(csd);
429 }
430
TestIBM424(void)431 static void TestIBM424(void)
432 {
433 UErrorCode status = U_ZERO_ERROR;
434
435 static const UChar chars[] = {
436 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
437 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
438 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
439 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
440 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
441 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
442 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
443 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
444 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
445 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
446 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
447 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
448 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
449 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
450 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
451 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
452 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
453 };
454
455 static const UChar chars_reverse[] = {
456 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
457 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
458 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
459 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
460 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
461 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
462 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
463 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
464 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
465 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
466 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
467 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
468 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
469 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
470 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
471 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
472 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
473 0x0000
474 };
475
476 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
477
478 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
479 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
480
481 UCharsetDetector *csd = ucsdet_open(&status);
482 const UCharsetMatch *match;
483 const char *name;
484
485 ucsdet_setText(csd, bytes, bLength, &status);
486 match = ucsdet_detect(csd, &status);
487
488 if (match == NULL) {
489 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
490 goto bail;
491 }
492
493 name = ucsdet_getName(match, &status);
494 if (strcmp(name, "IBM424_rtl") != 0) {
495 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
496 }
497
498 ucsdet_setText(csd, bytes_r, brLength, &status);
499 match = ucsdet_detect(csd, &status);
500
501 if (match == NULL) {
502 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
503 goto bail;
504 }
505
506 name = ucsdet_getName(match, &status);
507 if (strcmp(name, "IBM424_ltr") != 0) {
508 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
509 }
510
511 bail:
512 freeBytes(bytes);
513 freeBytes(bytes_r);
514 ucsdet_close(csd);
515 }
516
TestIBM420(void)517 static void TestIBM420(void)
518 {
519 UErrorCode status = U_ZERO_ERROR;
520
521 static const UChar chars[] = {
522 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
523 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
524 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
525 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
526 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
527 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
528 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
529 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
530 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
531 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
532 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
533 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
534 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
535 0x0000
536 };
537 static const UChar chars_reverse[] = {
538 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
539 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
540 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
541 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
542 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
543 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
544 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
545 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
546 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
547 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
548 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
549 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
550 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
551 0x0000,
552 };
553
554 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
555
556 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
557 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
558
559 UCharsetDetector *csd = ucsdet_open(&status);
560 const UCharsetMatch *match;
561 const char *name;
562
563 ucsdet_setText(csd, bytes, bLength, &status);
564 match = ucsdet_detect(csd, &status);
565
566 if (match == NULL) {
567 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
568 goto bail;
569 }
570
571 name = ucsdet_getName(match, &status);
572 if (strcmp(name, "IBM420_rtl") != 0) {
573 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
574 }
575
576 ucsdet_setText(csd, bytes_r, brLength, &status);
577 match = ucsdet_detect(csd, &status);
578
579 if (match == NULL) {
580 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
581 goto bail;
582 }
583
584 name = ucsdet_getName(match, &status);
585 if (strcmp(name, "IBM420_ltr") != 0) {
586 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
587 }
588
589 bail:
590 freeBytes(bytes);
591 freeBytes(bytes_r);
592 ucsdet_close(csd);
593 }
594