• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ****************************************************************************
5  * Copyright (c) 2005-2016, International Business Machines Corporation and *
6  * others. All Rights Reserved.                                             *
7  ****************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #include "unicode/ucsdet.h"
13 #include "unicode/ucnv.h"
14 #include "unicode/ustring.h"
15 
16 #include "cintltst.h"
17 #include "cmemory.h"
18 
19 #include <stdbool.h>
20 #include <stdlib.h>
21 #include <string.h>
22 
/*
 * Allocate an uninitialized array of `count` elements of `type` with malloc().
 * The result is NULL when the allocation fails. The whole expansion is
 * parenthesized so the macro can be used inside a larger expression
 * (for example, directly indexed) without mis-parsing.
 */
#define NEW_ARRAY(type,count) ((type *) malloc((count) * sizeof(type)))
/* Release an array obtained from NEW_ARRAY; expands to free(). */
#define DELETE_ARRAY(array) free(array)
25 
26 static void TestConstruction(void);
27 static void TestUTF8(void);
28 static void TestUTF16(void);
29 static void TestC1Bytes(void);
30 static void TestInputFilter(void);
31 static void TestChaining(void);
32 static void TestBufferOverflow(void);
33 static void TestIBM424(void);
34 static void TestIBM420(void);
35 
36 void addUCsdetTest(TestNode** root);
37 
/**
 * Registers the charset-detector test cases of this file with addTest(),
 * each under a name beginning with "ucsdetst/".
 *
 * @param root Test tree root, passed through unchanged to addTest().
 */
void addUCsdetTest(TestNode** root)
{
    /* Registration table; entries are added in exactly this order. */
    static const struct {
        void (*run)(void);
        const char *path;
    } kTests[] = {
        { &TestConstruction,   "ucsdetst/TestConstruction" },
        { &TestUTF8,           "ucsdetst/TestUTF8" },
        { &TestUTF16,          "ucsdetst/TestUTF16" },
        { &TestC1Bytes,        "ucsdetst/TestC1Bytes" },
        { &TestInputFilter,    "ucsdetst/TestInputFilter" },
        { &TestChaining,       "ucsdetst/TestErrorChaining" },
        { &TestBufferOverflow, "ucsdetst/TestBufferOverflow" },
#if !UCONFIG_NO_LEGACY_CONVERSION
        { &TestIBM424,         "ucsdetst/TestIBM424" },
        { &TestIBM420,         "ucsdetst/TestIBM420" },
#endif
    };
    int32_t i;

    for (i = 0; i < UPRV_LENGTHOF(kTests); i++) {
        addTest(root, kTests[i].run, kTests[i].path);
    }
}
52 
/**
 * Counts the bytes a converter produces for a UTF-16 string by converting it
 * piecewise into a fixed scratch buffer and discarding the output.
 *
 * @param src    Start of the UChar text to convert.
 * @param length Number of UChars available at src.
 * @param cnv    Converter handed to ucnv_fromUnicode().
 * @return Sum of the bytes written over all conversion passes.
 */
static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
{
    char scratch[1024];
    char *const scratchLimit = scratch + sizeof(scratch);
    const UChar *const srcLimit = src + length;
    int32_t total = 0;

    for (;;) {
        char *out = scratch;
        UErrorCode status = U_ZERO_ERROR;

        /* src is advanced by the call, so each pass continues where the last one stopped. */
        ucnv_fromUnicode(cnv, &out, scratchLimit, &src, srcLimit, 0, true, &status);
        total += (int32_t) (out - scratch);

        /* Go around again only while the converter reports a full output buffer. */
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            break;
        }
    }

    return total;
}
70 
/**
 * Converts a UTF-16 string to the named codepage and returns the resulting
 * bytes in a newly allocated buffer.
 *
 * The required size is first measured with preflight(); the buffer holds one
 * byte more than that, and the extra byte is set to 0.
 *
 * @param src        Start of the UChar text to convert.
 * @param length     Number of UChars available at src.
 * @param codepage   Converter name passed to ucnv_open().
 * @param byteLength Receives the number of converted bytes as measured by
 *                   preflight(), or 0 when the buffer could not be allocated.
 * @return The allocated buffer, to be released with freeBytes(), or NULL when
 *         the allocation failed.
 */
static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *cnv = ucnv_open(codepage, &status);
    int32_t byteCount = preflight(src, length, cnv);
    const UChar *srcLimit = src + length;
    char *bytes = NEW_ARRAY(char, byteCount + 1);
    char *dest, *destLimit;

    if (bytes == NULL) {
        /* Out of memory: do not form pointers from NULL or convert into it. */
        ucnv_close(cnv);
        *byteLength = 0;
        return NULL;
    }

    dest = bytes;
    destLimit = bytes + byteCount + 1;

    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, true, &status);
    ucnv_close(cnv);

    /* The spare byte would otherwise be left uninitialized. */
    bytes[byteCount] = 0;

    *byteLength = byteCount;
    return bytes;
}
86 
/**
 * Releases a byte buffer through DELETE_ARRAY.
 *
 * @param bytes Buffer to release, e.g. one returned by extractBytes().
 */
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
91 
TestConstruction(void)92 static void TestConstruction(void)
93 {
94     UErrorCode status = U_ZERO_ERROR;
95     UCharsetDetector *csd = ucsdet_open(&status);
96     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
97     const char *name;
98     int32_t count = uenum_count(e, &status);
99     int32_t i, length;
100 
101     for(i = 0; i < count; i += 1) {
102         name = uenum_next(e, &length, &status);
103 
104         if(name == NULL || length <= 0) {
105             log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
106         }
107     }
108     /* one past the list of all names must return NULL */
109     name = uenum_next(e, &length, &status);
110     if(name != NULL || length != 0 || U_FAILURE(status)) {
111         log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
112     }
113 
114     uenum_close(e);
115     ucsdet_close(csd);
116 }
117 
TestUTF8(void)118 static void TestUTF8(void)
119 {
120     UErrorCode status = U_ZERO_ERROR;
121     static const char ss[] = "This is a string with some non-ascii characters that will "
122                "be converted to UTF-8, then shoved through the detection process.  "
123                "\\u0391\\u0392\\u0393\\u0394\\u0395"
124                "Sure would be nice if our source could contain Unicode directly!";
125     int32_t byteLength = 0, sLength = 0, dLength = 0;
126     UChar s[sizeof(ss)];
127     char *bytes;
128     UCharsetDetector *csd = ucsdet_open(&status);
129     const UCharsetMatch *match;
130     UChar detected[sizeof(ss)];
131 
132     sLength = u_unescape(ss, s, sizeof(ss));
133     bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
134 
135     ucsdet_setText(csd, bytes, byteLength, &status);
136     if (U_FAILURE(status)) {
137         log_err("status is %s\n", u_errorName(status));
138         goto bail;
139     }
140 
141     match = ucsdet_detect(csd, &status);
142 
143     if (match == NULL) {
144         log_err("Detection failure for UTF-8: got no matches.\n");
145         goto bail;
146     }
147 
148     dLength = ucsdet_getUChars(match, detected, sLength, &status);
149 
150     if (u_strCompare(detected, dLength, s, sLength, false) != 0) {
151         log_err("Round-trip test failed!\n");
152     }
153 
154     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
155 
156 bail:
157     freeBytes(bytes);
158     ucsdet_close(csd);
159 }
160 
TestUTF16(void)161 static void TestUTF16(void)
162 {
163     UErrorCode status = U_ZERO_ERROR;
164     /* Notice the BOM on the start of this string */
165     static const UChar chars[] = {
166         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
167         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
168         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
169         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
170         0x064a, 0x062a, 0x0000};
171     int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
172     char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
173     char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
174     UCharsetDetector *csd = ucsdet_open(&status);
175     const UCharsetMatch *match;
176     const char *name;
177     int32_t conf;
178 
179     ucsdet_setText(csd, beBytes, beLength, &status);
180     match = ucsdet_detect(csd, &status);
181 
182     if (match == NULL) {
183         log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
184         goto try_le;
185     }
186 
187     name  = ucsdet_getName(match, &status);
188     conf  = ucsdet_getConfidence(match, &status);
189 
190     if (strcmp(name, "UTF-16BE") != 0) {
191         log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
192     }
193 
194     if (conf != 100) {
195         log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
196     }
197 
198 try_le:
199     ucsdet_setText(csd, leBytes, leLength, &status);
200     match = ucsdet_detect(csd, &status);
201 
202     if (match == NULL) {
203         log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
204         goto bail;
205     }
206 
207     name  = ucsdet_getName(match, &status);
208     conf = ucsdet_getConfidence(match, &status);
209 
210 
211     if (strcmp(name, "UTF-16LE") != 0) {
212         log_err("Encoding detection failure for UTF-16LE: got %s\n", name);
213     }
214 
215     if (conf != 100) {
216         log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
217     }
218 
219 bail:
220     freeBytes(leBytes);
221     freeBytes(beBytes);
222     ucsdet_close(csd);
223 }
224 
TestC1Bytes(void)225 static void TestC1Bytes(void)
226 {
227 #if !UCONFIG_NO_LEGACY_CONVERSION
228     UErrorCode status = U_ZERO_ERROR;
229     static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
230     static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
231     int32_t sISOLength = 0, sWindowsLength = 0;
232     UChar sISO[sizeof(ssISO)];
233     UChar sWindows[sizeof(ssWindows)];
234     int32_t lISO = 0, lWindows = 0;
235     char *bISO;
236     char *bWindows;
237     UCharsetDetector *csd = ucsdet_open(&status);
238     const UCharsetMatch *match;
239     const char *name;
240 
241     sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
242     sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
243     bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
244     bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
245 
246     ucsdet_setText(csd, bWindows, lWindows, &status);
247     match = ucsdet_detect(csd, &status);
248 
249     if (match == NULL) {
250         log_err("English test with C1 bytes got no matches.\n");
251         goto bail;
252     }
253 
254     name  = ucsdet_getName(match, &status);
255 
256     if (strcmp(name, "windows-1252") != 0) {
257         log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
258     }
259 
260     ucsdet_setText(csd, bISO, lISO, &status);
261     match = ucsdet_detect(csd, &status);
262 
263     if (match == NULL) {
264         log_err("English text without C1 bytes got no matches.\n");
265         goto bail;
266     }
267 
268     name  = ucsdet_getName(match, &status);
269 
270     if (strcmp(name, "ISO-8859-1") != 0) {
271         log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
272     }
273 
274 bail:
275     freeBytes(bWindows);
276     freeBytes(bISO);
277 
278     ucsdet_close(csd);
279 #endif
280 }
281 
TestInputFilter(void)282 static void TestInputFilter(void)
283 {
284     UErrorCode status = U_ZERO_ERROR;
285     static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
286     int32_t sLength = 0;
287     UChar s[sizeof(ss)];
288     int32_t byteLength = 0;
289     char *bytes;
290     UCharsetDetector *csd = ucsdet_open(&status);
291     const UCharsetMatch *match;
292     const char *lang, *name;
293 
294     sLength = u_unescape(ss, s, sizeof(ss));
295     bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
296 
297     ucsdet_enableInputFilter(csd, true);
298 
299     if (!ucsdet_isInputFilterEnabled(csd)) {
300         log_err("ucsdet_enableInputFilter(csd, true) did not enable input filter!\n");
301     }
302 
303 
304     ucsdet_setText(csd, bytes, byteLength, &status);
305     match = ucsdet_detect(csd, &status);
306 
307     if (match == NULL) {
308         log_err("Turning on the input filter resulted in no matches.\n");
309         goto turn_off;
310     }
311 
312     name = ucsdet_getName(match, &status);
313 
314     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
315         log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
316     } else {
317         lang = ucsdet_getLanguage(match, &status);
318 
319         if (lang == NULL || strcmp(lang, "fr") != 0) {
320             log_err("Input filter did not strip markup!\n");
321         }
322     }
323 
324 turn_off:
325     ucsdet_enableInputFilter(csd, false);
326     ucsdet_setText(csd, bytes, byteLength, &status);
327     match = ucsdet_detect(csd, &status);
328 
329     if (match == NULL) {
330         log_err("Turning off the input filter resulted in no matches.\n");
331         goto bail;
332     }
333 
334     name = ucsdet_getName(match, &status);
335 
336     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
337         log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
338     } else {
339         lang = ucsdet_getLanguage(match, &status);
340 
341         if (lang == NULL || strcmp(lang, "en") != 0) {
342             log_err("Unfiltered input did not detect as English!\n");
343         }
344     }
345 
346 bail:
347     freeBytes(bytes);
348     ucsdet_close(csd);
349 }
350 
TestChaining(void)351 static void TestChaining(void) {
352     UErrorCode status = U_USELESS_COLLATOR_ERROR;
353 
354     ucsdet_open(&status);
355     ucsdet_setText(NULL, NULL, 0, &status);
356     ucsdet_getName(NULL, &status);
357     ucsdet_getConfidence(NULL, &status);
358     ucsdet_getLanguage(NULL, &status);
359     ucsdet_detect(NULL, &status);
360     ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
361     ucsdet_detectAll(NULL, NULL, &status);
362     ucsdet_getUChars(NULL, NULL, 0, &status);
363     ucsdet_getUChars(NULL, NULL, 0, &status);
364     ucsdet_close(NULL);
365 
366     /* All of this code should have done nothing. */
367     if (status != U_USELESS_COLLATOR_ERROR) {
368         log_err("Status got changed to %s\n", u_errorName(status));
369     }
370 }
371 
TestBufferOverflow(void)372 static void TestBufferOverflow(void) {
373     UErrorCode status = U_ZERO_ERROR;
374     static const char *testStrings[] = {
375         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
376         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
377         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
378         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
379         "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
380         "\xa1", /* Could be a single byte shift-jis at the end */
381         "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
382         "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
383     };
384     static const char *testResults[] = {
385         "windows-1252",
386         "windows-1252",
387         "windows-1252",
388         "windows-1252",
389         "ISO-2022-JP",
390         NULL,
391         NULL,
392         "ISO-8859-1"
393     };
394     int32_t idx = 0;
395     UCharsetDetector *csd = ucsdet_open(&status);
396     const UCharsetMatch *match;
397 
398     ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
399 
400     if (U_FAILURE(status)) {
401         log_err("Couldn't open detector. %s\n", u_errorName(status));
402         goto bail;
403     }
404 
405     for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
406         status = U_ZERO_ERROR;
407         ucsdet_setText(csd, testStrings[idx], -1, &status);
408         match = ucsdet_detect(csd, &status);
409 
410         if (match == NULL) {
411             if (testResults[idx] != NULL) {
412                 log_err("Unexpectedly got no results at index %d.\n", idx);
413             }
414             else {
415                 log_verbose("Got no result as expected at index %d.\n", idx);
416             }
417             continue;
418         }
419 
420         if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
421             log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
422                 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
423             goto bail;
424         }
425     }
426 
427 bail:
428     ucsdet_close(csd);
429 }
430 
TestIBM424(void)431 static void TestIBM424(void)
432 {
433     UErrorCode status = U_ZERO_ERROR;
434 
435     static const UChar chars[] = {
436             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
437             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
438             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
439             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
440             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
441             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
442             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
443             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
444             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
445             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
446             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
447             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
448             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
449             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
450             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
451             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
452             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
453     };
454 
455     static const UChar chars_reverse[] = {
456             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
457             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
458             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
459             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
460             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
461             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
462             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
463             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
464             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
465             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
466             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
467             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
468             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
469             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
470             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
471             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
472             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
473             0x0000
474     };
475 
476     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
477 
478     char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
479     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
480 
481     UCharsetDetector *csd = ucsdet_open(&status);
482     const UCharsetMatch *match;
483     const char *name;
484 
485     ucsdet_setText(csd, bytes, bLength, &status);
486     match = ucsdet_detect(csd, &status);
487 
488     if (match == NULL) {
489         log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
490         goto bail;
491     }
492 
493     name  = ucsdet_getName(match, &status);
494     if (strcmp(name, "IBM424_rtl") != 0) {
495         log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
496     }
497 
498     ucsdet_setText(csd, bytes_r, brLength, &status);
499     match = ucsdet_detect(csd, &status);
500 
501     if (match == NULL) {
502         log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
503         goto bail;
504     }
505 
506     name  = ucsdet_getName(match, &status);
507     if (strcmp(name, "IBM424_ltr") != 0) {
508         log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
509     }
510 
511 bail:
512     freeBytes(bytes);
513     freeBytes(bytes_r);
514     ucsdet_close(csd);
515 }
516 
TestIBM420(void)517 static void TestIBM420(void)
518 {
519     UErrorCode status = U_ZERO_ERROR;
520 
521     static const UChar chars[] = {
522         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
523         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
524         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
525         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
526         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
527         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
528         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
529         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
530         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
531         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
532         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
533         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
534         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
535         0x0000
536     };
537     static const UChar chars_reverse[] = {
538         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
539         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
540         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
541         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
542         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
543         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
544         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
545         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
546         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
547         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
548         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
549         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
550         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
551         0x0000,
552     };
553 
554     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
555 
556     char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
557     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
558 
559     UCharsetDetector *csd = ucsdet_open(&status);
560     const UCharsetMatch *match;
561     const char *name;
562 
563     ucsdet_setText(csd, bytes, bLength, &status);
564     match = ucsdet_detect(csd, &status);
565 
566     if (match == NULL) {
567         log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
568         goto bail;
569     }
570 
571     name  = ucsdet_getName(match, &status);
572     if (strcmp(name, "IBM420_rtl") != 0) {
573         log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
574     }
575 
576     ucsdet_setText(csd, bytes_r, brLength, &status);
577     match = ucsdet_detect(csd, &status);
578 
579     if (match == NULL) {
580         log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
581         goto bail;
582     }
583 
584     name  = ucsdet_getName(match, &status);
585     if (strcmp(name, "IBM420_ltr") != 0) {
586         log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
587     }
588 
589 bail:
590     freeBytes(bytes);
591     freeBytes(bytes_r);
592     ucsdet_close(csd);
593 }
594