1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_CONVERSION
13
14 #include "unicode/ucsdet.h"
15
16 #include "csdetect.h"
17 #include "csmatch.h"
18 #include "uenumimp.h"
19
20 #include "cmemory.h"
21 #include "cstring.h"
22 #include "umutex.h"
23 #include "ucln_in.h"
24 #include "uarrsort.h"
25 #include "inputext.h"
26 #include "csrsbcs.h"
27 #include "csrmbcs.h"
28 #include "csrutf8.h"
29 #include "csrucode.h"
30 #include "csr2022.h"
31
// Allocate uninitialized storage for `count` objects of `type` via uprv_malloc.
// No constructors run; the result may be NULL and must be checked by the caller.
#define NEW_ARRAY(type,count) static_cast<type *>(uprv_malloc(sizeof(type) * (count)))
// Release storage obtained from NEW_ARRAY. No destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35 U_NAMESPACE_BEGIN
36
37 struct CSRecognizerInfo : public UMemory {
CSRecognizerInfoCSRecognizerInfo38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
40
~CSRecognizerInfoCSRecognizerInfo41 ~CSRecognizerInfo() {delete recognizer;}
42
43 CharsetRecognizer *recognizer;
44 UBool isDefaultEnabled;
45 };
46
47 U_NAMESPACE_END
48
49 static icu::CSRecognizerInfo **fCSRecognizers = NULL;
50 static icu::UInitOnce gCSRecognizersInitOnce {};
51 static int32_t fCSRecognizers_size = 0;
52
53 U_CDECL_BEGIN
csdet_cleanup(void)54 static UBool U_CALLCONV csdet_cleanup(void)
55 {
56 U_NAMESPACE_USE
57 if (fCSRecognizers != NULL) {
58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59 delete fCSRecognizers[r];
60 fCSRecognizers[r] = NULL;
61 }
62
63 DELETE_ARRAY(fCSRecognizers);
64 fCSRecognizers = NULL;
65 fCSRecognizers_size = 0;
66 }
67 gCSRecognizersInitOnce.reset();
68
69 return true;
70 }
71
72 static int32_t U_CALLCONV
charsetMatchComparator(const void *,const void * left,const void * right)73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
74 {
75 U_NAMESPACE_USE
76
77 const CharsetMatch **csm_l = (const CharsetMatch **) left;
78 const CharsetMatch **csm_r = (const CharsetMatch **) right;
79
80 // NOTE: compare is backwards to sort from highest to lowest.
81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82 }
83
initRecognizers(UErrorCode & status)84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
85 U_NAMESPACE_USE
86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87 CSRecognizerInfo *tempArray[] = {
88 new CSRecognizerInfo(new CharsetRecog_UTF8(), true),
89
90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), true),
91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), true),
92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), true),
93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), true),
94
95 new CSRecognizerInfo(new CharsetRecog_8859_1(), true),
96 new CSRecognizerInfo(new CharsetRecog_8859_2(), true),
97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), true),
98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), true),
99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), true),
100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), true),
101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), true),
102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), true),
103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), true),
104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), true),
105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), true),
106 new CSRecognizerInfo(new CharsetRecog_sjis(), true),
107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), true),
108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), true),
109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), true),
110 new CSRecognizerInfo(new CharsetRecog_big5(), true),
111
112 new CSRecognizerInfo(new CharsetRecog_2022JP(), true),
113 #if !UCONFIG_ONLY_HTML_CONVERSION
114 new CSRecognizerInfo(new CharsetRecog_2022KR(), true),
115 new CSRecognizerInfo(new CharsetRecog_2022CN(), true),
116
117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), false),
118 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), false),
119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), false),
120 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), false)
121 #endif
122 };
123 int32_t rCount = UPRV_LENGTHOF(tempArray);
124
125 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
126
127 if (fCSRecognizers == NULL) {
128 status = U_MEMORY_ALLOCATION_ERROR;
129 }
130 else {
131 fCSRecognizers_size = rCount;
132 for (int32_t r = 0; r < rCount; r += 1) {
133 fCSRecognizers[r] = tempArray[r];
134 if (fCSRecognizers[r] == NULL) {
135 status = U_MEMORY_ALLOCATION_ERROR;
136 }
137 }
138 }
139 }
140
141 U_CDECL_END
142
143 U_NAMESPACE_BEGIN
144
setRecognizers(UErrorCode & status)145 void CharsetDetector::setRecognizers(UErrorCode &status)
146 {
147 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
148 }
149
CharsetDetector(UErrorCode & status)150 CharsetDetector::CharsetDetector(UErrorCode &status)
151 : textIn(new InputText(status)), resultArray(NULL),
152 resultCount(0), fStripTags(false), fFreshTextSet(false),
153 fEnabledRecognizers(NULL)
154 {
155 if (U_FAILURE(status)) {
156 return;
157 }
158
159 setRecognizers(status);
160
161 if (U_FAILURE(status)) {
162 return;
163 }
164
165 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
166
167 if (resultArray == NULL) {
168 status = U_MEMORY_ALLOCATION_ERROR;
169 return;
170 }
171
172 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
173 resultArray[i] = new CharsetMatch();
174
175 if (resultArray[i] == NULL) {
176 status = U_MEMORY_ALLOCATION_ERROR;
177 break;
178 }
179 }
180 }
181
~CharsetDetector()182 CharsetDetector::~CharsetDetector()
183 {
184 delete textIn;
185
186 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187 delete resultArray[i];
188 }
189
190 uprv_free(resultArray);
191
192 if (fEnabledRecognizers) {
193 uprv_free(fEnabledRecognizers);
194 }
195 }
196
setText(const char * in,int32_t len)197 void CharsetDetector::setText(const char *in, int32_t len)
198 {
199 textIn->setText(in, len);
200 fFreshTextSet = true;
201 }
202
setStripTagsFlag(UBool flag)203 UBool CharsetDetector::setStripTagsFlag(UBool flag)
204 {
205 UBool temp = fStripTags;
206 fStripTags = flag;
207 fFreshTextSet = true;
208 return temp;
209 }
210
getStripTagsFlag() const211 UBool CharsetDetector::getStripTagsFlag() const
212 {
213 return fStripTags;
214 }
215
setDeclaredEncoding(const char * encoding,int32_t len) const216 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
217 {
218 textIn->setDeclaredEncoding(encoding,len);
219 }
220
getDetectableCount()221 int32_t CharsetDetector::getDetectableCount()
222 {
223 UErrorCode status = U_ZERO_ERROR;
224
225 setRecognizers(status);
226
227 return fCSRecognizers_size;
228 }
229
detect(UErrorCode & status)230 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
231 {
232 int32_t maxMatchesFound = 0;
233
234 detectAll(maxMatchesFound, status);
235
236 if(maxMatchesFound > 0) {
237 return resultArray[0];
238 } else {
239 return NULL;
240 }
241 }
242
detectAll(int32_t & maxMatchesFound,UErrorCode & status)243 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
244 {
245 if(!textIn->isSet()) {
246 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
247
248 return NULL;
249 } else if (fFreshTextSet) {
250 CharsetRecognizer *csr;
251 int32_t i;
252
253 textIn->MungeInput(fStripTags);
254
255 // Iterate over all possible charsets, remember all that
256 // give a match quality > 0.
257 resultCount = 0;
258 for (i = 0; i < fCSRecognizers_size; i += 1) {
259 csr = fCSRecognizers[i]->recognizer;
260 if (csr->match(textIn, resultArray[resultCount])) {
261 resultCount++;
262 }
263 }
264
265 if (resultCount > 1) {
266 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, true, &status);
267 }
268 fFreshTextSet = false;
269 }
270
271 maxMatchesFound = resultCount;
272
273 if (maxMatchesFound == 0) {
274 status = U_INVALID_CHAR_FOUND;
275 return NULL;
276 }
277
278 return resultArray;
279 }
280
setDetectableCharset(const char * encoding,UBool enabled,UErrorCode & status)281 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
282 {
283 if (U_FAILURE(status)) {
284 return;
285 }
286
287 int32_t modIdx = -1;
288 UBool isDefaultVal = false;
289 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
290 CSRecognizerInfo *csrinfo = fCSRecognizers[i];
291 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
292 modIdx = i;
293 isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
294 break;
295 }
296 }
297 if (modIdx < 0) {
298 // No matching encoding found
299 status = U_ILLEGAL_ARGUMENT_ERROR;
300 return;
301 }
302
303 if (fEnabledRecognizers == NULL && !isDefaultVal) {
304 // Create an array storing the non default setting
305 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
306 if (fEnabledRecognizers == NULL) {
307 status = U_MEMORY_ALLOCATION_ERROR;
308 return;
309 }
310 // Initialize the array with default info
311 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
312 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
313 }
314 }
315
316 if (fEnabledRecognizers != NULL) {
317 fEnabledRecognizers[modIdx] = enabled;
318 }
319 }
320
321 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
322 {
323 if( index > fCSRecognizers_size-1 || index < 0) {
324 status = U_INDEX_OUTOFBOUNDS_ERROR;
325
326 return 0;
327 } else {
328 return fCSRecognizers[index]->getName();
329 }
330 }*/
331
332 U_NAMESPACE_END
333
334 U_CDECL_BEGIN
335 typedef struct {
336 int32_t currIndex;
337 UBool all;
338 UBool *enabledRecognizers;
339 } Context;
340
341
342
343 static void U_CALLCONV
enumClose(UEnumeration * en)344 enumClose(UEnumeration *en) {
345 if(en->context != NULL) {
346 DELETE_ARRAY(en->context);
347 }
348
349 DELETE_ARRAY(en);
350 }
351
352 static int32_t U_CALLCONV
enumCount(UEnumeration * en,UErrorCode *)353 enumCount(UEnumeration *en, UErrorCode *) {
354 if (((Context *)en->context)->all) {
355 // ucsdet_getAllDetectableCharsets, all charset detector names
356 return fCSRecognizers_size;
357 }
358
359 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
360 int32_t count = 0;
361 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
362 if (enabledArray != NULL) {
363 // custom set
364 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
365 if (enabledArray[i]) {
366 count++;
367 }
368 }
369 } else {
370 // default set
371 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
372 if (fCSRecognizers[i]->isDefaultEnabled) {
373 count++;
374 }
375 }
376 }
377 return count;
378 }
379
380 static const char* U_CALLCONV
enumNext(UEnumeration * en,int32_t * resultLength,UErrorCode *)381 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
382 const char *currName = NULL;
383
384 if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
385 if (((Context *)en->context)->all) {
386 // ucsdet_getAllDetectableCharsets, all charset detector names
387 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
388 ((Context *)en->context)->currIndex++;
389 } else {
390 // ucsdet_getDetectableCharsets
391 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
392 if (enabledArray != NULL) {
393 // custom set
394 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
395 if (enabledArray[((Context *)en->context)->currIndex]) {
396 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
397 }
398 ((Context *)en->context)->currIndex++;
399 }
400 } else {
401 // default set
402 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
403 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
404 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
405 }
406 ((Context *)en->context)->currIndex++;
407 }
408 }
409 }
410 }
411
412 if(resultLength != NULL) {
413 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
414 }
415
416 return currName;
417 }
418
419
420 static void U_CALLCONV
enumReset(UEnumeration * en,UErrorCode *)421 enumReset(UEnumeration *en, UErrorCode *) {
422 ((Context *)en->context)->currIndex = 0;
423 }
424
425 static const UEnumeration gCSDetEnumeration = {
426 NULL,
427 NULL,
428 enumClose,
429 enumCount,
430 uenum_unextDefault,
431 enumNext,
432 enumReset
433 };
434
435 U_CDECL_END
436
437 U_NAMESPACE_BEGIN
438
getAllDetectableCharsets(UErrorCode & status)439 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
440 {
441
442 /* Initialize recognized charsets. */
443 setRecognizers(status);
444
445 if(U_FAILURE(status)) {
446 return 0;
447 }
448
449 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
450 if (en == NULL) {
451 status = U_MEMORY_ALLOCATION_ERROR;
452 return 0;
453 }
454 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
455 en->context = (void*)NEW_ARRAY(Context, 1);
456 if (en->context == NULL) {
457 status = U_MEMORY_ALLOCATION_ERROR;
458 DELETE_ARRAY(en);
459 return 0;
460 }
461 uprv_memset(en->context, 0, sizeof(Context));
462 ((Context*)en->context)->all = true;
463 return en;
464 }
465
getDetectableCharsets(UErrorCode & status) const466 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
467 {
468 if(U_FAILURE(status)) {
469 return 0;
470 }
471
472 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
473 if (en == NULL) {
474 status = U_MEMORY_ALLOCATION_ERROR;
475 return 0;
476 }
477 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
478 en->context = (void*)NEW_ARRAY(Context, 1);
479 if (en->context == NULL) {
480 status = U_MEMORY_ALLOCATION_ERROR;
481 DELETE_ARRAY(en);
482 return 0;
483 }
484 uprv_memset(en->context, 0, sizeof(Context));
485 ((Context*)en->context)->all = false;
486 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
487 return en;
488 }
489
490 U_NAMESPACE_END
491
492 #endif
493