• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_CONVERSION
13 
14 #include "unicode/ucsdet.h"
15 
16 #include "csdetect.h"
17 #include "csmatch.h"
18 #include "uenumimp.h"
19 
20 #include "cmemory.h"
21 #include "cstring.h"
22 #include "umutex.h"
23 #include "ucln_in.h"
24 #include "uarrsort.h"
25 #include "inputext.h"
26 #include "csrsbcs.h"
27 #include "csrmbcs.h"
28 #include "csrutf8.h"
29 #include "csrucode.h"
30 #include "csr2022.h"
31 
// Allocates uninitialized storage for `count` objects of `type` with uprv_malloc().
// No constructors run; the result is NULL when the allocation fails.
#define NEW_ARRAY(type,count) static_cast<type *>(uprv_malloc((count) * sizeof(type)))
// Releases storage obtained from NEW_ARRAY with uprv_free(). No destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34 
35 U_NAMESPACE_BEGIN
36 
37 struct CSRecognizerInfo : public UMemory {
CSRecognizerInfoCSRecognizerInfo38     CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39         : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
40 
~CSRecognizerInfoCSRecognizerInfo41     ~CSRecognizerInfo() {delete recognizer;}
42 
43     CharsetRecognizer *recognizer;
44     UBool isDefaultEnabled;
45 };
46 
47 U_NAMESPACE_END
48 
49 static icu::CSRecognizerInfo **fCSRecognizers = NULL;
50 static icu::UInitOnce gCSRecognizersInitOnce {};
51 static int32_t fCSRecognizers_size = 0;
52 
53 U_CDECL_BEGIN
csdet_cleanup(void)54 static UBool U_CALLCONV csdet_cleanup(void)
55 {
56     U_NAMESPACE_USE
57     if (fCSRecognizers != NULL) {
58         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59             delete fCSRecognizers[r];
60             fCSRecognizers[r] = NULL;
61         }
62 
63         DELETE_ARRAY(fCSRecognizers);
64         fCSRecognizers = NULL;
65         fCSRecognizers_size = 0;
66     }
67     gCSRecognizersInitOnce.reset();
68 
69     return true;
70 }
71 
72 static int32_t U_CALLCONV
charsetMatchComparator(const void *,const void * left,const void * right)73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
74 {
75     U_NAMESPACE_USE
76 
77     const CharsetMatch **csm_l = (const CharsetMatch **) left;
78     const CharsetMatch **csm_r = (const CharsetMatch **) right;
79 
80     // NOTE: compare is backwards to sort from highest to lowest.
81     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82 }
83 
initRecognizers(UErrorCode & status)84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
85     U_NAMESPACE_USE
86     ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87     CSRecognizerInfo *tempArray[] = {
88         new CSRecognizerInfo(new CharsetRecog_UTF8(), true),
89 
90         new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), true),
91         new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), true),
92         new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), true),
93         new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), true),
94 
95         new CSRecognizerInfo(new CharsetRecog_8859_1(), true),
96         new CSRecognizerInfo(new CharsetRecog_8859_2(), true),
97         new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), true),
98         new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), true),
99         new CSRecognizerInfo(new CharsetRecog_8859_7_el(), true),
100         new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), true),
101         new CSRecognizerInfo(new CharsetRecog_8859_8_he(), true),
102         new CSRecognizerInfo(new CharsetRecog_windows_1251(), true),
103         new CSRecognizerInfo(new CharsetRecog_windows_1256(), true),
104         new CSRecognizerInfo(new CharsetRecog_KOI8_R(), true),
105         new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), true),
106         new CSRecognizerInfo(new CharsetRecog_sjis(), true),
107         new CSRecognizerInfo(new CharsetRecog_gb_18030(), true),
108         new CSRecognizerInfo(new CharsetRecog_euc_jp(), true),
109         new CSRecognizerInfo(new CharsetRecog_euc_kr(), true),
110         new CSRecognizerInfo(new CharsetRecog_big5(), true),
111 
112         new CSRecognizerInfo(new CharsetRecog_2022JP(), true),
113 #if !UCONFIG_ONLY_HTML_CONVERSION
114         new CSRecognizerInfo(new CharsetRecog_2022KR(), true),
115         new CSRecognizerInfo(new CharsetRecog_2022CN(), true),
116 
117         new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), false),
118         new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), false),
119         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), false),
120         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), false)
121 #endif
122     };
123     int32_t rCount = UPRV_LENGTHOF(tempArray);
124 
125     fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
126 
127     if (fCSRecognizers == NULL) {
128         status = U_MEMORY_ALLOCATION_ERROR;
129     }
130     else {
131         fCSRecognizers_size = rCount;
132         for (int32_t r = 0; r < rCount; r += 1) {
133             fCSRecognizers[r] = tempArray[r];
134             if (fCSRecognizers[r] == NULL) {
135                 status = U_MEMORY_ALLOCATION_ERROR;
136             }
137         }
138     }
139 }
140 
141 U_CDECL_END
142 
143 U_NAMESPACE_BEGIN
144 
setRecognizers(UErrorCode & status)145 void CharsetDetector::setRecognizers(UErrorCode &status)
146 {
147     umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
148 }
149 
CharsetDetector(UErrorCode & status)150 CharsetDetector::CharsetDetector(UErrorCode &status)
151   : textIn(new InputText(status)), resultArray(NULL),
152     resultCount(0), fStripTags(false), fFreshTextSet(false),
153     fEnabledRecognizers(NULL)
154 {
155     if (U_FAILURE(status)) {
156         return;
157     }
158 
159     setRecognizers(status);
160 
161     if (U_FAILURE(status)) {
162         return;
163     }
164 
165     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
166 
167     if (resultArray == NULL) {
168         status = U_MEMORY_ALLOCATION_ERROR;
169         return;
170     }
171 
172     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
173         resultArray[i] = new CharsetMatch();
174 
175         if (resultArray[i] == NULL) {
176             status = U_MEMORY_ALLOCATION_ERROR;
177             break;
178         }
179     }
180 }
181 
~CharsetDetector()182 CharsetDetector::~CharsetDetector()
183 {
184     delete textIn;
185 
186     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187         delete resultArray[i];
188     }
189 
190     uprv_free(resultArray);
191 
192     if (fEnabledRecognizers) {
193         uprv_free(fEnabledRecognizers);
194     }
195 }
196 
setText(const char * in,int32_t len)197 void CharsetDetector::setText(const char *in, int32_t len)
198 {
199     textIn->setText(in, len);
200     fFreshTextSet = true;
201 }
202 
setStripTagsFlag(UBool flag)203 UBool CharsetDetector::setStripTagsFlag(UBool flag)
204 {
205     UBool temp = fStripTags;
206     fStripTags = flag;
207     fFreshTextSet = true;
208     return temp;
209 }
210 
getStripTagsFlag() const211 UBool CharsetDetector::getStripTagsFlag() const
212 {
213     return fStripTags;
214 }
215 
setDeclaredEncoding(const char * encoding,int32_t len) const216 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
217 {
218     textIn->setDeclaredEncoding(encoding,len);
219 }
220 
getDetectableCount()221 int32_t CharsetDetector::getDetectableCount()
222 {
223     UErrorCode status = U_ZERO_ERROR;
224 
225     setRecognizers(status);
226 
227     return fCSRecognizers_size;
228 }
229 
detect(UErrorCode & status)230 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
231 {
232     int32_t maxMatchesFound = 0;
233 
234     detectAll(maxMatchesFound, status);
235 
236     if(maxMatchesFound > 0) {
237         return resultArray[0];
238     } else {
239         return NULL;
240     }
241 }
242 
detectAll(int32_t & maxMatchesFound,UErrorCode & status)243 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
244 {
245     if(!textIn->isSet()) {
246         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
247 
248         return NULL;
249     } else if (fFreshTextSet) {
250         CharsetRecognizer *csr;
251         int32_t            i;
252 
253         textIn->MungeInput(fStripTags);
254 
255         // Iterate over all possible charsets, remember all that
256         // give a match quality > 0.
257         resultCount = 0;
258         for (i = 0; i < fCSRecognizers_size; i += 1) {
259             csr = fCSRecognizers[i]->recognizer;
260             if (csr->match(textIn, resultArray[resultCount])) {
261                 resultCount++;
262             }
263         }
264 
265         if (resultCount > 1) {
266             uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, true, &status);
267         }
268         fFreshTextSet = false;
269     }
270 
271     maxMatchesFound = resultCount;
272 
273     if (maxMatchesFound == 0) {
274         status = U_INVALID_CHAR_FOUND;
275         return NULL;
276     }
277 
278     return resultArray;
279 }
280 
setDetectableCharset(const char * encoding,UBool enabled,UErrorCode & status)281 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
282 {
283     if (U_FAILURE(status)) {
284         return;
285     }
286 
287     int32_t modIdx = -1;
288     UBool isDefaultVal = false;
289     for (int32_t i = 0; i < fCSRecognizers_size; i++) {
290         CSRecognizerInfo *csrinfo = fCSRecognizers[i];
291         if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
292             modIdx = i;
293             isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
294             break;
295         }
296     }
297     if (modIdx < 0) {
298         // No matching encoding found
299         status = U_ILLEGAL_ARGUMENT_ERROR;
300         return;
301     }
302 
303     if (fEnabledRecognizers == NULL && !isDefaultVal) {
304         // Create an array storing the non default setting
305         fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
306         if (fEnabledRecognizers == NULL) {
307             status = U_MEMORY_ALLOCATION_ERROR;
308             return;
309         }
310         // Initialize the array with default info
311         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
312             fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
313         }
314     }
315 
316     if (fEnabledRecognizers != NULL) {
317         fEnabledRecognizers[modIdx] = enabled;
318     }
319 }
320 
321 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
322 {
323     if( index > fCSRecognizers_size-1 || index < 0) {
324         status = U_INDEX_OUTOFBOUNDS_ERROR;
325 
326         return 0;
327     } else {
328         return fCSRecognizers[index]->getName();
329     }
330 }*/
331 
332 U_NAMESPACE_END
333 
334 U_CDECL_BEGIN
335 typedef struct {
336     int32_t currIndex;
337     UBool all;
338     UBool *enabledRecognizers;
339 } Context;
340 
341 
342 
343 static void U_CALLCONV
enumClose(UEnumeration * en)344 enumClose(UEnumeration *en) {
345     if(en->context != NULL) {
346         DELETE_ARRAY(en->context);
347     }
348 
349     DELETE_ARRAY(en);
350 }
351 
352 static int32_t U_CALLCONV
enumCount(UEnumeration * en,UErrorCode *)353 enumCount(UEnumeration *en, UErrorCode *) {
354     if (((Context *)en->context)->all) {
355         // ucsdet_getAllDetectableCharsets, all charset detector names
356         return fCSRecognizers_size;
357     }
358 
359     // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
360     int32_t count = 0;
361     UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
362     if (enabledArray != NULL) {
363         // custom set
364         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
365             if (enabledArray[i]) {
366                 count++;
367             }
368         }
369     } else {
370         // default set
371         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
372             if (fCSRecognizers[i]->isDefaultEnabled) {
373                 count++;
374             }
375         }
376     }
377     return count;
378 }
379 
380 static const char* U_CALLCONV
enumNext(UEnumeration * en,int32_t * resultLength,UErrorCode *)381 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
382     const char *currName = NULL;
383 
384     if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
385         if (((Context *)en->context)->all) {
386             // ucsdet_getAllDetectableCharsets, all charset detector names
387             currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
388             ((Context *)en->context)->currIndex++;
389         } else {
390             // ucsdet_getDetectableCharsets
391             UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
392             if (enabledArray != NULL) {
393                 // custom set
394                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
395                     if (enabledArray[((Context *)en->context)->currIndex]) {
396                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
397                     }
398                     ((Context *)en->context)->currIndex++;
399                 }
400             } else {
401                 // default set
402                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
403                     if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
404                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
405                     }
406                     ((Context *)en->context)->currIndex++;
407                 }
408             }
409         }
410     }
411 
412     if(resultLength != NULL) {
413         *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
414     }
415 
416     return currName;
417 }
418 
419 
420 static void U_CALLCONV
enumReset(UEnumeration * en,UErrorCode *)421 enumReset(UEnumeration *en, UErrorCode *) {
422     ((Context *)en->context)->currIndex = 0;
423 }
424 
425 static const UEnumeration gCSDetEnumeration = {
426     NULL,
427     NULL,
428     enumClose,
429     enumCount,
430     uenum_unextDefault,
431     enumNext,
432     enumReset
433 };
434 
435 U_CDECL_END
436 
437 U_NAMESPACE_BEGIN
438 
getAllDetectableCharsets(UErrorCode & status)439 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
440 {
441 
442     /* Initialize recognized charsets. */
443     setRecognizers(status);
444 
445     if(U_FAILURE(status)) {
446         return 0;
447     }
448 
449     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
450     if (en == NULL) {
451         status = U_MEMORY_ALLOCATION_ERROR;
452         return 0;
453     }
454     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
455     en->context = (void*)NEW_ARRAY(Context, 1);
456     if (en->context == NULL) {
457         status = U_MEMORY_ALLOCATION_ERROR;
458         DELETE_ARRAY(en);
459         return 0;
460     }
461     uprv_memset(en->context, 0, sizeof(Context));
462     ((Context*)en->context)->all = true;
463     return en;
464 }
465 
getDetectableCharsets(UErrorCode & status) const466 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
467 {
468     if(U_FAILURE(status)) {
469         return 0;
470     }
471 
472     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
473     if (en == NULL) {
474         status = U_MEMORY_ALLOCATION_ERROR;
475         return 0;
476     }
477     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
478     en->context = (void*)NEW_ARRAY(Context, 1);
479     if (en->context == NULL) {
480         status = U_MEMORY_ALLOCATION_ERROR;
481         DELETE_ARRAY(en);
482         return 0;
483     }
484     uprv_memset(en->context, 0, sizeof(Context));
485     ((Context*)en->context)->all = false;
486     ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
487     return en;
488 }
489 
490 U_NAMESPACE_END
491 
492 #endif
493