• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2012, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_CONVERSION
11 
12 #include "unicode/ucsdet.h"
13 
14 #include "csdetect.h"
15 #include "csmatch.h"
16 #include "uenumimp.h"
17 
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "umutex.h"
21 #include "ucln_in.h"
22 #include "uarrsort.h"
23 #include "inputext.h"
24 #include "csrsbcs.h"
25 #include "csrmbcs.h"
26 #include "csrutf8.h"
27 #include "csrucode.h"
28 #include "csr2022.h"
29 
/* Number of elements in a true array (not a pointer). The argument and the
 * whole expansion are parenthesized so the macro is safe inside larger
 * expressions and with non-trivial argument expressions. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocate uninitialized storage for `count` objects of `type` through
 * uprv_malloc. No constructors run; the result must be checked for NULL
 * by the caller. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Release storage obtained from NEW_ARRAY. No destructors run. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34 
35 U_CDECL_BEGIN
36 static icu::CharsetRecognizer **fCSRecognizers = NULL;
37 
38 static int32_t fCSRecognizers_size = 0;
39 
csdet_cleanup(void)40 static UBool U_CALLCONV csdet_cleanup(void)
41 {
42     if (fCSRecognizers != NULL) {
43         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44             delete fCSRecognizers[r];
45             fCSRecognizers[r] = NULL;
46         }
47 
48         DELETE_ARRAY(fCSRecognizers);
49         fCSRecognizers = NULL;
50         fCSRecognizers_size = 0;
51     }
52 
53     return TRUE;
54 }
55 
56 static int32_t U_CALLCONV
charsetMatchComparator(const void *,const void * left,const void * right)57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58 {
59     U_NAMESPACE_USE
60 
61     const CharsetMatch **csm_l = (const CharsetMatch **) left;
62     const CharsetMatch **csm_r = (const CharsetMatch **) right;
63 
64     // NOTE: compare is backwards to sort from highest to lowest.
65     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66 }
67 
68 U_CDECL_END
69 
70 U_NAMESPACE_BEGIN
71 
setRecognizers(UErrorCode & status)72 void CharsetDetector::setRecognizers(UErrorCode &status)
73 {
74     UBool needsInit;
75     CharsetRecognizer **recognizers;
76 
77     if (U_FAILURE(status)) {
78         return;
79     }
80 
81     UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82 
83     if (needsInit) {
84         CharsetRecognizer *tempArray[] = {
85             new CharsetRecog_UTF8(),
86 
87             new CharsetRecog_UTF_16_BE(),
88             new CharsetRecog_UTF_16_LE(),
89             new CharsetRecog_UTF_32_BE(),
90             new CharsetRecog_UTF_32_LE(),
91 
92             new CharsetRecog_8859_1(),
93             new CharsetRecog_8859_2(),
94             new CharsetRecog_8859_5_ru(),
95             new CharsetRecog_8859_6_ar(),
96             new CharsetRecog_8859_7_el(),
97             new CharsetRecog_8859_8_I_he(),
98             new CharsetRecog_8859_8_he(),
99             new CharsetRecog_windows_1251(),
100             new CharsetRecog_windows_1256(),
101             new CharsetRecog_KOI8_R(),
102             new CharsetRecog_8859_9_tr(),
103             new CharsetRecog_sjis(),
104             new CharsetRecog_gb_18030(),
105             new CharsetRecog_euc_jp(),
106             new CharsetRecog_euc_kr(),
107             new CharsetRecog_big5(),
108 
109             new CharsetRecog_2022JP(),
110             new CharsetRecog_2022KR(),
111             new CharsetRecog_2022CN(),
112 
113             new CharsetRecog_IBM424_he_rtl(),
114             new CharsetRecog_IBM424_he_ltr(),
115             new CharsetRecog_IBM420_ar_rtl(),
116             new CharsetRecog_IBM420_ar_ltr()
117         };
118         int32_t rCount = ARRAY_SIZE(tempArray);
119         int32_t r;
120 
121         recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
122 
123         if (recognizers == NULL) {
124             status = U_MEMORY_ALLOCATION_ERROR;
125             return;
126         } else {
127             for (r = 0; r < rCount; r += 1) {
128                 recognizers[r] = tempArray[r];
129 
130                 if (recognizers[r] == NULL) {
131                     status = U_MEMORY_ALLOCATION_ERROR;
132                     break;
133                 }
134             }
135         }
136 
137         if (U_SUCCESS(status)) {
138             umtx_lock(NULL);
139             if (fCSRecognizers == NULL) {
140                 fCSRecognizers_size = rCount;
141                 fCSRecognizers = recognizers;
142             }
143             umtx_unlock(NULL);
144         }
145 
146         if (fCSRecognizers != recognizers) {
147             for (r = 0; r < rCount; r += 1) {
148                 delete recognizers[r];
149                 recognizers[r] = NULL;
150             }
151 
152             DELETE_ARRAY(recognizers);
153         }
154 
155         recognizers = NULL;
156         ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
157     }
158 }
159 
CharsetDetector(UErrorCode & status)160 CharsetDetector::CharsetDetector(UErrorCode &status)
161   : textIn(new InputText(status)), resultArray(NULL),
162     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
163 {
164     if (U_FAILURE(status)) {
165         return;
166     }
167 
168     setRecognizers(status);
169 
170     if (U_FAILURE(status)) {
171         return;
172     }
173 
174     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
175 
176     if (resultArray == NULL) {
177         status = U_MEMORY_ALLOCATION_ERROR;
178         return;
179     }
180 
181     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
182         resultArray[i] = new CharsetMatch();
183 
184         if (resultArray[i] == NULL) {
185             status = U_MEMORY_ALLOCATION_ERROR;
186             break;
187         }
188     }
189 }
190 
~CharsetDetector()191 CharsetDetector::~CharsetDetector()
192 {
193     delete textIn;
194 
195     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
196         delete resultArray[i];
197     }
198 
199     uprv_free(resultArray);
200 }
201 
setText(const char * in,int32_t len)202 void CharsetDetector::setText(const char *in, int32_t len)
203 {
204     textIn->setText(in, len);
205     fFreshTextSet = TRUE;
206 }
207 
setStripTagsFlag(UBool flag)208 UBool CharsetDetector::setStripTagsFlag(UBool flag)
209 {
210     UBool temp = fStripTags;
211     fStripTags = flag;
212     fFreshTextSet = TRUE;
213     return temp;
214 }
215 
getStripTagsFlag() const216 UBool CharsetDetector::getStripTagsFlag() const
217 {
218     return fStripTags;
219 }
220 
setDeclaredEncoding(const char * encoding,int32_t len) const221 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
222 {
223     textIn->setDeclaredEncoding(encoding,len);
224 }
225 
getDetectableCount()226 int32_t CharsetDetector::getDetectableCount()
227 {
228     UErrorCode status = U_ZERO_ERROR;
229 
230     setRecognizers(status);
231 
232     return fCSRecognizers_size;
233 }
234 
detect(UErrorCode & status)235 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
236 {
237     int32_t maxMatchesFound = 0;
238 
239     detectAll(maxMatchesFound, status);
240 
241     if(maxMatchesFound > 0) {
242         return resultArray[0];
243     } else {
244         return NULL;
245     }
246 }
247 
detectAll(int32_t & maxMatchesFound,UErrorCode & status)248 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
249 {
250     if(!textIn->isSet()) {
251         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
252 
253         return NULL;
254     } else if (fFreshTextSet) {
255         CharsetRecognizer *csr;
256         int32_t            i;
257 
258         textIn->MungeInput(fStripTags);
259 
260         // Iterate over all possible charsets, remember all that
261         // give a match quality > 0.
262         resultCount = 0;
263         for (i = 0; i < fCSRecognizers_size; i += 1) {
264             csr = fCSRecognizers[i];
265             if (csr->match(textIn, resultArray[resultCount])) {
266                 resultCount++;
267             }
268         }
269 
270         if (resultCount > 1) {
271             uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
272         }
273         fFreshTextSet = FALSE;
274     }
275 
276     maxMatchesFound = resultCount;
277 
278     return resultArray;
279 }
280 
281 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
282 {
283     if( index > fCSRecognizers_size-1 || index < 0) {
284         status = U_INDEX_OUTOFBOUNDS_ERROR;
285 
286         return 0;
287     } else {
288         return fCSRecognizers[index]->getName();
289     }
290 }*/
291 
292 U_NAMESPACE_END
293 
294 U_CDECL_BEGIN
/* Per-enumeration state stored in UEnumeration::context. */
typedef struct {
    /* Index into fCSRecognizers of the next name enumNext() will return. */
    int32_t currIndex;
} Context;
298 
299 
300 
301 static void U_CALLCONV
enumClose(UEnumeration * en)302 enumClose(UEnumeration *en) {
303     if(en->context != NULL) {
304         DELETE_ARRAY(en->context);
305     }
306 
307     DELETE_ARRAY(en);
308 }
309 
310 static int32_t U_CALLCONV
enumCount(UEnumeration *,UErrorCode *)311 enumCount(UEnumeration *, UErrorCode *) {
312     return fCSRecognizers_size;
313 }
314 
315 static const char* U_CALLCONV
enumNext(UEnumeration * en,int32_t * resultLength,UErrorCode *)316 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
317     if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
318         if(resultLength != NULL) {
319             *resultLength = 0;
320         }
321         return NULL;
322     }
323     const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
324     if(resultLength != NULL) {
325         *resultLength = (int32_t)uprv_strlen(currName);
326     }
327     ((Context *)en->context)->currIndex++;
328 
329     return currName;
330 }
331 
332 static void U_CALLCONV
enumReset(UEnumeration * en,UErrorCode *)333 enumReset(UEnumeration *en, UErrorCode *) {
334     ((Context *)en->context)->currIndex = 0;
335 }
336 
337 static const UEnumeration gCSDetEnumeration = {
338     NULL,
339     NULL,
340     enumClose,
341     enumCount,
342     uenum_unextDefault,
343     enumNext,
344     enumReset
345 };
346 
347 U_CAPI  UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *,UErrorCode * status)348 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
349 {
350     U_NAMESPACE_USE
351 
352     if(U_FAILURE(*status)) {
353         return 0;
354     }
355 
356     /* Initialize recognized charsets. */
357     CharsetDetector::getDetectableCount();
358 
359     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
360     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
361     en->context = (void*)NEW_ARRAY(Context, 1);
362     uprv_memset(en->context, 0, sizeof(Context));
363     return en;
364 }
365 U_CDECL_END
366 
367 #endif
368 
369