• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2009, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_CONVERSION
11 
12 #include "unicode/ucsdet.h"
13 
14 #include "csdetect.h"
15 #include "csmatch.h"
16 #include "uenumimp.h"
17 
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "umutex.h"
21 #include "ucln_in.h"
22 #include "uarrsort.h"
23 #include "inputext.h"
24 #include "csrsbcs.h"
25 #include "csrmbcs.h"
26 #include "csrutf8.h"
27 #include "csrucode.h"
28 #include "csr2022.h"
29 
/* Element count of a true array (not a pointer). Arguments are fully
 * parenthesized so that any array-valued expression may be passed safely. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Raw storage for `count` objects of `type`, obtained from uprv_malloc.
 * No constructors are run; the result must be checked by the caller.
 * The expansion is parenthesized so it composes safely inside larger expressions. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Returns storage obtained through NEW_ARRAY to uprv_free. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34 
35 U_CDECL_BEGIN
36 static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;
37 
38 static int32_t fCSRecognizers_size = 0;
39 
csdet_cleanup(void)40 static UBool U_CALLCONV csdet_cleanup(void)
41 {
42     if (fCSRecognizers != NULL) {
43         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44             delete fCSRecognizers[r];
45             fCSRecognizers[r] = NULL;
46         }
47 
48         DELETE_ARRAY(fCSRecognizers);
49         fCSRecognizers = NULL;
50         fCSRecognizers_size = 0;
51     }
52 
53     return TRUE;
54 }
55 
56 static int32_t U_CALLCONV
charsetMatchComparator(const void *,const void * left,const void * right)57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58 {
59     U_NAMESPACE_USE
60 
61     const CharsetMatch **csm_l = (const CharsetMatch **) left;
62     const CharsetMatch **csm_r = (const CharsetMatch **) right;
63 
64     // NOTE: compare is backwards to sort from highest to lowest.
65     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66 }
67 
68 U_CDECL_END
69 
70 U_NAMESPACE_BEGIN
71 
setRecognizers(UErrorCode & status)72 void CharsetDetector::setRecognizers(UErrorCode &status)
73 {
74     UBool needsInit;
75     CharsetRecognizer **recognizers;
76 
77     if (U_FAILURE(status)) {
78         return;
79     }
80 
81     UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82 
83     if (needsInit) {
84         CharsetRecognizer *tempArray[] = {
85             new CharsetRecog_UTF8(),
86 
87             new CharsetRecog_UTF_16_BE(),
88             new CharsetRecog_UTF_16_LE(),
89             new CharsetRecog_UTF_32_BE(),
90             new CharsetRecog_UTF_32_LE(),
91 
92             new CharsetRecog_8859_1_en(),
93             new CharsetRecog_8859_1_da(),
94             new CharsetRecog_8859_1_de(),
95             new CharsetRecog_8859_1_es(),
96             new CharsetRecog_8859_1_fr(),
97             new CharsetRecog_8859_1_it(),
98             new CharsetRecog_8859_1_nl(),
99             new CharsetRecog_8859_1_no(),
100             new CharsetRecog_8859_1_pt(),
101             new CharsetRecog_8859_1_sv(),
102             new CharsetRecog_8859_2_cs(),
103             new CharsetRecog_8859_2_hu(),
104             new CharsetRecog_8859_2_pl(),
105             new CharsetRecog_8859_2_ro(),
106             new CharsetRecog_8859_5_ru(),
107             new CharsetRecog_8859_6_ar(),
108             new CharsetRecog_8859_7_el(),
109             new CharsetRecog_8859_8_I_he(),
110             new CharsetRecog_8859_8_he(),
111             new CharsetRecog_windows_1251(),
112             new CharsetRecog_windows_1256(),
113             new CharsetRecog_KOI8_R(),
114             new CharsetRecog_8859_9_tr(),
115             new CharsetRecog_sjis(),
116             new CharsetRecog_gb_18030(),
117             new CharsetRecog_euc_jp(),
118             new CharsetRecog_euc_kr(),
119             new CharsetRecog_big5(),
120 
121             new CharsetRecog_2022JP(),
122             new CharsetRecog_2022KR(),
123             new CharsetRecog_2022CN(),
124 
125             new CharsetRecog_IBM424_he_rtl(),
126             new CharsetRecog_IBM424_he_ltr(),
127             new CharsetRecog_IBM420_ar_rtl(),
128             new CharsetRecog_IBM420_ar_ltr()
129         };
130         int32_t rCount = ARRAY_SIZE(tempArray);
131         int32_t r;
132 
133         recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
134 
135         if (recognizers == NULL) {
136             status = U_MEMORY_ALLOCATION_ERROR;
137             return;
138         } else {
139             for (r = 0; r < rCount; r += 1) {
140                 recognizers[r] = tempArray[r];
141 
142                 if (recognizers[r] == NULL) {
143                     status = U_MEMORY_ALLOCATION_ERROR;
144                     break;
145                 }
146             }
147         }
148 
149         if (U_SUCCESS(status)) {
150             umtx_lock(NULL);
151             if (fCSRecognizers == NULL) {
152                 fCSRecognizers_size = rCount;
153                 fCSRecognizers = recognizers;
154             }
155             umtx_unlock(NULL);
156         }
157 
158         if (fCSRecognizers != recognizers) {
159             for (r = 0; r < rCount; r += 1) {
160                 delete recognizers[r];
161                 recognizers[r] = NULL;
162             }
163 
164             DELETE_ARRAY(recognizers);
165         }
166 
167         recognizers = NULL;
168         ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
169     }
170 }
171 
CharsetDetector(UErrorCode & status)172 CharsetDetector::CharsetDetector(UErrorCode &status)
173   : textIn(new InputText(status)), resultArray(NULL),
174     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
175 {
176     if (U_FAILURE(status)) {
177         return;
178     }
179 
180     setRecognizers(status);
181 
182     if (U_FAILURE(status)) {
183         return;
184     }
185 
186     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
187 
188     if (resultArray == NULL) {
189         status = U_MEMORY_ALLOCATION_ERROR;
190         return;
191     }
192 
193     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
194         resultArray[i] = new CharsetMatch();
195 
196         if (resultArray[i] == NULL) {
197             status = U_MEMORY_ALLOCATION_ERROR;
198             break;
199         }
200     }
201 }
202 
~CharsetDetector()203 CharsetDetector::~CharsetDetector()
204 {
205     delete textIn;
206 
207     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
208         delete resultArray[i];
209     }
210 
211     uprv_free(resultArray);
212 }
213 
setText(const char * in,int32_t len)214 void CharsetDetector::setText(const char *in, int32_t len)
215 {
216     textIn->setText(in, len);
217     fFreshTextSet = TRUE;
218 }
219 
setStripTagsFlag(UBool flag)220 UBool CharsetDetector::setStripTagsFlag(UBool flag)
221 {
222     UBool temp = fStripTags;
223     fStripTags = flag;
224     fFreshTextSet = TRUE;
225     return temp;
226 }
227 
getStripTagsFlag() const228 UBool CharsetDetector::getStripTagsFlag() const
229 {
230     return fStripTags;
231 }
232 
setDeclaredEncoding(const char * encoding,int32_t len) const233 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
234 {
235     textIn->setDeclaredEncoding(encoding,len);
236 }
237 
getDetectableCount()238 int32_t CharsetDetector::getDetectableCount()
239 {
240     UErrorCode status = U_ZERO_ERROR;
241 
242     setRecognizers(status);
243 
244     return fCSRecognizers_size;
245 }
246 
detect(UErrorCode & status)247 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
248 {
249     int32_t maxMatchesFound = 0;
250 
251     detectAll(maxMatchesFound, status);
252 
253     if(maxMatchesFound > 0) {
254         return resultArray[0];
255     } else {
256         return NULL;
257     }
258 }
259 
detectAll(int32_t & maxMatchesFound,UErrorCode & status)260 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
261 {
262     if(!textIn->isSet()) {
263         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
264 
265         return NULL;
266     } else if(fFreshTextSet) {
267         CharsetRecognizer *csr;
268         int32_t            detectResults;
269         int32_t            confidence;
270         int32_t            i;
271 
272         textIn->MungeInput(fStripTags);
273 
274         // Iterate over all possible charsets, remember all that
275         // give a match quality > 0.
276         resultCount = 0;
277         for (i = 0; i < fCSRecognizers_size; i += 1) {
278             csr = fCSRecognizers[i];
279             detectResults = csr->match(textIn);
280             confidence = detectResults;
281 
282             if (confidence > 0)  {
283                 resultArray[resultCount++]->set(textIn, csr, confidence);
284             }
285         }
286 
287         for(i = resultCount; i < fCSRecognizers_size; i += 1) {
288             resultArray[i]->set(textIn, 0, 0);
289         }
290 
291         uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
292 
293         // Remove duplicate charsets from the results.
294         // Simple minded, brute force approach - check each entry against all that follow.
295         // The first entry of any duplicated set is the one that should be kept because it will
296         // be the one with the highest confidence rating.
297         //   (Duplicate matches have different languages, only the charset is the same)
298         // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
299         // deleted, just reordered, with the unwanted duplicates placed after the good results.
300         int32_t j, k;
301         for (i=0; i<resultCount; i++) {
302             const char *charSetName = resultArray[i]->getName();
303             for (j=i+1; j<resultCount; ) {
304                 if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
305                     // Not a duplicate.
306                     j++;
307                 } else {
308                     // Duplicate entry at index j.
309                     CharsetMatch *duplicate = resultArray[j];
310                     for (k=j; k<resultCount-1; k++) {
311                         resultArray[k] = resultArray[k+1];
312                     }
313                     resultCount--;
314                     resultArray[resultCount] = duplicate;
315                 }
316             }
317         }
318 
319         fFreshTextSet = FALSE;
320     }
321 
322     maxMatchesFound = resultCount;
323 
324     return resultArray;
325 }
326 
327 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
328 {
329     if( index > fCSRecognizers_size-1 || index < 0) {
330         status = U_INDEX_OUTOFBOUNDS_ERROR;
331 
332         return 0;
333     } else {
334         return fCSRecognizers[index]->getName();
335     }
336 }*/
337 
338 U_NAMESPACE_END
339 
340 U_CDECL_BEGIN
341 typedef struct {
342     int32_t currIndex;
343 } Context;
344 
345 
346 
347 static void U_CALLCONV
enumClose(UEnumeration * en)348 enumClose(UEnumeration *en) {
349     if(en->context != NULL) {
350         DELETE_ARRAY(en->context);
351     }
352 
353     DELETE_ARRAY(en);
354 }
355 
356 static int32_t U_CALLCONV
enumCount(UEnumeration *,UErrorCode *)357 enumCount(UEnumeration *, UErrorCode *) {
358     return fCSRecognizers_size;
359 }
360 
361 static const char* U_CALLCONV
enumNext(UEnumeration * en,int32_t * resultLength,UErrorCode *)362 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
363     if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
364         if(resultLength != NULL) {
365             *resultLength = 0;
366         }
367         return NULL;
368     }
369     const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
370     if(resultLength != NULL) {
371         *resultLength = (int32_t)uprv_strlen(currName);
372     }
373     ((Context *)en->context)->currIndex++;
374 
375     return currName;
376 }
377 
378 static void U_CALLCONV
enumReset(UEnumeration * en,UErrorCode *)379 enumReset(UEnumeration *en, UErrorCode *) {
380     ((Context *)en->context)->currIndex = 0;
381 }
382 
383 static const UEnumeration gCSDetEnumeration = {
384     NULL,
385     NULL,
386     enumClose,
387     enumCount,
388     uenum_unextDefault,
389     enumNext,
390     enumReset
391 };
392 
393 U_CAPI  UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *,UErrorCode * status)394 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
395 {
396     U_NAMESPACE_USE
397 
398     if(U_FAILURE(*status)) {
399         return 0;
400     }
401 
402     /* Initialize recognized charsets. */
403     CharsetDetector::getDetectableCount();
404 
405     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
406     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
407     en->context = (void*)NEW_ARRAY(Context, 1);
408     uprv_memset(en->context, 0, sizeof(Context));
409     return en;
410 }
411 U_CDECL_END
412 
413 #endif
414 
415