1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "unicode/ucsdet.h"
13
14 #include "csdetect.h"
15 #include "csmatch.h"
16 #include "uenumimp.h"
17
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "umutex.h"
21 #include "ucln_in.h"
22 #include "uarrsort.h"
23 #include "inputext.h"
24 #include "csrsbcs.h"
25 #include "csrmbcs.h"
26 #include "csrutf8.h"
27 #include "csrucode.h"
28 #include "csr2022.h"
29
/* Number of elements in a true (non-decayed) array. */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

/* Allocate raw, uninitialized storage for `count` objects of `type` via uprv_malloc. */
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
/* Release storage obtained from NEW_ARRAY; no destructors are run. */
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35 U_CDECL_BEGIN
// Shared table of charset recognizers. It is NULL until
// CharsetDetector::setRecognizers() installs a table while holding the global
// mutex, and it is destroyed and reset to NULL by csdet_cleanup().
static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;

// Number of entries in fCSRecognizers; 0 while no table is installed.
static int32_t fCSRecognizers_size = 0;
39
csdet_cleanup(void)40 static UBool U_CALLCONV csdet_cleanup(void)
41 {
42 if (fCSRecognizers != NULL) {
43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44 delete fCSRecognizers[r];
45 fCSRecognizers[r] = NULL;
46 }
47
48 DELETE_ARRAY(fCSRecognizers);
49 fCSRecognizers = NULL;
50 fCSRecognizers_size = 0;
51 }
52
53 return TRUE;
54 }
55
56 static int32_t U_CALLCONV
charsetMatchComparator(const void *,const void * left,const void * right)57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58 {
59 U_NAMESPACE_USE
60
61 const CharsetMatch **csm_l = (const CharsetMatch **) left;
62 const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66 }
67
68 U_CDECL_END
69
70 U_NAMESPACE_BEGIN
71
setRecognizers(UErrorCode & status)72 void CharsetDetector::setRecognizers(UErrorCode &status)
73 {
74 UBool needsInit;
75 CharsetRecognizer **recognizers;
76
77 if (U_FAILURE(status)) {
78 return;
79 }
80
81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82
83 if (needsInit) {
84 CharsetRecognizer *tempArray[] = {
85 new CharsetRecog_UTF8(),
86
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
91
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
120
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
123 new CharsetRecog_2022CN(),
124
125 new CharsetRecog_IBM424_he_rtl(),
126 new CharsetRecog_IBM424_he_ltr(),
127 new CharsetRecog_IBM420_ar_rtl(),
128 new CharsetRecog_IBM420_ar_ltr()
129 };
130 int32_t rCount = ARRAY_SIZE(tempArray);
131 int32_t r;
132
133 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
134
135 if (recognizers == NULL) {
136 status = U_MEMORY_ALLOCATION_ERROR;
137 return;
138 } else {
139 for (r = 0; r < rCount; r += 1) {
140 recognizers[r] = tempArray[r];
141
142 if (recognizers[r] == NULL) {
143 status = U_MEMORY_ALLOCATION_ERROR;
144 break;
145 }
146 }
147 }
148
149 if (U_SUCCESS(status)) {
150 umtx_lock(NULL);
151 if (fCSRecognizers == NULL) {
152 fCSRecognizers_size = rCount;
153 fCSRecognizers = recognizers;
154 }
155 umtx_unlock(NULL);
156 }
157
158 if (fCSRecognizers != recognizers) {
159 for (r = 0; r < rCount; r += 1) {
160 delete recognizers[r];
161 recognizers[r] = NULL;
162 }
163
164 DELETE_ARRAY(recognizers);
165 }
166
167 recognizers = NULL;
168 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
169 }
170 }
171
CharsetDetector(UErrorCode & status)172 CharsetDetector::CharsetDetector(UErrorCode &status)
173 : textIn(new InputText(status)), resultArray(NULL),
174 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
175 {
176 if (U_FAILURE(status)) {
177 return;
178 }
179
180 setRecognizers(status);
181
182 if (U_FAILURE(status)) {
183 return;
184 }
185
186 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
187
188 if (resultArray == NULL) {
189 status = U_MEMORY_ALLOCATION_ERROR;
190 return;
191 }
192
193 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
194 resultArray[i] = new CharsetMatch();
195
196 if (resultArray[i] == NULL) {
197 status = U_MEMORY_ALLOCATION_ERROR;
198 break;
199 }
200 }
201 }
202
~CharsetDetector()203 CharsetDetector::~CharsetDetector()
204 {
205 delete textIn;
206
207 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
208 delete resultArray[i];
209 }
210
211 uprv_free(resultArray);
212 }
213
setText(const char * in,int32_t len)214 void CharsetDetector::setText(const char *in, int32_t len)
215 {
216 textIn->setText(in, len);
217 fFreshTextSet = TRUE;
218 }
219
setStripTagsFlag(UBool flag)220 UBool CharsetDetector::setStripTagsFlag(UBool flag)
221 {
222 UBool temp = fStripTags;
223 fStripTags = flag;
224 fFreshTextSet = TRUE;
225 return temp;
226 }
227
getStripTagsFlag() const228 UBool CharsetDetector::getStripTagsFlag() const
229 {
230 return fStripTags;
231 }
232
setDeclaredEncoding(const char * encoding,int32_t len) const233 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
234 {
235 textIn->setDeclaredEncoding(encoding,len);
236 }
237
getDetectableCount()238 int32_t CharsetDetector::getDetectableCount()
239 {
240 UErrorCode status = U_ZERO_ERROR;
241
242 setRecognizers(status);
243
244 return fCSRecognizers_size;
245 }
246
detect(UErrorCode & status)247 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
248 {
249 int32_t maxMatchesFound = 0;
250
251 detectAll(maxMatchesFound, status);
252
253 if(maxMatchesFound > 0) {
254 return resultArray[0];
255 } else {
256 return NULL;
257 }
258 }
259
detectAll(int32_t & maxMatchesFound,UErrorCode & status)260 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
261 {
262 if(!textIn->isSet()) {
263 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
264
265 return NULL;
266 } else if(fFreshTextSet) {
267 CharsetRecognizer *csr;
268 int32_t detectResults;
269 int32_t confidence;
270 int32_t i;
271
272 textIn->MungeInput(fStripTags);
273
274 // Iterate over all possible charsets, remember all that
275 // give a match quality > 0.
276 resultCount = 0;
277 for (i = 0; i < fCSRecognizers_size; i += 1) {
278 csr = fCSRecognizers[i];
279 detectResults = csr->match(textIn);
280 confidence = detectResults;
281
282 if (confidence > 0) {
283 resultArray[resultCount++]->set(textIn, csr, confidence);
284 }
285 }
286
287 for(i = resultCount; i < fCSRecognizers_size; i += 1) {
288 resultArray[i]->set(textIn, 0, 0);
289 }
290
291 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
292
293 // Remove duplicate charsets from the results.
294 // Simple minded, brute force approach - check each entry against all that follow.
295 // The first entry of any duplicated set is the one that should be kept because it will
296 // be the one with the highest confidence rating.
297 // (Duplicate matches have different languages, only the charset is the same)
298 // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
299 // deleted, just reordered, with the unwanted duplicates placed after the good results.
300 int32_t j, k;
301 for (i=0; i<resultCount; i++) {
302 const char *charSetName = resultArray[i]->getName();
303 for (j=i+1; j<resultCount; ) {
304 if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
305 // Not a duplicate.
306 j++;
307 } else {
308 // Duplicate entry at index j.
309 CharsetMatch *duplicate = resultArray[j];
310 for (k=j; k<resultCount-1; k++) {
311 resultArray[k] = resultArray[k+1];
312 }
313 resultCount--;
314 resultArray[resultCount] = duplicate;
315 }
316 }
317 }
318
319 fFreshTextSet = FALSE;
320 }
321
322 maxMatchesFound = resultCount;
323
324 return resultArray;
325 }
326
327 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
328 {
329 if( index > fCSRecognizers_size-1 || index < 0) {
330 status = U_INDEX_OUTOFBOUNDS_ERROR;
331
332 return 0;
333 } else {
334 return fCSRecognizers[index]->getName();
335 }
336 }*/
337
338 U_NAMESPACE_END
339
340 U_CDECL_BEGIN
/* Per-enumeration state: index of the next recognizer name to hand out. */
typedef struct Context {
    int32_t currIndex;
} Context;
344
345
346
347 static void U_CALLCONV
enumClose(UEnumeration * en)348 enumClose(UEnumeration *en) {
349 if(en->context != NULL) {
350 DELETE_ARRAY(en->context);
351 }
352
353 DELETE_ARRAY(en);
354 }
355
356 static int32_t U_CALLCONV
enumCount(UEnumeration *,UErrorCode *)357 enumCount(UEnumeration *, UErrorCode *) {
358 return fCSRecognizers_size;
359 }
360
361 static const char* U_CALLCONV
enumNext(UEnumeration * en,int32_t * resultLength,UErrorCode *)362 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
363 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
364 if(resultLength != NULL) {
365 *resultLength = 0;
366 }
367 return NULL;
368 }
369 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
370 if(resultLength != NULL) {
371 *resultLength = (int32_t)uprv_strlen(currName);
372 }
373 ((Context *)en->context)->currIndex++;
374
375 return currName;
376 }
377
378 static void U_CALLCONV
enumReset(UEnumeration * en,UErrorCode *)379 enumReset(UEnumeration *en, UErrorCode *) {
380 ((Context *)en->context)->currIndex = 0;
381 }
382
// Template enumeration object. ucsdet_getAllDetectableCharsets() copies it
// into a freshly allocated UEnumeration and then fills in `context`.
// NOTE(review): the slot meanings below are inferred from the callback names;
// confirm against the UEnumeration declaration in uenumimp.h.
static const UEnumeration gCSDetEnumeration = {
    NULL,               // presumably baseContext (unused here)
    NULL,               // presumably context; assigned per instance after the copy
    enumClose,
    enumCount,
    uenum_unextDefault,
    enumNext,
    enumReset
};
392
393 U_CAPI UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *,UErrorCode * status)394 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
395 {
396 U_NAMESPACE_USE
397
398 if(U_FAILURE(*status)) {
399 return 0;
400 }
401
402 /* Initialize recognized charsets. */
403 CharsetDetector::getDetectableCount();
404
405 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
406 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
407 en->context = (void*)NEW_ARRAY(Context, 1);
408 uprv_memset(en->context, 0, sizeof(Context));
409 return en;
410 }
411 U_CDECL_END
412
413 #endif
414
415