1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "unicode/ucsdet.h"
13
14 #include "csdetect.h"
15 #include "csmatch.h"
16 #include "uenumimp.h"
17
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "umutex.h"
21 #include "ucln_in.h"
22 #include "uarrsort.h"
23 #include "inputext.h"
24 #include "csrsbcs.h"
25 #include "csrmbcs.h"
26 #include "csrutf8.h"
27 #include "csrucode.h"
28 #include "csr2022.h"
29
// Element count of a real array object (must not be used on a pointer).
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw, uninitialized storage for `count` objects of `type`, obtained from
// uprv_malloc(); no constructors run. Pair with DELETE_ARRAY.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Returns storage obtained through NEW_ARRAY to uprv_free(); no destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35 U_CDECL_BEGIN
36 static icu::CharsetRecognizer **fCSRecognizers = NULL;
37
38 static int32_t fCSRecognizers_size = 0;
39
csdet_cleanup(void)40 static UBool U_CALLCONV csdet_cleanup(void)
41 {
42 if (fCSRecognizers != NULL) {
43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44 delete fCSRecognizers[r];
45 fCSRecognizers[r] = NULL;
46 }
47
48 DELETE_ARRAY(fCSRecognizers);
49 fCSRecognizers = NULL;
50 fCSRecognizers_size = 0;
51 }
52
53 return TRUE;
54 }
55
56 static int32_t U_CALLCONV
charsetMatchComparator(const void *,const void * left,const void * right)57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
58 {
59 U_NAMESPACE_USE
60
61 const CharsetMatch **csm_l = (const CharsetMatch **) left;
62 const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66 }
67
68 U_CDECL_END
69
70 U_NAMESPACE_BEGIN
71
setRecognizers(UErrorCode & status)72 void CharsetDetector::setRecognizers(UErrorCode &status)
73 {
74 UBool needsInit;
75 CharsetRecognizer **recognizers;
76
77 if (U_FAILURE(status)) {
78 return;
79 }
80
81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
82
83 if (needsInit) {
84 CharsetRecognizer *tempArray[] = {
85 new CharsetRecog_UTF8(),
86
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
91
92 new CharsetRecog_8859_1(),
93 new CharsetRecog_8859_2(),
94 new CharsetRecog_8859_5_ru(),
95 new CharsetRecog_8859_6_ar(),
96 new CharsetRecog_8859_7_el(),
97 new CharsetRecog_8859_8_I_he(),
98 new CharsetRecog_8859_8_he(),
99 new CharsetRecog_windows_1251(),
100 new CharsetRecog_windows_1256(),
101 new CharsetRecog_KOI8_R(),
102 new CharsetRecog_8859_9_tr(),
103 new CharsetRecog_sjis(),
104 new CharsetRecog_gb_18030(),
105 new CharsetRecog_euc_jp(),
106 new CharsetRecog_euc_kr(),
107 new CharsetRecog_big5(),
108
109 new CharsetRecog_2022JP(),
110 new CharsetRecog_2022KR(),
111 new CharsetRecog_2022CN(),
112
113 new CharsetRecog_IBM424_he_rtl(),
114 new CharsetRecog_IBM424_he_ltr(),
115 new CharsetRecog_IBM420_ar_rtl(),
116 new CharsetRecog_IBM420_ar_ltr()
117 };
118 int32_t rCount = ARRAY_SIZE(tempArray);
119 int32_t r;
120
121 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
122
123 if (recognizers == NULL) {
124 status = U_MEMORY_ALLOCATION_ERROR;
125 return;
126 } else {
127 for (r = 0; r < rCount; r += 1) {
128 recognizers[r] = tempArray[r];
129
130 if (recognizers[r] == NULL) {
131 status = U_MEMORY_ALLOCATION_ERROR;
132 break;
133 }
134 }
135 }
136
137 if (U_SUCCESS(status)) {
138 umtx_lock(NULL);
139 if (fCSRecognizers == NULL) {
140 fCSRecognizers_size = rCount;
141 fCSRecognizers = recognizers;
142 }
143 umtx_unlock(NULL);
144 }
145
146 if (fCSRecognizers != recognizers) {
147 for (r = 0; r < rCount; r += 1) {
148 delete recognizers[r];
149 recognizers[r] = NULL;
150 }
151
152 DELETE_ARRAY(recognizers);
153 }
154
155 recognizers = NULL;
156 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
157 }
158 }
159
CharsetDetector(UErrorCode & status)160 CharsetDetector::CharsetDetector(UErrorCode &status)
161 : textIn(new InputText(status)), resultArray(NULL),
162 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
163 {
164 if (U_FAILURE(status)) {
165 return;
166 }
167
168 setRecognizers(status);
169
170 if (U_FAILURE(status)) {
171 return;
172 }
173
174 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
175
176 if (resultArray == NULL) {
177 status = U_MEMORY_ALLOCATION_ERROR;
178 return;
179 }
180
181 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
182 resultArray[i] = new CharsetMatch();
183
184 if (resultArray[i] == NULL) {
185 status = U_MEMORY_ALLOCATION_ERROR;
186 break;
187 }
188 }
189 }
190
~CharsetDetector()191 CharsetDetector::~CharsetDetector()
192 {
193 delete textIn;
194
195 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
196 delete resultArray[i];
197 }
198
199 uprv_free(resultArray);
200 }
201
setText(const char * in,int32_t len)202 void CharsetDetector::setText(const char *in, int32_t len)
203 {
204 textIn->setText(in, len);
205 fFreshTextSet = TRUE;
206 }
207
setStripTagsFlag(UBool flag)208 UBool CharsetDetector::setStripTagsFlag(UBool flag)
209 {
210 UBool temp = fStripTags;
211 fStripTags = flag;
212 fFreshTextSet = TRUE;
213 return temp;
214 }
215
getStripTagsFlag() const216 UBool CharsetDetector::getStripTagsFlag() const
217 {
218 return fStripTags;
219 }
220
setDeclaredEncoding(const char * encoding,int32_t len) const221 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
222 {
223 textIn->setDeclaredEncoding(encoding,len);
224 }
225
getDetectableCount()226 int32_t CharsetDetector::getDetectableCount()
227 {
228 UErrorCode status = U_ZERO_ERROR;
229
230 setRecognizers(status);
231
232 return fCSRecognizers_size;
233 }
234
detect(UErrorCode & status)235 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
236 {
237 int32_t maxMatchesFound = 0;
238
239 detectAll(maxMatchesFound, status);
240
241 if(maxMatchesFound > 0) {
242 return resultArray[0];
243 } else {
244 return NULL;
245 }
246 }
247
detectAll(int32_t & maxMatchesFound,UErrorCode & status)248 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
249 {
250 if(!textIn->isSet()) {
251 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
252
253 return NULL;
254 } else if (fFreshTextSet) {
255 CharsetRecognizer *csr;
256 int32_t i;
257
258 textIn->MungeInput(fStripTags);
259
260 // Iterate over all possible charsets, remember all that
261 // give a match quality > 0.
262 resultCount = 0;
263 for (i = 0; i < fCSRecognizers_size; i += 1) {
264 csr = fCSRecognizers[i];
265 if (csr->match(textIn, resultArray[resultCount])) {
266 resultCount++;
267 }
268 }
269
270 if (resultCount > 1) {
271 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
272 }
273 fFreshTextSet = FALSE;
274 }
275
276 maxMatchesFound = resultCount;
277
278 return resultArray;
279 }
280
281 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
282 {
283 if( index > fCSRecognizers_size-1 || index < 0) {
284 status = U_INDEX_OUTOFBOUNDS_ERROR;
285
286 return 0;
287 } else {
288 return fCSRecognizers[index]->getName();
289 }
290 }*/
291
292 U_NAMESPACE_END
293
294 U_CDECL_BEGIN
295 typedef struct {
296 int32_t currIndex;
297 } Context;
298
299
300
301 static void U_CALLCONV
enumClose(UEnumeration * en)302 enumClose(UEnumeration *en) {
303 if(en->context != NULL) {
304 DELETE_ARRAY(en->context);
305 }
306
307 DELETE_ARRAY(en);
308 }
309
310 static int32_t U_CALLCONV
enumCount(UEnumeration *,UErrorCode *)311 enumCount(UEnumeration *, UErrorCode *) {
312 return fCSRecognizers_size;
313 }
314
315 static const char* U_CALLCONV
enumNext(UEnumeration * en,int32_t * resultLength,UErrorCode *)316 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
317 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
318 if(resultLength != NULL) {
319 *resultLength = 0;
320 }
321 return NULL;
322 }
323 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
324 if(resultLength != NULL) {
325 *resultLength = (int32_t)uprv_strlen(currName);
326 }
327 ((Context *)en->context)->currIndex++;
328
329 return currName;
330 }
331
332 static void U_CALLCONV
enumReset(UEnumeration * en,UErrorCode *)333 enumReset(UEnumeration *en, UErrorCode *) {
334 ((Context *)en->context)->currIndex = 0;
335 }
336
337 static const UEnumeration gCSDetEnumeration = {
338 NULL,
339 NULL,
340 enumClose,
341 enumCount,
342 uenum_unextDefault,
343 enumNext,
344 enumReset
345 };
346
347 U_CAPI UEnumeration * U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector *,UErrorCode * status)348 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
349 {
350 U_NAMESPACE_USE
351
352 if(U_FAILURE(*status)) {
353 return 0;
354 }
355
356 /* Initialize recognized charsets. */
357 CharsetDetector::getDetectableCount();
358
359 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
360 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
361 en->context = (void*)NEW_ARRAY(Context, 1);
362 uprv_memset(en->context, 0, sizeof(Context));
363 return en;
364 }
365 U_CDECL_END
366
367 #endif
368
369