1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "csmatch.h"
13 #include "csrmbcs.h"
14
15 #include <math.h>
16
17 U_NAMESPACE_BEGIN
18
19 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
20
21 #define min(x,y) (((x)<(y))?(x):(y))
22
// Frequently occurring two-byte Shift_JIS codes, used by match_mbcs() to
// raise confidence. Entries must stay in ascending order because they are
// looked up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_sjis[] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
};
33
// Frequently occurring two-byte EUC-JP codes, used by match_mbcs() to raise
// confidence. Entries must stay in ascending order because they are looked
// up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
};
48
// Frequently occurring two-byte EUC-KR codes, used by match_mbcs() to raise
// confidence. Entries must stay in ascending order because they are looked
// up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
};
63
// Frequently occurring two-byte Big5 codes, used by match_mbcs() to raise
// confidence. Entries must stay in ascending order because they are looked
// up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
};
78
// Frequently occurring two-byte GB18030 codes, used by match_mbcs() to raise
// confidence. Entries must stay in ascending order because they are looked
// up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
};
93
// Looks up value in the first len entries of array, which must be sorted in
// ascending order. Returns the index of a matching entry, or -1 when the
// value is not present (including when len is 0).
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    while (low <= high) {
        const int32_t  probe     = (low + high) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        // Discard the half that cannot contain the value.
        if (candidate < value) {
            low = probe + 1;
        } else {
            high = probe - 1;
        }
    }

    return -1;
}
115
IteratedChar()116 IteratedChar::IteratedChar() :
117 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
118 {
119 // nothing else to do.
120 }
121
122 /*void IteratedChar::reset()
123 {
124 charValue = 0;
125 index = -1;
126 nextIndex = 0;
127 error = FALSE;
128 done = FALSE;
129 }*/
130
nextByte(InputText * det)131 int32_t IteratedChar::nextByte(InputText *det)
132 {
133 if (nextIndex >= det->fRawLength) {
134 done = TRUE;
135
136 return -1;
137 }
138
139 return det->fRawInput[nextIndex++];
140 }
141
~CharsetRecog_mbcs()142 CharsetRecog_mbcs::~CharsetRecog_mbcs()
143 {
144 // nothing to do.
145 }
146
match_mbcs(InputText * det,const uint16_t commonChars[],int32_t commonCharsLen) const147 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
148 int32_t singleByteCharCount = 0;
149 int32_t doubleByteCharCount = 0;
150 int32_t commonCharCount = 0;
151 int32_t badCharCount = 0;
152 int32_t totalCharCount = 0;
153 int32_t confidence = 0;
154 IteratedChar iter;
155
156 while (nextChar(&iter, det)) {
157 totalCharCount++;
158
159 if (iter.error) {
160 badCharCount++;
161 } else {
162 if (iter.charValue <= 0xFF) {
163 singleByteCharCount++;
164 } else {
165 doubleByteCharCount++;
166
167 if (commonChars != 0) {
168 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
169 commonCharCount += 1;
170 }
171 }
172 }
173 }
174
175
176 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
177 // Bail out early if the byte data is not matching the encoding scheme.
178 // break detectBlock;
179 return confidence;
180 }
181 }
182
183 if (doubleByteCharCount <= 10 && badCharCount == 0) {
184 // Not many multi-byte chars.
185 if (doubleByteCharCount == 0 && totalCharCount < 10) {
186 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
187 // We don't have enough data to have any confidence.
188 // Statistical analysis of single byte non-ASCII charcters would probably help here.
189 confidence = 0;
190 }
191 else {
192 // ASCII or ISO file? It's probably not our encoding,
193 // but is not incompatible with our encoding, so don't give it a zero.
194 confidence = 10;
195 }
196
197 return confidence;
198 }
199
200 //
201 // No match if there are too many characters that don't fit the encoding scheme.
202 // (should we have zero tolerance for these?)
203 //
204 if (doubleByteCharCount < 20*badCharCount) {
205 confidence = 0;
206
207 return confidence;
208 }
209
210 if (commonChars == 0) {
211 // We have no statistics on frequently occuring characters.
212 // Assess confidence purely on having a reasonable number of
213 // multi-byte characters (the more the better)
214 confidence = 30 + doubleByteCharCount - 20*badCharCount;
215
216 if (confidence > 100) {
217 confidence = 100;
218 }
219 } else {
220 //
221 // Frequency of occurence statistics exist.
222 //
223
224 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
225 double scaleFactor = 90.0 / maxVal;
226 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
227
228 confidence = min(confidence, 100);
229 }
230
231 if (confidence < 0) {
232 confidence = 0;
233 }
234
235 return confidence;
236 }
237
~CharsetRecog_sjis()238 CharsetRecog_sjis::~CharsetRecog_sjis()
239 {
240 // nothing to do
241 }
242
nextChar(IteratedChar * it,InputText * det) const243 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
244 it->index = it->nextIndex;
245 it->error = FALSE;
246
247 int32_t firstByte = it->charValue = it->nextByte(det);
248
249 if (firstByte < 0) {
250 return FALSE;
251 }
252
253 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
254 return TRUE;
255 }
256
257 int32_t secondByte = it->nextByte(det);
258 if (secondByte >= 0) {
259 it->charValue = (firstByte << 8) | secondByte;
260 }
261 // else we'll handle the error later.
262
263 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
264 // Illegal second byte value.
265 it->error = TRUE;
266 }
267
268 return TRUE;
269 }
270
match(InputText * det,CharsetMatch * results) const271 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
272 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
273 results->set(det, this, confidence);
274 return (confidence > 0);
275 }
276
getName() const277 const char *CharsetRecog_sjis::getName() const
278 {
279 return "Shift_JIS";
280 }
281
getLanguage() const282 const char *CharsetRecog_sjis::getLanguage() const
283 {
284 return "ja";
285 }
286
~CharsetRecog_euc()287 CharsetRecog_euc::~CharsetRecog_euc()
288 {
289 // nothing to do
290 }
291
nextChar(IteratedChar * it,InputText * det) const292 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
293 int32_t firstByte = 0;
294 int32_t secondByte = 0;
295 int32_t thirdByte = 0;
296
297 it->index = it->nextIndex;
298 it->error = FALSE;
299 firstByte = it->charValue = it->nextByte(det);
300
301 if (firstByte < 0) {
302 // Ran off the end of the input data
303 return FALSE;
304 }
305
306 if (firstByte <= 0x8D) {
307 // single byte char
308 return TRUE;
309 }
310
311 secondByte = it->nextByte(det);
312 if (secondByte >= 0) {
313 it->charValue = (it->charValue << 8) | secondByte;
314 }
315 // else we'll handle the error later.
316
317 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
318 // Two byte Char
319 if (secondByte < 0xA1) {
320 it->error = TRUE;
321 }
322
323 return TRUE;
324 }
325
326 if (firstByte == 0x8E) {
327 // Code Set 2.
328 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
329 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
330 // We don't know which we've got.
331 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
332 // bytes will look like a well formed 2 byte char.
333 if (secondByte < 0xA1) {
334 it->error = TRUE;
335 }
336
337 return TRUE;
338 }
339
340 if (firstByte == 0x8F) {
341 // Code set 3.
342 // Three byte total char size, two bytes of actual char value.
343 thirdByte = it->nextByte(det);
344 it->charValue = (it->charValue << 8) | thirdByte;
345
346 if (thirdByte < 0xa1) {
347 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
348 it->error = TRUE;
349 }
350 }
351
352 return TRUE;
353
354 }
355
~CharsetRecog_euc_jp()356 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
357 {
358 // nothing to do
359 }
360
getName() const361 const char *CharsetRecog_euc_jp::getName() const
362 {
363 return "EUC-JP";
364 }
365
getLanguage() const366 const char *CharsetRecog_euc_jp::getLanguage() const
367 {
368 return "ja";
369 }
370
match(InputText * det,CharsetMatch * results) const371 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
372 {
373 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
374 results->set(det, this, confidence);
375 return (confidence > 0);
376 }
377
~CharsetRecog_euc_kr()378 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
379 {
380 // nothing to do
381 }
382
getName() const383 const char *CharsetRecog_euc_kr::getName() const
384 {
385 return "EUC-KR";
386 }
387
getLanguage() const388 const char *CharsetRecog_euc_kr::getLanguage() const
389 {
390 return "ko";
391 }
392
match(InputText * det,CharsetMatch * results) const393 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
394 {
395 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
396 results->set(det, this, confidence);
397 return (confidence > 0);
398 }
399
~CharsetRecog_big5()400 CharsetRecog_big5::~CharsetRecog_big5()
401 {
402 // nothing to do
403 }
404
nextChar(IteratedChar * it,InputText * det) const405 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
406 {
407 int32_t firstByte;
408
409 it->index = it->nextIndex;
410 it->error = FALSE;
411 firstByte = it->charValue = it->nextByte(det);
412
413 if (firstByte < 0) {
414 return FALSE;
415 }
416
417 if (firstByte <= 0x7F || firstByte == 0xFF) {
418 // single byte character.
419 return TRUE;
420 }
421
422 int32_t secondByte = it->nextByte(det);
423 if (secondByte >= 0) {
424 it->charValue = (it->charValue << 8) | secondByte;
425 }
426 // else we'll handle the error later.
427
428 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
429 it->error = TRUE;
430 }
431
432 return TRUE;
433 }
434
getName() const435 const char *CharsetRecog_big5::getName() const
436 {
437 return "Big5";
438 }
439
getLanguage() const440 const char *CharsetRecog_big5::getLanguage() const
441 {
442 return "zh";
443 }
444
match(InputText * det,CharsetMatch * results) const445 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
446 {
447 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
448 results->set(det, this, confidence);
449 return (confidence > 0);
450 }
451
~CharsetRecog_gb_18030()452 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
453 {
454 // nothing to do
455 }
456
nextChar(IteratedChar * it,InputText * det) const457 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
458 int32_t firstByte = 0;
459 int32_t secondByte = 0;
460 int32_t thirdByte = 0;
461 int32_t fourthByte = 0;
462
463 it->index = it->nextIndex;
464 it->error = FALSE;
465 firstByte = it->charValue = it->nextByte(det);
466
467 if (firstByte < 0) {
468 // Ran off the end of the input data
469 return FALSE;
470 }
471
472 if (firstByte <= 0x80) {
473 // single byte char
474 return TRUE;
475 }
476
477 secondByte = it->nextByte(det);
478 if (secondByte >= 0) {
479 it->charValue = (it->charValue << 8) | secondByte;
480 }
481 // else we'll handle the error later.
482
483 if (firstByte >= 0x81 && firstByte <= 0xFE) {
484 // Two byte Char
485 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
486 return TRUE;
487 }
488
489 // Four byte char
490 if (secondByte >= 0x30 && secondByte <= 0x39) {
491 thirdByte = it->nextByte(det);
492
493 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
494 fourthByte = it->nextByte(det);
495
496 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
497 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
498
499 return TRUE;
500 }
501 }
502 }
503
504 // Something wasn't valid, or we ran out of data (-1).
505 it->error = TRUE;
506 }
507
508 return TRUE;
509 }
510
getName() const511 const char *CharsetRecog_gb_18030::getName() const
512 {
513 return "GB18030";
514 }
515
getLanguage() const516 const char *CharsetRecog_gb_18030::getLanguage() const
517 {
518 return "zh";
519 }
520
match(InputText * det,CharsetMatch * results) const521 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
522 {
523 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
524 results->set(det, this, confidence);
525 return (confidence > 0);
526 }
527
528 U_NAMESPACE_END
529 #endif
530