1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "csrmbcs.h"
13
14 #include <math.h>
15
16 U_NAMESPACE_BEGIN
17
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19
20 #define min(x,y) (((x)<(y))?(x):(y))
21
// Frequently occurring double-byte character codes used by
// CharsetRecog_sjis::match(). The entries must stay sorted in ascending
// order because match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
32
// Frequently occurring double-byte character codes used by
// CharsetRecog_euc_jp::match(). The entries must stay sorted in ascending
// order because match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
47
// Frequently occurring double-byte character codes used by
// CharsetRecog_euc_kr::match(). The entries must stay sorted in ascending
// order because match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
62
// Frequently occurring double-byte character codes used by
// CharsetRecog_big5::match(). The entries must stay sorted in ascending
// order because match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
77
// Frequently occurring double-byte character codes used by
// CharsetRecog_gb_18030::match(). The entries must stay sorted in ascending
// order because match_mbcs() looks values up with binarySearch().
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
92
/**
 * Binary search for a 16-bit value in a table sorted in ascending order.
 *
 * @param array  table to search; must be sorted ascending.
 * @param len    number of entries in array; a value <= 0 yields -1.
 * @param value  value to look for.
 * @return index of an entry equal to value, or -1 if there is none.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t start = 0;
    int32_t end = len - 1;

    while (start <= end) {
        // Written as start + (end - start) / 2 so the midpoint cannot
        // overflow int32_t, unlike (start + end) / 2 for very large tables.
        int32_t mid = start + (end - start) / 2;

        if (array[mid] == value) {
            return mid;
        }

        if (array[mid] < value) {
            start = mid + 1;
        } else {
            end = mid - 1;
        }
    }

    return -1;
}
114
IteratedChar()115 IteratedChar::IteratedChar() :
116 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
117 {
118 // nothing else to do.
119 }
120
121 /*void IteratedChar::reset()
122 {
123 charValue = 0;
124 index = -1;
125 nextIndex = 0;
126 error = FALSE;
127 done = FALSE;
128 }*/
129
nextByte(InputText * det)130 int32_t IteratedChar::nextByte(InputText *det)
131 {
132 if (nextIndex >= det->fRawLength) {
133 done = TRUE;
134
135 return -1;
136 }
137
138 return det->fRawInput[nextIndex++];
139 }
140
~CharsetRecog_mbcs()141 CharsetRecog_mbcs::~CharsetRecog_mbcs()
142 {
143 // nothing to do.
144 }
145
match_mbcs(InputText * det,const uint16_t commonChars[],int32_t commonCharsLen)146 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
147 int32_t singleByteCharCount = 0;
148 int32_t doubleByteCharCount = 0;
149 int32_t commonCharCount = 0;
150 int32_t badCharCount = 0;
151 int32_t totalCharCount = 0;
152 int32_t confidence = 0;
153 IteratedChar iter;
154
155 while (nextChar(&iter, det)) {
156 totalCharCount++;
157
158 if (iter.error) {
159 badCharCount++;
160 } else {
161 if (iter.charValue <= 0xFF) {
162 singleByteCharCount++;
163 } else {
164 doubleByteCharCount++;
165
166 if (commonChars != 0) {
167 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
168 commonCharCount += 1;
169 }
170 }
171 }
172 }
173
174
175 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
176 // Bail out early if the byte data is not matching the encoding scheme.
177 // break detectBlock;
178 return confidence;
179 }
180 }
181
182 if (doubleByteCharCount <= 10 && badCharCount == 0) {
183 // Not many multi-byte chars.
184 if (doubleByteCharCount == 0 && totalCharCount < 10) {
185 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
186 // We don't have enough data to have any confidence.
187 // Statistical analysis of single byte non-ASCII charcters would probably help here.
188 confidence = 0;
189 }
190 else {
191 // ASCII or ISO file? It's probably not our encoding,
192 // but is not incompatible with our encoding, so don't give it a zero.
193 confidence = 10;
194 }
195
196 return confidence;
197 }
198
199 //
200 // No match if there are too many characters that don't fit the encoding scheme.
201 // (should we have zero tolerance for these?)
202 //
203 if (doubleByteCharCount < 20*badCharCount) {
204 confidence = 0;
205
206 return confidence;
207 }
208
209 if (commonChars == 0) {
210 // We have no statistics on frequently occuring characters.
211 // Assess confidence purely on having a reasonable number of
212 // multi-byte characters (the more the better)
213 confidence = 30 + doubleByteCharCount - 20*badCharCount;
214
215 if (confidence > 100) {
216 confidence = 100;
217 }
218 } else {
219 //
220 // Frequency of occurence statistics exist.
221 //
222
223 double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
224 double scaleFactor = 90.0 / maxVal;
225 confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
226
227 confidence = min(confidence, 100);
228 }
229
230 if (confidence < 0) {
231 confidence = 0;
232 }
233
234 return confidence;
235 }
236
~CharsetRecog_sjis()237 CharsetRecog_sjis::~CharsetRecog_sjis()
238 {
239 // nothing to do
240 }
241
nextChar(IteratedChar * it,InputText * det)242 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
243 it->index = it->nextIndex;
244 it->error = FALSE;
245
246 int32_t firstByte = it->charValue = it->nextByte(det);
247
248 if (firstByte < 0) {
249 return FALSE;
250 }
251
252 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
253 return TRUE;
254 }
255
256 int32_t secondByte = it->nextByte(det);
257 if (secondByte >= 0) {
258 it->charValue = (firstByte << 8) | secondByte;
259 }
260 // else we'll handle the error later.
261
262 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
263 // Illegal second byte value.
264 it->error = TRUE;
265 }
266
267 return TRUE;
268 }
269
match(InputText * det)270 int32_t CharsetRecog_sjis::match(InputText* det)
271 {
272 return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
273 }
274
getName() const275 const char *CharsetRecog_sjis::getName() const
276 {
277 return "Shift_JIS";
278 }
279
getLanguage() const280 const char *CharsetRecog_sjis::getLanguage() const
281 {
282 return "ja";
283 }
284
~CharsetRecog_euc()285 CharsetRecog_euc::~CharsetRecog_euc()
286 {
287 // nothing to do
288 }
289
nextChar(IteratedChar * it,InputText * det)290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
291 int32_t firstByte = 0;
292 int32_t secondByte = 0;
293 int32_t thirdByte = 0;
294
295 it->index = it->nextIndex;
296 it->error = FALSE;
297 firstByte = it->charValue = it->nextByte(det);
298
299 if (firstByte < 0) {
300 // Ran off the end of the input data
301 return FALSE;
302 }
303
304 if (firstByte <= 0x8D) {
305 // single byte char
306 return TRUE;
307 }
308
309 secondByte = it->nextByte(det);
310 if (secondByte >= 0) {
311 it->charValue = (it->charValue << 8) | secondByte;
312 }
313 // else we'll handle the error later.
314
315 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
316 // Two byte Char
317 if (secondByte < 0xA1) {
318 it->error = TRUE;
319 }
320
321 return TRUE;
322 }
323
324 if (firstByte == 0x8E) {
325 // Code Set 2.
326 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328 // We don't know which we've got.
329 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
330 // bytes will look like a well formed 2 byte char.
331 if (secondByte < 0xA1) {
332 it->error = TRUE;
333 }
334
335 return TRUE;
336 }
337
338 if (firstByte == 0x8F) {
339 // Code set 3.
340 // Three byte total char size, two bytes of actual char value.
341 thirdByte = it->nextByte(det);
342 it->charValue = (it->charValue << 8) | thirdByte;
343
344 if (thirdByte < 0xa1) {
345 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
346 it->error = TRUE;
347 }
348 }
349
350 return TRUE;
351
352 }
353
~CharsetRecog_euc_jp()354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
355 {
356 // nothing to do
357 }
358
getName() const359 const char *CharsetRecog_euc_jp::getName() const
360 {
361 return "EUC-JP";
362 }
363
getLanguage() const364 const char *CharsetRecog_euc_jp::getLanguage() const
365 {
366 return "ja";
367 }
368
match(InputText * det)369 int32_t CharsetRecog_euc_jp::match(InputText *det)
370 {
371 return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
372 }
373
~CharsetRecog_euc_kr()374 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
375 {
376 // nothing to do
377 }
378
getName() const379 const char *CharsetRecog_euc_kr::getName() const
380 {
381 return "EUC-KR";
382 }
383
getLanguage() const384 const char *CharsetRecog_euc_kr::getLanguage() const
385 {
386 return "ko";
387 }
388
match(InputText * det)389 int32_t CharsetRecog_euc_kr::match(InputText *det)
390 {
391 return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
392 }
393
~CharsetRecog_big5()394 CharsetRecog_big5::~CharsetRecog_big5()
395 {
396 // nothing to do
397 }
398
nextChar(IteratedChar * it,InputText * det)399 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
400 {
401 int32_t firstByte;
402
403 it->index = it->nextIndex;
404 it->error = FALSE;
405 firstByte = it->charValue = it->nextByte(det);
406
407 if (firstByte < 0) {
408 return FALSE;
409 }
410
411 if (firstByte <= 0x7F || firstByte == 0xFF) {
412 // single byte character.
413 return TRUE;
414 }
415
416 int32_t secondByte = it->nextByte(det);
417 if (secondByte >= 0) {
418 it->charValue = (it->charValue << 8) | secondByte;
419 }
420 // else we'll handle the error later.
421
422 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
423 it->error = TRUE;
424 }
425
426 return TRUE;
427 }
428
getName() const429 const char *CharsetRecog_big5::getName() const
430 {
431 return "Big5";
432 }
433
getLanguage() const434 const char *CharsetRecog_big5::getLanguage() const
435 {
436 return "zh";
437 }
438
match(InputText * det)439 int32_t CharsetRecog_big5::match(InputText *det)
440 {
441 return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
442 }
443
~CharsetRecog_gb_18030()444 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
445 {
446 // nothing to do
447 }
448
nextChar(IteratedChar * it,InputText * det)449 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
450 int32_t firstByte = 0;
451 int32_t secondByte = 0;
452 int32_t thirdByte = 0;
453 int32_t fourthByte = 0;
454
455 it->index = it->nextIndex;
456 it->error = FALSE;
457 firstByte = it->charValue = it->nextByte(det);
458
459 if (firstByte < 0) {
460 // Ran off the end of the input data
461 return FALSE;
462 }
463
464 if (firstByte <= 0x80) {
465 // single byte char
466 return TRUE;
467 }
468
469 secondByte = it->nextByte(det);
470 if (secondByte >= 0) {
471 it->charValue = (it->charValue << 8) | secondByte;
472 }
473 // else we'll handle the error later.
474
475 if (firstByte >= 0x81 && firstByte <= 0xFE) {
476 // Two byte Char
477 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
478 return TRUE;
479 }
480
481 // Four byte char
482 if (secondByte >= 0x30 && secondByte <= 0x39) {
483 thirdByte = it->nextByte(det);
484
485 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
486 fourthByte = it->nextByte(det);
487
488 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
489 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
490
491 return TRUE;
492 }
493 }
494 }
495
496 // Something wasn't valid, or we ran out of data (-1).
497 it->error = TRUE;
498 }
499
500 return TRUE;
501 }
502
getName() const503 const char *CharsetRecog_gb_18030::getName() const
504 {
505 return "GB18030";
506 }
507
getLanguage() const508 const char *CharsetRecog_gb_18030::getLanguage() const
509 {
510 return "zh";
511 }
512
match(InputText * det)513 int32_t CharsetRecog_gb_18030::match(InputText *det)
514 {
515 return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
516 }
517
518 U_NAMESPACE_END
519 #endif
520