• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2008, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_CONVERSION
11 
12 #include "csrmbcs.h"
13 
14 #include <math.h>
15 
16 U_NAMESPACE_BEGIN
17 
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
19 
20 #define min(x,y) (((x)<(y))?(x):(y))
21 
22 static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
32 
33 static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
47 
48 static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
62 
63 static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
77 
78 static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
92 
/**
 * Look up a value in a table sorted in ascending order.
 *
 * @param array  the table to search; must be sorted ascending
 * @param len    number of entries in the table
 * @param value  the value to look for
 * @return the index of an entry equal to value, or -1 if there is none
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    // [low, high] is the inclusive range that may still hold the value.
    while (low <= high) {
        const int32_t  probe     = (low + high) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            low = probe + 1;
        } else {
            high = probe - 1;
        }
    }

    return -1;
}
114 
IteratedChar()115 IteratedChar::IteratedChar() :
116 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
117 {
118     // nothing else to do.
119 }
120 
121 /*void IteratedChar::reset()
122 {
123     charValue = 0;
124     index     = -1;
125     nextIndex = 0;
126     error     = FALSE;
127     done      = FALSE;
128 }*/
129 
nextByte(InputText * det)130 int32_t IteratedChar::nextByte(InputText *det)
131 {
132     if (nextIndex >= det->fRawLength) {
133         done = TRUE;
134 
135         return -1;
136     }
137 
138     return det->fRawInput[nextIndex++];
139 }
140 
// Destructor.  The body is intentionally empty: this definition performs
// no cleanup of its own.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
    // nothing to do.
}
145 
match_mbcs(InputText * det,const uint16_t commonChars[],int32_t commonCharsLen)146 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
147     int32_t singleByteCharCount = 0;
148     int32_t doubleByteCharCount = 0;
149     int32_t commonCharCount     = 0;
150     int32_t badCharCount        = 0;
151     int32_t totalCharCount      = 0;
152     int32_t confidence          = 0;
153     IteratedChar iter;
154 
155     while (nextChar(&iter, det)) {
156         totalCharCount++;
157 
158         if (iter.error) {
159             badCharCount++;
160         } else {
161             if (iter.charValue <= 0xFF) {
162                 singleByteCharCount++;
163             } else {
164                 doubleByteCharCount++;
165 
166                 if (commonChars != 0) {
167                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
168                         commonCharCount += 1;
169                     }
170                 }
171             }
172         }
173 
174 
175         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
176             // Bail out early if the byte data is not matching the encoding scheme.
177             // break detectBlock;
178             return confidence;
179         }
180     }
181 
182     if (doubleByteCharCount <= 10 && badCharCount == 0) {
183         // Not many multi-byte chars.
184         if (doubleByteCharCount == 0 && totalCharCount < 10) {
185             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
186             // We don't have enough data to have any confidence.
187             // Statistical analysis of single byte non-ASCII charcters would probably help here.
188             confidence = 0;
189         }
190         else {
191             //   ASCII or ISO file?  It's probably not our encoding,
192             //   but is not incompatible with our encoding, so don't give it a zero.
193             confidence = 10;
194         }
195 
196         return confidence;
197     }
198 
199     //
200     //  No match if there are too many characters that don't fit the encoding scheme.
201     //    (should we have zero tolerance for these?)
202     //
203     if (doubleByteCharCount < 20*badCharCount) {
204         confidence = 0;
205 
206         return confidence;
207     }
208 
209     if (commonChars == 0) {
210         // We have no statistics on frequently occuring characters.
211         //  Assess confidence purely on having a reasonable number of
212         //  multi-byte characters (the more the better)
213         confidence = 30 + doubleByteCharCount - 20*badCharCount;
214 
215         if (confidence > 100) {
216             confidence = 100;
217         }
218     } else {
219         //
220         // Frequency of occurence statistics exist.
221         //
222 
223         double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/
224         double scaleFactor = 90.0 / maxVal;
225         confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
226 
227         confidence = min(confidence, 100);
228     }
229 
230     if (confidence < 0) {
231         confidence = 0;
232     }
233 
234     return confidence;
235 }
236 
// Destructor.  The body is intentionally empty: this definition performs
// no cleanup of its own.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
    // nothing to do
}
241 
nextChar(IteratedChar * it,InputText * det)242 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
243     it->index = it->nextIndex;
244     it->error = FALSE;
245 
246     int32_t firstByte = it->charValue = it->nextByte(det);
247 
248     if (firstByte < 0) {
249         return FALSE;
250     }
251 
252     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
253         return TRUE;
254     }
255 
256     int32_t secondByte = it->nextByte(det);
257     if (secondByte >= 0) {
258         it->charValue = (firstByte << 8) | secondByte;
259     }
260     // else we'll handle the error later.
261 
262     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
263         // Illegal second byte value.
264         it->error = TRUE;
265     }
266 
267     return TRUE;
268 }
269 
match(InputText * det)270 int32_t CharsetRecog_sjis::match(InputText* det)
271 {
272     return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
273 }
274 
// Returns the charset name reported by this recognizer, "Shift_JIS".
const char *CharsetRecog_sjis::getName() const
{
    return "Shift_JIS";
}
279 
// Returns the language code reported by this recognizer, "ja".
const char *CharsetRecog_sjis::getLanguage() const
{
    return "ja";
}
284 
// Destructor.  The body is intentionally empty: this definition performs
// no cleanup of its own.
CharsetRecog_euc::~CharsetRecog_euc()
{
    // nothing to do
}
289 
nextChar(IteratedChar * it,InputText * det)290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
291     int32_t firstByte  = 0;
292     int32_t secondByte = 0;
293     int32_t thirdByte  = 0;
294 
295     it->index = it->nextIndex;
296     it->error = FALSE;
297     firstByte = it->charValue = it->nextByte(det);
298 
299     if (firstByte < 0) {
300         // Ran off the end of the input data
301         return FALSE;
302     }
303 
304     if (firstByte <= 0x8D) {
305         // single byte char
306         return TRUE;
307     }
308 
309     secondByte = it->nextByte(det);
310     if (secondByte >= 0) {
311         it->charValue = (it->charValue << 8) | secondByte;
312     }
313     // else we'll handle the error later.
314 
315     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
316         // Two byte Char
317         if (secondByte < 0xA1) {
318             it->error = TRUE;
319         }
320 
321         return TRUE;
322     }
323 
324     if (firstByte == 0x8E) {
325         // Code Set 2.
326         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328         // We don't know which we've got.
329         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
330         //   bytes will look like a well formed 2 byte char.
331         if (secondByte < 0xA1) {
332             it->error = TRUE;
333         }
334 
335         return TRUE;
336     }
337 
338     if (firstByte == 0x8F) {
339         // Code set 3.
340         // Three byte total char size, two bytes of actual char value.
341         thirdByte    = it->nextByte(det);
342         it->charValue = (it->charValue << 8) | thirdByte;
343 
344         if (thirdByte < 0xa1) {
345             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
346             it->error = TRUE;
347         }
348     }
349 
350     return TRUE;
351 
352 }
353 
// Destructor.  The body is intentionally empty: this definition performs
// no cleanup of its own.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
    // nothing to do
}
358 
// Returns the charset name reported by this recognizer, "EUC-JP".
const char *CharsetRecog_euc_jp::getName() const
{
    return "EUC-JP";
}
363 
// Returns the language code reported by this recognizer, "ja".
const char *CharsetRecog_euc_jp::getLanguage() const
{
    return "ja";
}
368 
match(InputText * det)369 int32_t CharsetRecog_euc_jp::match(InputText *det)
370 {
371     return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
372 }
373 
// Destructor.  The body is intentionally empty: this definition performs
// no cleanup of its own.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
    // nothing to do
}
378 
// Returns the charset name reported by this recognizer, "EUC-KR".
const char *CharsetRecog_euc_kr::getName() const
{
    return "EUC-KR";
}
383 
// Returns the language code reported by this recognizer, "ko".
const char *CharsetRecog_euc_kr::getLanguage() const
{
    return "ko";
}
388 
match(InputText * det)389 int32_t CharsetRecog_euc_kr::match(InputText *det)
390 {
391     return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
392 }
393 
// Destructor.  The body is intentionally empty: this definition performs
// no cleanup of its own.
CharsetRecog_big5::~CharsetRecog_big5()
{
    // nothing to do
}
398 
nextChar(IteratedChar * it,InputText * det)399 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
400 {
401     int32_t firstByte;
402 
403     it->index = it->nextIndex;
404     it->error = FALSE;
405     firstByte = it->charValue = it->nextByte(det);
406 
407     if (firstByte < 0) {
408         return FALSE;
409     }
410 
411     if (firstByte <= 0x7F || firstByte == 0xFF) {
412         // single byte character.
413         return TRUE;
414     }
415 
416     int32_t secondByte = it->nextByte(det);
417     if (secondByte >= 0)  {
418         it->charValue = (it->charValue << 8) | secondByte;
419     }
420     // else we'll handle the error later.
421 
422     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
423         it->error = TRUE;
424     }
425 
426     return TRUE;
427 }
428 
// Returns the charset name reported by this recognizer, "Big5".
const char *CharsetRecog_big5::getName() const
{
    return "Big5";
}
433 
// Returns the language code reported by this recognizer, "zh".
const char *CharsetRecog_big5::getLanguage() const
{
    return "zh";
}
438 
match(InputText * det)439 int32_t CharsetRecog_big5::match(InputText *det)
440 {
441     return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
442 }
443 
// Destructor.  The body is intentionally empty: this definition performs
// no cleanup of its own.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
    // nothing to do
}
448 
nextChar(IteratedChar * it,InputText * det)449 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
450     int32_t firstByte  = 0;
451     int32_t secondByte = 0;
452     int32_t thirdByte  = 0;
453     int32_t fourthByte = 0;
454 
455     it->index = it->nextIndex;
456     it->error = FALSE;
457     firstByte = it->charValue = it->nextByte(det);
458 
459     if (firstByte < 0) {
460         // Ran off the end of the input data
461         return FALSE;
462     }
463 
464     if (firstByte <= 0x80) {
465         // single byte char
466         return TRUE;
467     }
468 
469     secondByte = it->nextByte(det);
470     if (secondByte >= 0) {
471         it->charValue = (it->charValue << 8) | secondByte;
472     }
473     // else we'll handle the error later.
474 
475     if (firstByte >= 0x81 && firstByte <= 0xFE) {
476         // Two byte Char
477         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
478             return TRUE;
479         }
480 
481         // Four byte char
482         if (secondByte >= 0x30 && secondByte <= 0x39) {
483             thirdByte = it->nextByte(det);
484 
485             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
486                 fourthByte = it->nextByte(det);
487 
488                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
489                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
490 
491                     return TRUE;
492                 }
493             }
494         }
495 
496         // Something wasn't valid, or we ran out of data (-1).
497         it->error = TRUE;
498     }
499 
500     return TRUE;
501 }
502 
// Returns the charset name reported by this recognizer, "GB18030".
const char *CharsetRecog_gb_18030::getName() const
{
    return "GB18030";
}
507 
// Returns the language code reported by this recognizer, "zh".
const char *CharsetRecog_gb_18030::getLanguage() const
{
    return "zh";
}
512 
match(InputText * det)513 int32_t CharsetRecog_gb_18030::match(InputText *det)
514 {
515     return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
516 }
517 
518 U_NAMESPACE_END
519 #endif
520