• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_CONVERSION
13 
14 #include "cmemory.h"
15 #include "csmatch.h"
16 #include "csrmbcs.h"
17 
18 #include <math.h>
19 
20 U_NAMESPACE_BEGIN
21 
// Yields the smaller of x and y.  This is a function-like macro: whichever
// argument is selected is evaluated a second time, so do not pass
// expressions with side effects.
22 #define min(x,y) (((x)<(y))?(x):(y))
23 
// Table of frequently occurring two-byte values passed to match_mbcs() by
// CharsetRecog_sjis::match().  Entries are listed in ascending order, which
// the binarySearch() lookup in match_mbcs() relies on.
24 static const uint16_t commonChars_sjis [] = {
25 // TODO:  This set of data comes from the character frequency-
26 //        of-occurrence analysis tool.  The data needs to be moved
27 //        into a resource and loaded from there.
28 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34 
// Table of frequently occurring two-byte values passed to match_mbcs() by
// CharsetRecog_euc_jp::match().  Entries are listed in ascending order, which
// the binarySearch() lookup in match_mbcs() relies on.
35 static const uint16_t commonChars_euc_jp[] = {
36 // TODO:  This set of data comes from the character frequency-
37 //        of-occurrence analysis tool.  The data needs to be moved
38 //        into a resource and loaded from there.
39 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49 
// Table of frequently occurring two-byte values passed to match_mbcs() by
// CharsetRecog_euc_kr::match().  Entries are listed in ascending order, which
// the binarySearch() lookup in match_mbcs() relies on.
50 static const uint16_t commonChars_euc_kr[] = {
51 // TODO:  This set of data comes from the character frequency-
52 //        of-occurrence analysis tool.  The data needs to be moved
53 //        into a resource and loaded from there.
54 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64 
// Table of frequently occurring two-byte values passed to match_mbcs() by
// CharsetRecog_big5::match().  Entries are listed in ascending order, which
// the binarySearch() lookup in match_mbcs() relies on.
65 static const uint16_t commonChars_big5[] = {
66 // TODO:  This set of data comes from the character frequency-
67 //        of-occurrence analysis tool.  The data needs to be moved
68 //        into a resource and loaded from there.
69 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79 
// Table of frequently occurring two-byte values passed to match_mbcs() by
// CharsetRecog_gb_18030::match().  Entries are listed in ascending order,
// which the binarySearch() lookup in match_mbcs() relies on.
80 static const uint16_t commonChars_gb_18030[] = {
81 // TODO:  This set of data comes from the character frequency-
82 //        of-occurrence analysis tool.  The data needs to be moved
83 //        into a resource and loaded from there.
84 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94 
// Looks up `value` in `array`, which must be sorted in ascending order.
//
//   array  table to search; no element is read when len <= 0
//   len    number of elements in array
//   value  value to look for
//
// Returns the index of an element equal to value, or -1 if there is none.
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t low  = 0;
    int32_t high = len - 1;

    while (low <= high) {
        // low + (high - low) / 2 yields the same midpoint as (low + high) / 2
        // but cannot overflow int32_t for very large tables.
        const int32_t mid = low + (high - low) / 2;
        const uint16_t probe = array[mid];

        if (probe == value) {
            return mid;
        }

        if (probe < value) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return -1;
}
116 
IteratedChar()117 IteratedChar::IteratedChar() :
118 charValue(0), index(-1), nextIndex(0), error(false), done(false)
119 {
120     // nothing else to do.
121 }
122 
123 /*void IteratedChar::reset()
124 {
125     charValue = 0;
126     index     = -1;
127     nextIndex = 0;
128     error     = false;
129     done      = false;
130 }*/
131 
nextByte(InputText * det)132 int32_t IteratedChar::nextByte(InputText *det)
133 {
134     if (nextIndex >= det->fRawLength) {
135         done = true;
136 
137         return -1;
138     }
139 
140     return det->fRawInput[nextIndex++];
141 }
142 
~CharsetRecog_mbcs()143 CharsetRecog_mbcs::~CharsetRecog_mbcs()
144 {
145     // nothing to do.
146 }
147 
match_mbcs(InputText * det,const uint16_t commonChars[],int32_t commonCharsLen) const148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149     int32_t doubleByteCharCount = 0;
150     int32_t commonCharCount     = 0;
151     int32_t badCharCount        = 0;
152     int32_t totalCharCount      = 0;
153     int32_t confidence          = 0;
154     IteratedChar iter;
155 
156     while (nextChar(&iter, det)) {
157         totalCharCount++;
158 
159         if (iter.error) {
160             badCharCount++;
161         } else {
162             if (iter.charValue > 0xFF) {
163                 doubleByteCharCount++;
164 
165                 if (commonChars != 0) {
166                     if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
167                         commonCharCount += 1;
168                     }
169                 }
170             }
171         }
172 
173 
174         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
175             // Bail out early if the byte data is not matching the encoding scheme.
176             // break detectBlock;
177             return confidence;
178         }
179     }
180 
181     if (doubleByteCharCount <= 10 && badCharCount == 0) {
182         // Not many multi-byte chars.
183         if (doubleByteCharCount == 0 && totalCharCount < 10) {
184             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
185             // We don't have enough data to have any confidence.
186             // Statistical analysis of single byte non-ASCII characters would probably help here.
187             confidence = 0;
188         }
189         else {
190             //   ASCII or ISO file?  It's probably not our encoding,
191             //   but is not incompatible with our encoding, so don't give it a zero.
192             confidence = 10;
193         }
194 
195         return confidence;
196     }
197 
198     //
199     //  No match if there are too many characters that don't fit the encoding scheme.
200     //    (should we have zero tolerance for these?)
201     //
202     if (doubleByteCharCount < 20*badCharCount) {
203         confidence = 0;
204 
205         return confidence;
206     }
207 
208     if (commonChars == 0) {
209         // We have no statistics on frequently occurring characters.
210         //  Assess confidence purely on having a reasonable number of
211         //  multi-byte characters (the more the better)
212         confidence = 30 + doubleByteCharCount - 20*badCharCount;
213 
214         if (confidence > 100) {
215             confidence = 100;
216         }
217     } else {
218         //
219         // Frequency of occurrence statistics exist.
220         //
221 
222         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
223         double scaleFactor = 90.0 / maxVal;
224         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
225 
226         confidence = min(confidence, 100);
227     }
228 
229     if (confidence < 0) {
230         confidence = 0;
231     }
232 
233     return confidence;
234 }
235 
~CharsetRecog_sjis()236 CharsetRecog_sjis::~CharsetRecog_sjis()
237 {
238     // nothing to do
239 }
240 
nextChar(IteratedChar * it,InputText * det) const241 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
242     it->index = it->nextIndex;
243     it->error = false;
244 
245     int32_t firstByte = it->charValue = it->nextByte(det);
246 
247     if (firstByte < 0) {
248         return false;
249     }
250 
251     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
252         return true;
253     }
254 
255     int32_t secondByte = it->nextByte(det);
256     if (secondByte >= 0) {
257         it->charValue = (firstByte << 8) | secondByte;
258     }
259     // else we'll handle the error later.
260 
261     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
262         // Illegal second byte value.
263         it->error = true;
264     }
265 
266     return true;
267 }
268 
match(InputText * det,CharsetMatch * results) const269 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
270     int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
271     results->set(det, this, confidence);
272     return (confidence > 0);
273 }
274 
getName() const275 const char *CharsetRecog_sjis::getName() const
276 {
277     return "Shift_JIS";
278 }
279 
getLanguage() const280 const char *CharsetRecog_sjis::getLanguage() const
281 {
282     return "ja";
283 }
284 
~CharsetRecog_euc()285 CharsetRecog_euc::~CharsetRecog_euc()
286 {
287     // nothing to do
288 }
289 
nextChar(IteratedChar * it,InputText * det) const290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
291     int32_t firstByte  = 0;
292     int32_t secondByte = 0;
293     int32_t thirdByte  = 0;
294 
295     it->index = it->nextIndex;
296     it->error = false;
297     firstByte = it->charValue = it->nextByte(det);
298 
299     if (firstByte < 0) {
300         // Ran off the end of the input data
301         return false;
302     }
303 
304     if (firstByte <= 0x8D) {
305         // single byte char
306         return true;
307     }
308 
309     secondByte = it->nextByte(det);
310     if (secondByte >= 0) {
311         it->charValue = (it->charValue << 8) | secondByte;
312     }
313     // else we'll handle the error later.
314 
315     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
316         // Two byte Char
317         if (secondByte < 0xA1) {
318             it->error = true;
319         }
320 
321         return true;
322     }
323 
324     if (firstByte == 0x8E) {
325         // Code Set 2.
326         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328         // We don't know which we've got.
329         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
330         //   bytes will look like a well formed 2 byte char.
331         if (secondByte < 0xA1) {
332             it->error = true;
333         }
334 
335         return true;
336     }
337 
338     if (firstByte == 0x8F) {
339         // Code set 3.
340         // Three byte total char size, two bytes of actual char value.
341         thirdByte    = it->nextByte(det);
342         it->charValue = (it->charValue << 8) | thirdByte;
343 
344         if (thirdByte < 0xa1) {
345             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
346             it->error = true;
347         }
348     }
349 
350     return true;
351 
352 }
353 
~CharsetRecog_euc_jp()354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
355 {
356     // nothing to do
357 }
358 
getName() const359 const char *CharsetRecog_euc_jp::getName() const
360 {
361     return "EUC-JP";
362 }
363 
getLanguage() const364 const char *CharsetRecog_euc_jp::getLanguage() const
365 {
366     return "ja";
367 }
368 
match(InputText * det,CharsetMatch * results) const369 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
370 {
371     int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
372     results->set(det, this, confidence);
373     return (confidence > 0);
374 }
375 
~CharsetRecog_euc_kr()376 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
377 {
378     // nothing to do
379 }
380 
getName() const381 const char *CharsetRecog_euc_kr::getName() const
382 {
383     return "EUC-KR";
384 }
385 
getLanguage() const386 const char *CharsetRecog_euc_kr::getLanguage() const
387 {
388     return "ko";
389 }
390 
match(InputText * det,CharsetMatch * results) const391 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
392 {
393     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
394     results->set(det, this, confidence);
395     return (confidence > 0);
396 }
397 
~CharsetRecog_big5()398 CharsetRecog_big5::~CharsetRecog_big5()
399 {
400     // nothing to do
401 }
402 
nextChar(IteratedChar * it,InputText * det) const403 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
404 {
405     int32_t firstByte;
406 
407     it->index = it->nextIndex;
408     it->error = false;
409     firstByte = it->charValue = it->nextByte(det);
410 
411     if (firstByte < 0) {
412         return false;
413     }
414 
415     if (firstByte <= 0x7F || firstByte == 0xFF) {
416         // single byte character.
417         return true;
418     }
419 
420     int32_t secondByte = it->nextByte(det);
421     if (secondByte >= 0)  {
422         it->charValue = (it->charValue << 8) | secondByte;
423     }
424     // else we'll handle the error later.
425 
426     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
427         it->error = true;
428     }
429 
430     return true;
431 }
432 
getName() const433 const char *CharsetRecog_big5::getName() const
434 {
435     return "Big5";
436 }
437 
getLanguage() const438 const char *CharsetRecog_big5::getLanguage() const
439 {
440     return "zh";
441 }
442 
match(InputText * det,CharsetMatch * results) const443 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
444 {
445     int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
446     results->set(det, this, confidence);
447     return (confidence > 0);
448 }
449 
~CharsetRecog_gb_18030()450 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
451 {
452     // nothing to do
453 }
454 
nextChar(IteratedChar * it,InputText * det) const455 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
456     int32_t firstByte  = 0;
457     int32_t secondByte = 0;
458     int32_t thirdByte  = 0;
459     int32_t fourthByte = 0;
460 
461     it->index = it->nextIndex;
462     it->error = false;
463     firstByte = it->charValue = it->nextByte(det);
464 
465     if (firstByte < 0) {
466         // Ran off the end of the input data
467         return false;
468     }
469 
470     if (firstByte <= 0x80) {
471         // single byte char
472         return true;
473     }
474 
475     secondByte = it->nextByte(det);
476     if (secondByte >= 0) {
477         it->charValue = (it->charValue << 8) | secondByte;
478     }
479     // else we'll handle the error later.
480 
481     if (firstByte >= 0x81 && firstByte <= 0xFE) {
482         // Two byte Char
483         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
484             return true;
485         }
486 
487         // Four byte char
488         if (secondByte >= 0x30 && secondByte <= 0x39) {
489             thirdByte = it->nextByte(det);
490 
491             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
492                 fourthByte = it->nextByte(det);
493 
494                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
495                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
496 
497                     return true;
498                 }
499             }
500         }
501 
502         // Something wasn't valid, or we ran out of data (-1).
503         it->error = true;
504     }
505 
506     return true;
507 }
508 
getName() const509 const char *CharsetRecog_gb_18030::getName() const
510 {
511     return "GB18030";
512 }
513 
getLanguage() const514 const char *CharsetRecog_gb_18030::getLanguage() const
515 {
516     return "zh";
517 }
518 
match(InputText * det,CharsetMatch * results) const519 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
520 {
521     int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
522     results->set(det, this, confidence);
523     return (confidence > 0);
524 }
525 
526 U_NAMESPACE_END
527 #endif
528