1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_CONVERSION
13
14 #include "cmemory.h"
15 #include "csmatch.h"
16 #include "csrmbcs.h"
17
18 #include <math.h>
19
20 U_NAMESPACE_BEGIN
21
22 #define min(x,y) (((x)<(y))?(x):(y))
23
// Double-byte character codes that this file treats as "common" for
// Shift_JIS input. The entries must stay in ascending order because
// match_mbcs() looks values up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_sjis[] = {
    0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
};
34
// Double-byte character codes that this file treats as "common" for
// EUC-JP input. The entries must stay in ascending order because
// match_mbcs() looks values up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_euc_jp[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
};
49
// Double-byte character codes that this file treats as "common" for
// EUC-KR input. The entries must stay in ascending order because
// match_mbcs() looks values up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_euc_kr[] = {
    0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
};
64
// Double-byte character codes that this file treats as "common" for
// Big5 input. The entries must stay in ascending order because
// match_mbcs() looks values up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_big5[] = {
    0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
};
79
// Double-byte character codes that this file treats as "common" for
// GB18030 input. The entries must stay in ascending order because
// match_mbcs() looks values up with binarySearch().
//
// TODO: This set of data comes from the character frequency-of-occurrence
//       analysis tool. The data needs to be moved into a resource and
//       loaded from there.
static const uint16_t commonChars_gb_18030[] = {
    0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
};
94
/**
 * Binary search over an ascending array of 16-bit values.
 *
 * @param array  the values to search; must be sorted in ascending order.
 * @param len    the number of entries in array.
 * @param value  the value to look for.
 * @return the index of an entry equal to value, or -1 if there is none.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    while (lo <= hi) {
        const int32_t probe = lo + (hi - lo) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}
116
IteratedChar()117 IteratedChar::IteratedChar() :
118 charValue(0), index(-1), nextIndex(0), error(false), done(false)
119 {
120 // nothing else to do.
121 }
122
123 /*void IteratedChar::reset()
124 {
125 charValue = 0;
126 index = -1;
127 nextIndex = 0;
128 error = false;
129 done = false;
130 }*/
131
nextByte(InputText * det)132 int32_t IteratedChar::nextByte(InputText *det)
133 {
134 if (nextIndex >= det->fRawLength) {
135 done = true;
136
137 return -1;
138 }
139
140 return det->fRawInput[nextIndex++];
141 }
142
~CharsetRecog_mbcs()143 CharsetRecog_mbcs::~CharsetRecog_mbcs()
144 {
145 // nothing to do.
146 }
147
match_mbcs(InputText * det,const uint16_t commonChars[],int32_t commonCharsLen) const148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149 int32_t doubleByteCharCount = 0;
150 int32_t commonCharCount = 0;
151 int32_t badCharCount = 0;
152 int32_t totalCharCount = 0;
153 int32_t confidence = 0;
154 IteratedChar iter;
155
156 while (nextChar(&iter, det)) {
157 totalCharCount++;
158
159 if (iter.error) {
160 badCharCount++;
161 } else {
162 if (iter.charValue > 0xFF) {
163 doubleByteCharCount++;
164
165 if (commonChars != 0) {
166 if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
167 commonCharCount += 1;
168 }
169 }
170 }
171 }
172
173
174 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
175 // Bail out early if the byte data is not matching the encoding scheme.
176 // break detectBlock;
177 return confidence;
178 }
179 }
180
181 if (doubleByteCharCount <= 10 && badCharCount == 0) {
182 // Not many multi-byte chars.
183 if (doubleByteCharCount == 0 && totalCharCount < 10) {
184 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
185 // We don't have enough data to have any confidence.
186 // Statistical analysis of single byte non-ASCII characters would probably help here.
187 confidence = 0;
188 }
189 else {
190 // ASCII or ISO file? It's probably not our encoding,
191 // but is not incompatible with our encoding, so don't give it a zero.
192 confidence = 10;
193 }
194
195 return confidence;
196 }
197
198 //
199 // No match if there are too many characters that don't fit the encoding scheme.
200 // (should we have zero tolerance for these?)
201 //
202 if (doubleByteCharCount < 20*badCharCount) {
203 confidence = 0;
204
205 return confidence;
206 }
207
208 if (commonChars == 0) {
209 // We have no statistics on frequently occurring characters.
210 // Assess confidence purely on having a reasonable number of
211 // multi-byte characters (the more the better)
212 confidence = 30 + doubleByteCharCount - 20*badCharCount;
213
214 if (confidence > 100) {
215 confidence = 100;
216 }
217 } else {
218 //
219 // Frequency of occurrence statistics exist.
220 //
221
222 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
223 double scaleFactor = 90.0 / maxVal;
224 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
225
226 confidence = min(confidence, 100);
227 }
228
229 if (confidence < 0) {
230 confidence = 0;
231 }
232
233 return confidence;
234 }
235
~CharsetRecog_sjis()236 CharsetRecog_sjis::~CharsetRecog_sjis()
237 {
238 // nothing to do
239 }
240
nextChar(IteratedChar * it,InputText * det) const241 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
242 it->index = it->nextIndex;
243 it->error = false;
244
245 int32_t firstByte = it->charValue = it->nextByte(det);
246
247 if (firstByte < 0) {
248 return false;
249 }
250
251 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
252 return true;
253 }
254
255 int32_t secondByte = it->nextByte(det);
256 if (secondByte >= 0) {
257 it->charValue = (firstByte << 8) | secondByte;
258 }
259 // else we'll handle the error later.
260
261 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
262 // Illegal second byte value.
263 it->error = true;
264 }
265
266 return true;
267 }
268
match(InputText * det,CharsetMatch * results) const269 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
270 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
271 results->set(det, this, confidence);
272 return (confidence > 0);
273 }
274
getName() const275 const char *CharsetRecog_sjis::getName() const
276 {
277 return "Shift_JIS";
278 }
279
getLanguage() const280 const char *CharsetRecog_sjis::getLanguage() const
281 {
282 return "ja";
283 }
284
~CharsetRecog_euc()285 CharsetRecog_euc::~CharsetRecog_euc()
286 {
287 // nothing to do
288 }
289
nextChar(IteratedChar * it,InputText * det) const290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
291 int32_t firstByte = 0;
292 int32_t secondByte = 0;
293 int32_t thirdByte = 0;
294
295 it->index = it->nextIndex;
296 it->error = false;
297 firstByte = it->charValue = it->nextByte(det);
298
299 if (firstByte < 0) {
300 // Ran off the end of the input data
301 return false;
302 }
303
304 if (firstByte <= 0x8D) {
305 // single byte char
306 return true;
307 }
308
309 secondByte = it->nextByte(det);
310 if (secondByte >= 0) {
311 it->charValue = (it->charValue << 8) | secondByte;
312 }
313 // else we'll handle the error later.
314
315 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
316 // Two byte Char
317 if (secondByte < 0xA1) {
318 it->error = true;
319 }
320
321 return true;
322 }
323
324 if (firstByte == 0x8E) {
325 // Code Set 2.
326 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
327 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
328 // We don't know which we've got.
329 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
330 // bytes will look like a well formed 2 byte char.
331 if (secondByte < 0xA1) {
332 it->error = true;
333 }
334
335 return true;
336 }
337
338 if (firstByte == 0x8F) {
339 // Code set 3.
340 // Three byte total char size, two bytes of actual char value.
341 thirdByte = it->nextByte(det);
342 it->charValue = (it->charValue << 8) | thirdByte;
343
344 if (thirdByte < 0xa1) {
345 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
346 it->error = true;
347 }
348 }
349
350 return true;
351
352 }
353
~CharsetRecog_euc_jp()354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
355 {
356 // nothing to do
357 }
358
getName() const359 const char *CharsetRecog_euc_jp::getName() const
360 {
361 return "EUC-JP";
362 }
363
getLanguage() const364 const char *CharsetRecog_euc_jp::getLanguage() const
365 {
366 return "ja";
367 }
368
match(InputText * det,CharsetMatch * results) const369 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
370 {
371 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
372 results->set(det, this, confidence);
373 return (confidence > 0);
374 }
375
~CharsetRecog_euc_kr()376 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
377 {
378 // nothing to do
379 }
380
getName() const381 const char *CharsetRecog_euc_kr::getName() const
382 {
383 return "EUC-KR";
384 }
385
getLanguage() const386 const char *CharsetRecog_euc_kr::getLanguage() const
387 {
388 return "ko";
389 }
390
match(InputText * det,CharsetMatch * results) const391 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
392 {
393 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
394 results->set(det, this, confidence);
395 return (confidence > 0);
396 }
397
~CharsetRecog_big5()398 CharsetRecog_big5::~CharsetRecog_big5()
399 {
400 // nothing to do
401 }
402
nextChar(IteratedChar * it,InputText * det) const403 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
404 {
405 int32_t firstByte;
406
407 it->index = it->nextIndex;
408 it->error = false;
409 firstByte = it->charValue = it->nextByte(det);
410
411 if (firstByte < 0) {
412 return false;
413 }
414
415 if (firstByte <= 0x7F || firstByte == 0xFF) {
416 // single byte character.
417 return true;
418 }
419
420 int32_t secondByte = it->nextByte(det);
421 if (secondByte >= 0) {
422 it->charValue = (it->charValue << 8) | secondByte;
423 }
424 // else we'll handle the error later.
425
426 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
427 it->error = true;
428 }
429
430 return true;
431 }
432
getName() const433 const char *CharsetRecog_big5::getName() const
434 {
435 return "Big5";
436 }
437
getLanguage() const438 const char *CharsetRecog_big5::getLanguage() const
439 {
440 return "zh";
441 }
442
match(InputText * det,CharsetMatch * results) const443 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
444 {
445 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
446 results->set(det, this, confidence);
447 return (confidence > 0);
448 }
449
~CharsetRecog_gb_18030()450 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
451 {
452 // nothing to do
453 }
454
nextChar(IteratedChar * it,InputText * det) const455 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
456 int32_t firstByte = 0;
457 int32_t secondByte = 0;
458 int32_t thirdByte = 0;
459 int32_t fourthByte = 0;
460
461 it->index = it->nextIndex;
462 it->error = false;
463 firstByte = it->charValue = it->nextByte(det);
464
465 if (firstByte < 0) {
466 // Ran off the end of the input data
467 return false;
468 }
469
470 if (firstByte <= 0x80) {
471 // single byte char
472 return true;
473 }
474
475 secondByte = it->nextByte(det);
476 if (secondByte >= 0) {
477 it->charValue = (it->charValue << 8) | secondByte;
478 }
479 // else we'll handle the error later.
480
481 if (firstByte >= 0x81 && firstByte <= 0xFE) {
482 // Two byte Char
483 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
484 return true;
485 }
486
487 // Four byte char
488 if (secondByte >= 0x30 && secondByte <= 0x39) {
489 thirdByte = it->nextByte(det);
490
491 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
492 fourthByte = it->nextByte(det);
493
494 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
495 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
496
497 return true;
498 }
499 }
500 }
501
502 // Something wasn't valid, or we ran out of data (-1).
503 it->error = true;
504 }
505
506 return true;
507 }
508
getName() const509 const char *CharsetRecog_gb_18030::getName() const
510 {
511 return "GB18030";
512 }
513
getLanguage() const514 const char *CharsetRecog_gb_18030::getLanguage() const
515 {
516 return "zh";
517 }
518
match(InputText * det,CharsetMatch * results) const519 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
520 {
521 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
522 results->set(det, this, confidence);
523 return (confidence > 0);
524 }
525
526 U_NAMESPACE_END
527 #endif
528