1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2005-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_CONVERSION
13
14 #include "cmemory.h"
15 #include "csmatch.h"
16 #include "csrmbcs.h"
17
18 #include <math.h>
19
20 U_NAMESPACE_BEGIN
21
22 #define min(x,y) (((x)<(y))?(x):(y))
23
24 static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
28 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34
35 static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
39 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49
50 static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
54 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64
65 static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
69 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79
80 static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
84 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94
/**
 * Binary search for a 16-bit code value in a table sorted in ascending order.
 *
 * @param array  table to search; must be sorted ascending for the result to
 *               be meaningful
 * @param len    number of elements in array
 * @param value  code value to look up
 * @return the index of an element equal to value, or -1 if there is none
 *         (also -1 when array is null or len is not positive)
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    if (array == 0 || len <= 0) {
        return -1;
    }

    int32_t low = 0;
    int32_t high = len - 1;

    while (low <= high) {
        // low + (high - low) / 2 yields the same midpoint as (low + high) / 2
        // but cannot overflow int32_t for large tables.
        const int32_t mid = low + (high - low) / 2;
        const uint16_t probe = array[mid];

        if (probe == value) {
            return mid;
        }

        if (probe < value) {
            low = mid + 1;
        } else {
            high = mid - 1;
        }
    }

    return -1;
}
116
IteratedChar()117 IteratedChar::IteratedChar() :
118 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
119 {
120 // nothing else to do.
121 }
122
123 /*void IteratedChar::reset()
124 {
125 charValue = 0;
126 index = -1;
127 nextIndex = 0;
128 error = FALSE;
129 done = FALSE;
130 }*/
131
nextByte(InputText * det)132 int32_t IteratedChar::nextByte(InputText *det)
133 {
134 if (nextIndex >= det->fRawLength) {
135 done = TRUE;
136
137 return -1;
138 }
139
140 return det->fRawInput[nextIndex++];
141 }
142
~CharsetRecog_mbcs()143 CharsetRecog_mbcs::~CharsetRecog_mbcs()
144 {
145 // nothing to do.
146 }
147
match_mbcs(InputText * det,const uint16_t commonChars[],int32_t commonCharsLen) const148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149 int32_t singleByteCharCount = 0;
150 int32_t doubleByteCharCount = 0;
151 int32_t commonCharCount = 0;
152 int32_t badCharCount = 0;
153 int32_t totalCharCount = 0;
154 int32_t confidence = 0;
155 IteratedChar iter;
156
157 while (nextChar(&iter, det)) {
158 totalCharCount++;
159
160 if (iter.error) {
161 badCharCount++;
162 } else {
163 if (iter.charValue <= 0xFF) {
164 singleByteCharCount++;
165 } else {
166 doubleByteCharCount++;
167
168 if (commonChars != 0) {
169 if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
170 commonCharCount += 1;
171 }
172 }
173 }
174 }
175
176
177 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
178 // Bail out early if the byte data is not matching the encoding scheme.
179 // break detectBlock;
180 return confidence;
181 }
182 }
183
184 if (doubleByteCharCount <= 10 && badCharCount == 0) {
185 // Not many multi-byte chars.
186 if (doubleByteCharCount == 0 && totalCharCount < 10) {
187 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188 // We don't have enough data to have any confidence.
189 // Statistical analysis of single byte non-ASCII charcters would probably help here.
190 confidence = 0;
191 }
192 else {
193 // ASCII or ISO file? It's probably not our encoding,
194 // but is not incompatible with our encoding, so don't give it a zero.
195 confidence = 10;
196 }
197
198 return confidence;
199 }
200
201 //
202 // No match if there are too many characters that don't fit the encoding scheme.
203 // (should we have zero tolerance for these?)
204 //
205 if (doubleByteCharCount < 20*badCharCount) {
206 confidence = 0;
207
208 return confidence;
209 }
210
211 if (commonChars == 0) {
212 // We have no statistics on frequently occuring characters.
213 // Assess confidence purely on having a reasonable number of
214 // multi-byte characters (the more the better)
215 confidence = 30 + doubleByteCharCount - 20*badCharCount;
216
217 if (confidence > 100) {
218 confidence = 100;
219 }
220 } else {
221 //
222 // Frequency of occurence statistics exist.
223 //
224
225 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
226 double scaleFactor = 90.0 / maxVal;
227 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
228
229 confidence = min(confidence, 100);
230 }
231
232 if (confidence < 0) {
233 confidence = 0;
234 }
235
236 return confidence;
237 }
238
~CharsetRecog_sjis()239 CharsetRecog_sjis::~CharsetRecog_sjis()
240 {
241 // nothing to do
242 }
243
nextChar(IteratedChar * it,InputText * det) const244 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245 it->index = it->nextIndex;
246 it->error = FALSE;
247
248 int32_t firstByte = it->charValue = it->nextByte(det);
249
250 if (firstByte < 0) {
251 return FALSE;
252 }
253
254 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
255 return TRUE;
256 }
257
258 int32_t secondByte = it->nextByte(det);
259 if (secondByte >= 0) {
260 it->charValue = (firstByte << 8) | secondByte;
261 }
262 // else we'll handle the error later.
263
264 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
265 // Illegal second byte value.
266 it->error = TRUE;
267 }
268
269 return TRUE;
270 }
271
match(InputText * det,CharsetMatch * results) const272 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
273 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274 results->set(det, this, confidence);
275 return (confidence > 0);
276 }
277
getName() const278 const char *CharsetRecog_sjis::getName() const
279 {
280 return "Shift_JIS";
281 }
282
getLanguage() const283 const char *CharsetRecog_sjis::getLanguage() const
284 {
285 return "ja";
286 }
287
~CharsetRecog_euc()288 CharsetRecog_euc::~CharsetRecog_euc()
289 {
290 // nothing to do
291 }
292
nextChar(IteratedChar * it,InputText * det) const293 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294 int32_t firstByte = 0;
295 int32_t secondByte = 0;
296 int32_t thirdByte = 0;
297
298 it->index = it->nextIndex;
299 it->error = FALSE;
300 firstByte = it->charValue = it->nextByte(det);
301
302 if (firstByte < 0) {
303 // Ran off the end of the input data
304 return FALSE;
305 }
306
307 if (firstByte <= 0x8D) {
308 // single byte char
309 return TRUE;
310 }
311
312 secondByte = it->nextByte(det);
313 if (secondByte >= 0) {
314 it->charValue = (it->charValue << 8) | secondByte;
315 }
316 // else we'll handle the error later.
317
318 if (firstByte >= 0xA1 && firstByte <= 0xFE) {
319 // Two byte Char
320 if (secondByte < 0xA1) {
321 it->error = TRUE;
322 }
323
324 return TRUE;
325 }
326
327 if (firstByte == 0x8E) {
328 // Code Set 2.
329 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331 // We don't know which we've got.
332 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
333 // bytes will look like a well formed 2 byte char.
334 if (secondByte < 0xA1) {
335 it->error = TRUE;
336 }
337
338 return TRUE;
339 }
340
341 if (firstByte == 0x8F) {
342 // Code set 3.
343 // Three byte total char size, two bytes of actual char value.
344 thirdByte = it->nextByte(det);
345 it->charValue = (it->charValue << 8) | thirdByte;
346
347 if (thirdByte < 0xa1) {
348 // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
349 it->error = TRUE;
350 }
351 }
352
353 return TRUE;
354
355 }
356
~CharsetRecog_euc_jp()357 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
358 {
359 // nothing to do
360 }
361
getName() const362 const char *CharsetRecog_euc_jp::getName() const
363 {
364 return "EUC-JP";
365 }
366
getLanguage() const367 const char *CharsetRecog_euc_jp::getLanguage() const
368 {
369 return "ja";
370 }
371
match(InputText * det,CharsetMatch * results) const372 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
373 {
374 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375 results->set(det, this, confidence);
376 return (confidence > 0);
377 }
378
~CharsetRecog_euc_kr()379 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
380 {
381 // nothing to do
382 }
383
getName() const384 const char *CharsetRecog_euc_kr::getName() const
385 {
386 return "EUC-KR";
387 }
388
getLanguage() const389 const char *CharsetRecog_euc_kr::getLanguage() const
390 {
391 return "ko";
392 }
393
match(InputText * det,CharsetMatch * results) const394 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
395 {
396 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397 results->set(det, this, confidence);
398 return (confidence > 0);
399 }
400
~CharsetRecog_big5()401 CharsetRecog_big5::~CharsetRecog_big5()
402 {
403 // nothing to do
404 }
405
nextChar(IteratedChar * it,InputText * det) const406 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
407 {
408 int32_t firstByte;
409
410 it->index = it->nextIndex;
411 it->error = FALSE;
412 firstByte = it->charValue = it->nextByte(det);
413
414 if (firstByte < 0) {
415 return FALSE;
416 }
417
418 if (firstByte <= 0x7F || firstByte == 0xFF) {
419 // single byte character.
420 return TRUE;
421 }
422
423 int32_t secondByte = it->nextByte(det);
424 if (secondByte >= 0) {
425 it->charValue = (it->charValue << 8) | secondByte;
426 }
427 // else we'll handle the error later.
428
429 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
430 it->error = TRUE;
431 }
432
433 return TRUE;
434 }
435
getName() const436 const char *CharsetRecog_big5::getName() const
437 {
438 return "Big5";
439 }
440
getLanguage() const441 const char *CharsetRecog_big5::getLanguage() const
442 {
443 return "zh";
444 }
445
match(InputText * det,CharsetMatch * results) const446 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
447 {
448 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449 results->set(det, this, confidence);
450 return (confidence > 0);
451 }
452
~CharsetRecog_gb_18030()453 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
454 {
455 // nothing to do
456 }
457
nextChar(IteratedChar * it,InputText * det) const458 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459 int32_t firstByte = 0;
460 int32_t secondByte = 0;
461 int32_t thirdByte = 0;
462 int32_t fourthByte = 0;
463
464 it->index = it->nextIndex;
465 it->error = FALSE;
466 firstByte = it->charValue = it->nextByte(det);
467
468 if (firstByte < 0) {
469 // Ran off the end of the input data
470 return FALSE;
471 }
472
473 if (firstByte <= 0x80) {
474 // single byte char
475 return TRUE;
476 }
477
478 secondByte = it->nextByte(det);
479 if (secondByte >= 0) {
480 it->charValue = (it->charValue << 8) | secondByte;
481 }
482 // else we'll handle the error later.
483
484 if (firstByte >= 0x81 && firstByte <= 0xFE) {
485 // Two byte Char
486 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
487 return TRUE;
488 }
489
490 // Four byte char
491 if (secondByte >= 0x30 && secondByte <= 0x39) {
492 thirdByte = it->nextByte(det);
493
494 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
495 fourthByte = it->nextByte(det);
496
497 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
498 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
499
500 return TRUE;
501 }
502 }
503 }
504
505 // Something wasn't valid, or we ran out of data (-1).
506 it->error = TRUE;
507 }
508
509 return TRUE;
510 }
511
getName() const512 const char *CharsetRecog_gb_18030::getName() const
513 {
514 return "GB18030";
515 }
516
getLanguage() const517 const char *CharsetRecog_gb_18030::getLanguage() const
518 {
519 return "zh";
520 }
521
match(InputText * det,CharsetMatch * results) const522 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
523 {
524 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525 results->set(det, this, confidence);
526 return (confidence > 0);
527 }
528
529 U_NAMESPACE_END
530 #endif
531