• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  ****************************************************************************
6  * Copyright (C) 2005-2012, International Business Machines Corporation and *
7  * others. All Rights Reserved.                                             *
8  ****************************************************************************
9  *
10  */
11 package ohos.global.icu.text;
12 
13 import java.util.Arrays;
14 
/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally, by the
 * frequency of occurrence of characters.
 * <p>
 * Instances of this class are singletons, one per encoding
 * being recognized.  They are created in the main
 * CharsetDetector class and kept in the global list of available
 * encodings to be checked.  The specific encoding being recognized
 * is determined by the subclass.
 */
27 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
28 
29    /**
30      * Get the IANA name of this charset.
31      * @return the charset name.
32      */
33     @Override
getName()34     abstract String      getName() ;
35 
36 
37     /**
38      * Test the match of this charset with the input text data
39      *      which is obtained via the CharsetDetector object.
40      *
41      * @param det  The CharsetDetector, which contains the input text
42      *             to be checked for being in this charset.
43      * @return     Two values packed into one int  (Damn java, anyhow)
44      *             <br/>
45      *             bits 0-7:  the match confidence, ranging from 0-100
46      *             <br/>
47      *             bits 8-15: The match reason, an enum-like value.
48      */
match(CharsetDetector det, int [] commonChars)49     int match(CharsetDetector det, int [] commonChars) {
50         @SuppressWarnings("unused")
51         int   singleByteCharCount = 0;  //TODO Do we really need this?
52         int   doubleByteCharCount = 0;
53         int   commonCharCount     = 0;
54         int   badCharCount        = 0;
55         int   totalCharCount      = 0;
56         int   confidence          = 0;
57         iteratedChar   iter       = new iteratedChar();
58 
59         detectBlock: {
60             for (iter.reset(); nextChar(iter, det);) {
61                 totalCharCount++;
62                 if (iter.error) {
63                     badCharCount++;
64                 } else {
65                     long cv = iter.charValue & 0xFFFFFFFFL;
66 
67                     if (cv <= 0xff) {
68                         singleByteCharCount++;
69                     } else {
70                         doubleByteCharCount++;
71                         if (commonChars != null) {
72                             // NOTE: This assumes that there are no 4-byte common chars.
73                             if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
74                                 commonCharCount++;
75                             }
76                         }
77                     }
78                 }
79                 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
80                     // Bail out early if the byte data is not matching the encoding scheme.
81                     break detectBlock;
82                 }
83             }
84 
85             if (doubleByteCharCount <= 10 && badCharCount== 0) {
86                 // Not many multi-byte chars.
87                 if (doubleByteCharCount == 0 && totalCharCount < 10) {
88                     // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
89                     // We don't have enough data to have any confidence.
90                     // Statistical analysis of single byte non-ASCII charcters would probably help here.
91                     confidence = 0;
92                 }
93                 else {
94                     //   ASCII or ISO file?  It's probably not our encoding,
95                     //   but is not incompatible with our encoding, so don't give it a zero.
96                     confidence = 10;
97                 }
98 
99                 break detectBlock;
100             }
101 
102             //
103             //  No match if there are too many characters that don't fit the encoding scheme.
104             //    (should we have zero tolerance for these?)
105             //
106             if (doubleByteCharCount < 20*badCharCount) {
107                 confidence = 0;
108                 break detectBlock;
109             }
110 
111             if (commonChars == null) {
112                 // We have no statistics on frequently occuring characters.
113                 //  Assess confidence purely on having a reasonable number of
114                 //  multi-byte characters (the more the better
115                 confidence = 30 + doubleByteCharCount - 20*badCharCount;
116                 if (confidence > 100) {
117                     confidence = 100;
118                 }
119             }else {
120                 //
121                 // Frequency of occurence statistics exist.
122                 //
123                 double maxVal = Math.log((float)doubleByteCharCount / 4);
124                 double scaleFactor = 90.0 / maxVal;
125                 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
126                 confidence = Math.min(confidence, 100);
127             }
128         }   // end of detectBlock:
129 
130         return confidence;
131     }
132 
133      // "Character"  iterated character class.
134      //    Recognizers for specific mbcs encodings make their "characters" available
135      //    by providing a nextChar() function that fills in an instance of iteratedChar
136      //    with the next char from the input.
137      //    The returned characters are not converted to Unicode, but remain as the raw
138      //    bytes (concatenated into an int) from the codepage data.
139      //
140      //  For Asian charsets, use the raw input rather than the input that has been
141      //   stripped of markup.  Detection only considers multi-byte chars, effectively
142      //   stripping markup anyway, and double byte chars do occur in markup too.
143      //
144      static class iteratedChar {
145          int             charValue = 0;             // 1-4 bytes from the raw input data
146          int             nextIndex = 0;
147          boolean         error     = false;
148          boolean         done      = false;
149 
reset()150          void reset() {
151              charValue = 0;
152              nextIndex = 0;
153              error     = false;
154              done      = false;
155          }
156 
nextByte(CharsetDetector det)157          int nextByte(CharsetDetector det) {
158              if (nextIndex >= det.fRawLength) {
159                  done = true;
160                  return -1;
161              }
162              int byteValue = det.fRawInput[nextIndex++] & 0x00ff;
163              return byteValue;
164          }
165      }
166 
167      /**
168       * Get the next character (however many bytes it is) from the input data
169       *    Subclasses for specific charset encodings must implement this function
170       *    to get characters according to the rules of their encoding scheme.
171       *
172       *  This function is not a method of class iteratedChar only because
173       *   that would require a lot of extra derived classes, which is awkward.
174       * @param it  The iteratedChar "struct" into which the returned char is placed.
175       * @param det The charset detector, which is needed to get at the input byte data
176       *            being iterated over.
177       * @return    True if a character was returned, false at end of input.
178       */
nextChar(iteratedChar it, CharsetDetector det)179      abstract boolean nextChar(iteratedChar it, CharsetDetector det);
180 
181 
182 
183 
184 
185      /**
186       *   Shift-JIS charset recognizer.
187       *
188       */
189      static class CharsetRecog_sjis extends CharsetRecog_mbcs {
190          static int [] commonChars =
191              // TODO:  This set of data comes from the character frequency-
192              //        of-occurence analysis tool.  The data needs to be moved
193              //        into a resource and loaded from there.
194             {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
195              0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
196              0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
197              0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
198              0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
199              0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
200 
201          @Override
nextChar(iteratedChar it, CharsetDetector det)202         boolean nextChar(iteratedChar it, CharsetDetector det) {
203              it.error = false;
204              int firstByte;
205              firstByte = it.charValue = it.nextByte(det);
206              if (firstByte < 0) {
207                  return false;
208              }
209 
210              if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
211                  return true;
212              }
213 
214              int secondByte = it.nextByte(det);
215              if (secondByte < 0)  {
216                  return false;
217              }
218              it.charValue = (firstByte << 8) | secondByte;
219              if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
220                  // Illegal second byte value.
221                  it.error = true;
222              }
223              return true;
224          }
225 
226          @Override
match(CharsetDetector det)227         CharsetMatch match(CharsetDetector det) {
228              int confidence = match(det, commonChars);
229              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
230          }
231 
232          @Override
getName()233         String getName() {
234              return "Shift_JIS";
235          }
236 
237          @Override
getLanguage()238         public String getLanguage()
239          {
240              return "ja";
241          }
242 
243 
244      }
245 
246 
247      /**
248       *   Big5 charset recognizer.
249       *
250       */
251      static class CharsetRecog_big5 extends CharsetRecog_mbcs {
252          static int [] commonChars =
253              // TODO:  This set of data comes from the character frequency-
254              //        of-occurence analysis tool.  The data needs to be moved
255              //        into a resource and loaded from there.
256             {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
257              0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
258              0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
259              0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
260              0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
261              0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
262              0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
263              0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
264              0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
265              0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
266 
267          @Override
nextChar(iteratedChar it, CharsetDetector det)268         boolean nextChar(iteratedChar it, CharsetDetector det) {
269              it.error = false;
270              int firstByte;
271              firstByte = it.charValue = it.nextByte(det);
272              if (firstByte < 0) {
273                  return false;
274              }
275 
276              if (firstByte <= 0x7f || firstByte==0xff) {
277                  // single byte character.
278                  return true;
279              }
280 
281              int secondByte = it.nextByte(det);
282              if (secondByte < 0)  {
283                  return false;
284              }
285              it.charValue = (it.charValue << 8) | secondByte;
286 
287              if (secondByte < 0x40 ||
288                  secondByte ==0x7f ||
289                  secondByte == 0xff) {
290                      it.error = true;
291              }
292              return true;
293          }
294 
295          @Override
match(CharsetDetector det)296         CharsetMatch match(CharsetDetector det) {
297              int confidence = match(det, commonChars);
298              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
299          }
300 
301          @Override
getName()302         String getName() {
303              return "Big5";
304          }
305 
306 
307          @Override
getLanguage()308         public String getLanguage()
309          {
310              return "zh";
311          }
312      }
313 
314 
315      /**
316       *   EUC charset recognizers.  One abstract class that provides the common function
317       *             for getting the next character according to the EUC encoding scheme,
318       *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
319       *
320       */
321      abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
322 
323          /*
324           *  (non-Javadoc)
325           *  Get the next character value for EUC based encodings.
326           *  Character "value" is simply the raw bytes that make up the character
327           *     packed into an int.
328           */
329          @Override
nextChar(iteratedChar it, CharsetDetector det)330         boolean nextChar(iteratedChar it, CharsetDetector det) {
331              it.error = false;
332              int firstByte  = 0;
333              int secondByte = 0;
334              int thirdByte  = 0;
335              //int fourthByte = 0;
336 
337              buildChar: {
338                  firstByte = it.charValue = it.nextByte(det);
339                  if (firstByte < 0) {
340                      // Ran off the end of the input data
341                      it.done = true;
342                      break buildChar;
343                  }
344                  if (firstByte <= 0x8d) {
345                      // single byte char
346                      break buildChar;
347                  }
348 
349                  secondByte = it.nextByte(det);
350                  it.charValue = (it.charValue << 8) | secondByte;
351 
352                  if (firstByte >= 0xA1 && firstByte <= 0xfe) {
353                      // Two byte Char
354                      if (secondByte < 0xa1) {
355                          it.error = true;
356                      }
357                      break buildChar;
358                  }
359                  if (firstByte == 0x8e) {
360                      // Code Set 2.
361                      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
362                      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
363                      // We don't know which we've got.
364                      // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
365                      //   bytes will look like a well formed 2 byte char.
366                      if (secondByte < 0xa1) {
367                          it.error = true;
368                      }
369                      break buildChar;
370                  }
371 
372                  if (firstByte == 0x8f) {
373                      // Code set 3.
374                      // Three byte total char size, two bytes of actual char value.
375                      thirdByte    = it.nextByte(det);
376                      it.charValue = (it.charValue << 8) | thirdByte;
377                      if (thirdByte < 0xa1) {
378                          it.error = true;
379                      }
380                  }
381               }
382 
383              return (it.done == false);
384          }
385 
386          /**
387           * The charset recognize for EUC-JP.  A singleton instance of this class
388           *    is created and kept by the public CharsetDetector class
389           */
390          static class CharsetRecog_euc_jp extends CharsetRecog_euc {
391              static int [] commonChars =
392                  // TODO:  This set of data comes from the character frequency-
393                  //        of-occurence analysis tool.  The data needs to be moved
394                  //        into a resource and loaded from there.
395                 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
396                  0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
397                  0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
398                  0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
399                  0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
400                  0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
401                  0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
402                  0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
403                  0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
404                  0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
405              @Override
getName()406             String getName() {
407                  return "EUC-JP";
408              }
409 
410              @Override
match(CharsetDetector det)411             CharsetMatch match(CharsetDetector det) {
412                  int confidence = match(det, commonChars);
413                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
414              }
415 
416              @Override
getLanguage()417             public String getLanguage()
418              {
419                  return "ja";
420              }
421          }
422 
423          /**
424           * The charset recognize for EUC-KR.  A singleton instance of this class
425           *    is created and kept by the public CharsetDetector class
426           */
427          static class CharsetRecog_euc_kr extends CharsetRecog_euc {
428              static int [] commonChars =
429                  // TODO:  This set of data comes from the character frequency-
430                  //        of-occurence analysis tool.  The data needs to be moved
431                  //        into a resource and loaded from there.
432                 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
433                  0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
434                  0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
435                  0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
436                  0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
437                  0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
438                  0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
439                  0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
440                  0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
441                  0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
442 
443              @Override
getName()444             String getName() {
445                  return "EUC-KR";
446              }
447 
448              @Override
match(CharsetDetector det)449             CharsetMatch match(CharsetDetector det) {
450                  int confidence = match(det, commonChars);
451                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
452              }
453 
454              @Override
getLanguage()455             public String getLanguage()
456              {
457                  return "ko";
458              }
459          }
460      }
461 
462      /**
463       *
464       *   GB-18030 recognizer. Uses simplified Chinese statistics.
465       *
466       */
467      static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
468 
469          /*
470           *  (non-Javadoc)
471           *  Get the next character value for EUC based encodings.
472           *  Character "value" is simply the raw bytes that make up the character
473           *     packed into an int.
474           */
475          @Override
nextChar(iteratedChar it, CharsetDetector det)476         boolean nextChar(iteratedChar it, CharsetDetector det) {
477              it.error = false;
478              int firstByte  = 0;
479              int secondByte = 0;
480              int thirdByte  = 0;
481              int fourthByte = 0;
482 
483              buildChar: {
484                  firstByte = it.charValue = it.nextByte(det);
485 
486                  if (firstByte < 0) {
487                      // Ran off the end of the input data
488                      it.done = true;
489                      break buildChar;
490                  }
491 
492                  if (firstByte <= 0x80) {
493                      // single byte char
494                      break buildChar;
495                  }
496 
497                  secondByte = it.nextByte(det);
498                  it.charValue = (it.charValue << 8) | secondByte;
499 
500                  if (firstByte >= 0x81 && firstByte <= 0xFE) {
501                      // Two byte Char
502                      if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
503                          break buildChar;
504                      }
505 
506                      // Four byte char
507                      if (secondByte >= 0x30 && secondByte <= 0x39) {
508                          thirdByte = it.nextByte(det);
509 
510                          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
511                              fourthByte = it.nextByte(det);
512 
513                              if (fourthByte >= 0x30 && fourthByte <= 0x39) {
514                                  it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
515                                  break buildChar;
516                              }
517                          }
518                      }
519 
520                      it.error = true;
521                      break buildChar;
522                  }
523              }
524 
525              return (it.done == false);
526          }
527 
528          static int [] commonChars =
529              // TODO:  This set of data comes from the character frequency-
530              //        of-occurence analysis tool.  The data needs to be moved
531              //        into a resource and loaded from there.
532             {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
533              0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
534              0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
535              0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
536              0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
537              0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
538              0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
539              0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
540              0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
541              0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
542 
543 
544          @Override
getName()545         String getName() {
546              return "GB18030";
547          }
548 
549          @Override
match(CharsetDetector det)550         CharsetMatch match(CharsetDetector det) {
551              int confidence = match(det, commonChars);
552              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
553          }
554 
555          @Override
getLanguage()556         public String getLanguage()
557          {
558              return "zh";
559          }
560      }
561 
562 
563 }
564