• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2010 ZXing authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.zxing.common;
18 
19 import java.nio.charset.Charset;
20 import java.nio.charset.StandardCharsets;
21 import java.nio.charset.UnsupportedCharsetException;
22 import java.util.Map;
23 
24 import com.google.zxing.DecodeHintType;
25 
26 /**
27  * Common string-related functions.
28  *
29  * @author Sean Owen
30  * @author Alex Dupre
31  */
32 public final class StringUtils {
33 
34   private static final Charset PLATFORM_DEFAULT_ENCODING = Charset.defaultCharset();
35   public static final Charset SHIFT_JIS_CHARSET = Charset.forName("SJIS");
36   public static final Charset GB2312_CHARSET;
37   static {
38     Charset gb2312Charset;
39     try {
40       gb2312Charset = Charset.forName("GB2312");
41     } catch (UnsupportedCharsetException ucee) {
42       // Can happen on some embedded JREs?
43       gb2312Charset = null;
44     }
45     GB2312_CHARSET = gb2312Charset;
46   }
47   private static final Charset EUC_JP = Charset.forName("EUC_JP");
48   private static final boolean ASSUME_SHIFT_JIS =
49       SHIFT_JIS_CHARSET.equals(PLATFORM_DEFAULT_ENCODING) ||
50       EUC_JP.equals(PLATFORM_DEFAULT_ENCODING);
51 
52   // Retained for ABI compatibility with earlier versions
53   public static final String SHIFT_JIS = "SJIS";
54   public static final String GB2312 = "GB2312";
55 
StringUtils()56   private StringUtils() { }
57 
58   /**
59    * @param bytes bytes encoding a string, whose encoding should be guessed
60    * @param hints decode hints if applicable
61    * @return name of guessed encoding; at the moment will only guess one of:
62    *  "SJIS", "UTF8", "ISO8859_1", or the platform default encoding if none
63    *  of these can possibly be correct
64    */
guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints)65   public static String guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints) {
66     Charset c = guessCharset(bytes, hints);
67     if (c.equals(SHIFT_JIS_CHARSET)) {
68       return "SJIS";
69     }
70     if (c.equals(StandardCharsets.UTF_8)) {
71       return "UTF8";
72     }
73     if (c.equals(StandardCharsets.ISO_8859_1)) {
74       return "ISO8859_1";
75     }
76     return c.name();
77   }
78 
79   /**
80    * @param bytes bytes encoding a string, whose encoding should be guessed
81    * @param hints decode hints if applicable
82    * @return Charset of guessed encoding; at the moment will only guess one of:
83    *  {@link #SHIFT_JIS_CHARSET}, {@link StandardCharsets#UTF_8},
84    *  {@link StandardCharsets#ISO_8859_1}, {@link StandardCharsets#UTF_16},
85    *  or the platform default encoding if
86    *  none of these can possibly be correct
87    */
guessCharset(byte[] bytes, Map<DecodeHintType,?> hints)88   public static Charset guessCharset(byte[] bytes, Map<DecodeHintType,?> hints) {
89     if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) {
90       return Charset.forName(hints.get(DecodeHintType.CHARACTER_SET).toString());
91     }
92 
93     // First try UTF-16, assuming anything with its BOM is UTF-16
94     if (bytes.length > 2 &&
95         ((bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) ||
96          (bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE))) {
97       return StandardCharsets.UTF_16;
98     }
99 
100     // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
101     // which should be by far the most common encodings.
102     int length = bytes.length;
103     boolean canBeISO88591 = true;
104     boolean canBeShiftJIS = true;
105     boolean canBeUTF8 = true;
106     int utf8BytesLeft = 0;
107     int utf2BytesChars = 0;
108     int utf3BytesChars = 0;
109     int utf4BytesChars = 0;
110     int sjisBytesLeft = 0;
111     int sjisKatakanaChars = 0;
112     int sjisCurKatakanaWordLength = 0;
113     int sjisCurDoubleBytesWordLength = 0;
114     int sjisMaxKatakanaWordLength = 0;
115     int sjisMaxDoubleBytesWordLength = 0;
116     int isoHighOther = 0;
117 
118     boolean utf8bom = bytes.length > 3 &&
119         bytes[0] == (byte) 0xEF &&
120         bytes[1] == (byte) 0xBB &&
121         bytes[2] == (byte) 0xBF;
122 
123     for (int i = 0;
124          i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
125          i++) {
126 
127       int value = bytes[i] & 0xFF;
128 
129       // UTF-8 stuff
130       if (canBeUTF8) {
131         if (utf8BytesLeft > 0) {
132           if ((value & 0x80) == 0) {
133             canBeUTF8 = false;
134           } else {
135             utf8BytesLeft--;
136           }
137         } else if ((value & 0x80) != 0) {
138           if ((value & 0x40) == 0) {
139             canBeUTF8 = false;
140           } else {
141             utf8BytesLeft++;
142             if ((value & 0x20) == 0) {
143               utf2BytesChars++;
144             } else {
145               utf8BytesLeft++;
146               if ((value & 0x10) == 0) {
147                 utf3BytesChars++;
148               } else {
149                 utf8BytesLeft++;
150                 if ((value & 0x08) == 0) {
151                   utf4BytesChars++;
152                 } else {
153                   canBeUTF8 = false;
154                 }
155               }
156             }
157           }
158         }
159       }
160 
161       // ISO-8859-1 stuff
162       if (canBeISO88591) {
163         if (value > 0x7F && value < 0xA0) {
164           canBeISO88591 = false;
165         } else if (value > 0x9F && (value < 0xC0 || value == 0xD7 || value == 0xF7)) {
166           isoHighOther++;
167         }
168       }
169 
170       // Shift_JIS stuff
171       if (canBeShiftJIS) {
172         if (sjisBytesLeft > 0) {
173           if (value < 0x40 || value == 0x7F || value > 0xFC) {
174             canBeShiftJIS = false;
175           } else {
176             sjisBytesLeft--;
177           }
178         } else if (value == 0x80 || value == 0xA0 || value > 0xEF) {
179           canBeShiftJIS = false;
180         } else if (value > 0xA0 && value < 0xE0) {
181           sjisKatakanaChars++;
182           sjisCurDoubleBytesWordLength = 0;
183           sjisCurKatakanaWordLength++;
184           if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) {
185             sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength;
186           }
187         } else if (value > 0x7F) {
188           sjisBytesLeft++;
189           //sjisDoubleBytesChars++;
190           sjisCurKatakanaWordLength = 0;
191           sjisCurDoubleBytesWordLength++;
192           if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) {
193             sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength;
194           }
195         } else {
196           //sjisLowChars++;
197           sjisCurKatakanaWordLength = 0;
198           sjisCurDoubleBytesWordLength = 0;
199         }
200       }
201     }
202 
203     if (canBeUTF8 && utf8BytesLeft > 0) {
204       canBeUTF8 = false;
205     }
206     if (canBeShiftJIS && sjisBytesLeft > 0) {
207       canBeShiftJIS = false;
208     }
209 
210     // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
211     if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) {
212       return StandardCharsets.UTF_8;
213     }
214     // Easy -- if assuming Shift_JIS or >= 3 valid consecutive not-ascii characters (and no evidence it can't be), done
215     if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) {
216       return SHIFT_JIS_CHARSET;
217     }
218     // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
219     // - If we saw
220     //   - only two consecutive katakana chars in the whole text, or
221     //   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
222     // - then we conclude Shift_JIS, else ISO-8859-1
223     if (canBeISO88591 && canBeShiftJIS) {
224       return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= length
225           ? SHIFT_JIS_CHARSET : StandardCharsets.ISO_8859_1;
226     }
227 
228     // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
229     if (canBeISO88591) {
230       return StandardCharsets.ISO_8859_1;
231     }
232     if (canBeShiftJIS) {
233       return SHIFT_JIS_CHARSET;
234     }
235     if (canBeUTF8) {
236       return StandardCharsets.UTF_8;
237     }
238     // Otherwise, we take a wild guess with platform encoding
239     return PLATFORM_DEFAULT_ENCODING;
240   }
241 
242 }
243