1 /* 2 * Copyright (C) 2010 ZXing authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.zxing.common; 18 19 import java.nio.charset.Charset; 20 import java.nio.charset.StandardCharsets; 21 import java.nio.charset.UnsupportedCharsetException; 22 import java.util.Map; 23 24 import com.google.zxing.DecodeHintType; 25 26 /** 27 * Common string-related functions. 28 * 29 * @author Sean Owen 30 * @author Alex Dupre 31 */ 32 public final class StringUtils { 33 34 private static final Charset PLATFORM_DEFAULT_ENCODING = Charset.defaultCharset(); 35 public static final Charset SHIFT_JIS_CHARSET = Charset.forName("SJIS"); 36 public static final Charset GB2312_CHARSET; 37 static { 38 Charset gb2312Charset; 39 try { 40 gb2312Charset = Charset.forName("GB2312"); 41 } catch (UnsupportedCharsetException ucee) { 42 // Can happen on some embedded JREs? 43 gb2312Charset = null; 44 } 45 GB2312_CHARSET = gb2312Charset; 46 } 47 private static final Charset EUC_JP = Charset.forName("EUC_JP"); 48 private static final boolean ASSUME_SHIFT_JIS = 49 SHIFT_JIS_CHARSET.equals(PLATFORM_DEFAULT_ENCODING) || 50 EUC_JP.equals(PLATFORM_DEFAULT_ENCODING); 51 52 // Retained for ABI compatibility with earlier versions 53 public static final String SHIFT_JIS = "SJIS"; 54 public static final String GB2312 = "GB2312"; 55 StringUtils()56 private StringUtils() { } 57 58 /** 59 * @param bytes bytes encoding a string, whose encoding should be guessed 60 * @param hints decode hints if applicable 61 * @return name of guessed encoding; at the moment will only guess one of: 62 * "SJIS", "UTF8", "ISO8859_1", or the platform default encoding if none 63 * of these can possibly be correct 64 */ guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints)65 public static String guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints) { 66 Charset c = guessCharset(bytes, hints); 67 if (c.equals(SHIFT_JIS_CHARSET)) { 68 return "SJIS"; 69 } 70 if (c.equals(StandardCharsets.UTF_8)) { 71 return "UTF8"; 72 } 73 if (c.equals(StandardCharsets.ISO_8859_1)) { 74 return "ISO8859_1"; 75 } 76 return c.name(); 77 } 78 79 /** 80 * @param bytes bytes encoding a string, whose encoding should be guessed 81 * @param hints decode hints if applicable 82 * @return Charset of guessed encoding; at the moment will only guess one of: 83 * {@link #SHIFT_JIS_CHARSET}, {@link StandardCharsets#UTF_8}, 84 * {@link StandardCharsets#ISO_8859_1}, {@link StandardCharsets#UTF_16}, 85 * or the platform default encoding if 86 * none of these can possibly be correct 87 */ guessCharset(byte[] bytes, Map<DecodeHintType,?> hints)88 public static Charset guessCharset(byte[] bytes, Map<DecodeHintType,?> hints) { 89 if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) { 90 return Charset.forName(hints.get(DecodeHintType.CHARACTER_SET).toString()); 91 } 92 93 // First try UTF-16, assuming anything with its BOM is UTF-16 94 if (bytes.length > 2 && 95 ((bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) || 96 (bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE))) { 97 return StandardCharsets.UTF_16; 98 } 99 100 // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS, 101 // which should be by far the most common encodings. 102 int length = bytes.length; 103 boolean canBeISO88591 = true; 104 boolean canBeShiftJIS = true; 105 boolean canBeUTF8 = true; 106 int utf8BytesLeft = 0; 107 int utf2BytesChars = 0; 108 int utf3BytesChars = 0; 109 int utf4BytesChars = 0; 110 int sjisBytesLeft = 0; 111 int sjisKatakanaChars = 0; 112 int sjisCurKatakanaWordLength = 0; 113 int sjisCurDoubleBytesWordLength = 0; 114 int sjisMaxKatakanaWordLength = 0; 115 int sjisMaxDoubleBytesWordLength = 0; 116 int isoHighOther = 0; 117 118 boolean utf8bom = bytes.length > 3 && 119 bytes[0] == (byte) 0xEF && 120 bytes[1] == (byte) 0xBB && 121 bytes[2] == (byte) 0xBF; 122 123 for (int i = 0; 124 i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); 125 i++) { 126 127 int value = bytes[i] & 0xFF; 128 129 // UTF-8 stuff 130 if (canBeUTF8) { 131 if (utf8BytesLeft > 0) { 132 if ((value & 0x80) == 0) { 133 canBeUTF8 = false; 134 } else { 135 utf8BytesLeft--; 136 } 137 } else if ((value & 0x80) != 0) { 138 if ((value & 0x40) == 0) { 139 canBeUTF8 = false; 140 } else { 141 utf8BytesLeft++; 142 if ((value & 0x20) == 0) { 143 utf2BytesChars++; 144 } else { 145 utf8BytesLeft++; 146 if ((value & 0x10) == 0) { 147 utf3BytesChars++; 148 } else { 149 utf8BytesLeft++; 150 if ((value & 0x08) == 0) { 151 utf4BytesChars++; 152 } else { 153 canBeUTF8 = false; 154 } 155 } 156 } 157 } 158 } 159 } 160 161 // ISO-8859-1 stuff 162 if (canBeISO88591) { 163 if (value > 0x7F && value < 0xA0) { 164 canBeISO88591 = false; 165 } else if (value > 0x9F && (value < 0xC0 || value == 0xD7 || value == 0xF7)) { 166 isoHighOther++; 167 } 168 } 169 170 // Shift_JIS stuff 171 if (canBeShiftJIS) { 172 if (sjisBytesLeft > 0) { 173 if (value < 0x40 || value == 0x7F || value > 0xFC) { 174 canBeShiftJIS = false; 175 } else { 176 sjisBytesLeft--; 177 } 178 } else if (value == 0x80 || value == 0xA0 || value > 0xEF) { 179 canBeShiftJIS = false; 180 } else if (value > 0xA0 && value < 0xE0) { 181 sjisKatakanaChars++; 182 sjisCurDoubleBytesWordLength = 0; 183 sjisCurKatakanaWordLength++; 184 if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) { 185 sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength; 186 } 187 } else if (value > 0x7F) { 188 sjisBytesLeft++; 189 //sjisDoubleBytesChars++; 190 sjisCurKatakanaWordLength = 0; 191 sjisCurDoubleBytesWordLength++; 192 if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) { 193 sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength; 194 } 195 } else { 196 //sjisLowChars++; 197 sjisCurKatakanaWordLength = 0; 198 sjisCurDoubleBytesWordLength = 0; 199 } 200 } 201 } 202 203 if (canBeUTF8 && utf8BytesLeft > 0) { 204 canBeUTF8 = false; 205 } 206 if (canBeShiftJIS && sjisBytesLeft > 0) { 207 canBeShiftJIS = false; 208 } 209 210 // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done 211 if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) { 212 return StandardCharsets.UTF_8; 213 } 214 // Easy -- if assuming Shift_JIS or >= 3 valid consecutive not-ascii characters (and no evidence it can't be), done 215 if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) { 216 return SHIFT_JIS_CHARSET; 217 } 218 // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is: 219 // - If we saw 220 // - only two consecutive katakana chars in the whole text, or 221 // - at least 10% of bytes that could be "upper" not-alphanumeric Latin1, 222 // - then we conclude Shift_JIS, else ISO-8859-1 223 if (canBeISO88591 && canBeShiftJIS) { 224 return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= length 225 ? SHIFT_JIS_CHARSET : StandardCharsets.ISO_8859_1; 226 } 227 228 // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding 229 if (canBeISO88591) { 230 return StandardCharsets.ISO_8859_1; 231 } 232 if (canBeShiftJIS) { 233 return SHIFT_JIS_CHARSET; 234 } 235 if (canBeUTF8) { 236 return StandardCharsets.UTF_8; 237 } 238 // Otherwise, we take a wild guess with platform encoding 239 return PLATFORM_DEFAULT_ENCODING; 240 } 241 242 } 243