// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
 *******************************************************************************
 * Copyright (C) 2006-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.charset;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;

import com.ibm.icu.text.UnicodeSet;

/**
 * <p>A subclass of java.nio.Charset for providing implementation of ICU's charset converters.
 * This API is used to convert codepage or character encoded data to and
 * from UTF-16. You can open a converter with {@link Charset#forName} and {@link #forNameICU}. With that
 * converter, you can get its properties, set options, convert your data.
 *
 * <p>Since many software programs recognize different converter names for
 * different types of converters, there are other functions in this API to
 * iterate over the converter aliases.
 *
 * <p>Note that {@link #name()} cannot always return a unique charset name.
 * {@link Charset} documents that,
 * for charsets listed in the IANA Charset Registry,
 * the {@link #name()} must be listed there,
 * and it “must be the MIME-preferred name” if there are multiple names.
 *
 * <p>However, there are different implementations of many if not most charsets,
 * ICU provides multiple variants for some of them,
 * ICU provides variants of some java.nio-system-supported charsets,
 * and ICU users are free to add more variants.
 * This is so that applications can be compatible with multiple implementations at the same time.
 *
 * <p>This is in conflict with the {@link Charset#name()} requirements.
 * It is not possible to offer variants of an IANA charset and
 * always use the MIME-preferred name and also have those names be unique.
 *
 * <p>{@link #name()} returns the MIME-preferred name, or IANA name,
 * so that it can always be used for the charset field in internet protocols.
 *
 * <p>Same-name charsets are accessible via {@link Charset#forName} or {@link #forNameICU}
 * by using unique aliases (e.g., the ICU-canonical names).
 *
 * <p>{@link Charset} also documents that
 * “Two charsets are equal if, and only if, they have the same canonical names.”
 * This is not possible.
 *
 * <p>Unfortunately, {@link Charset#equals} is final, and
 * {@link Charset#availableCharsets} returns
 * “a sorted map from canonical charset names to charset objects”.
 * Since {@link #name()} cannot be unique,
 * {@link #equals} cannot work properly in such cases, and
 * {@link Charset#availableCharsets} can only include one variant for a name.
 *
 * @stable ICU 3.6
 */
public abstract class CharsetICU extends Charset{

    // NOTE(review): the "+offset: size" remarks on the fields below look like they were
    // carried over from a native structure layout; confirm before relying on them.
    String icuCanonicalName;
    int options;

    float  maxCharsPerByte;

    String name; /* +4: 60  internal name of the converter- invariant chars */

    int codepage;               /* +64: 4 codepage # (now IBM-$codepage) */

    byte platform;                /* +68: 1 platform of the converter (only IBM now) */
    byte conversionType;          /* +69: 1 conversion type */

    int minBytesPerChar;         /* +70: 1 Minimum # bytes per char in this codepage */
    int maxBytesPerChar;         /* +71: 1 Maximum # bytes output per UChar in this codepage */

    byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4  [note:  4 and 8 byte boundary] */
    byte subCharLen;              /* +76: 1 */

    byte hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
    byte hasFromUnicodeFallback; /* +78: 1 */
    short unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */
    byte subChar1;               /* +80: 1  single-byte substitution character for IBM MBCS (0 if none) */
    //byte reserved[/*19*/];  /* +81: 19 to round out the structure */


    // typedef enum UConverterUnicodeSet {
    /**
     * Parameter that selects the set of roundtrippable Unicode code points.
     * @stable ICU 4.0
     */
    public static final int ROUNDTRIP_SET=0;
    /**
     * Select the set of Unicode code points with roundtrip or fallback mappings.
     * Not supported at this point.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final int ROUNDTRIP_AND_FALLBACK_SET =1;

    //} UConverterUnicodeSet;

    /**
     * Creates a charset with the given names.
     *
     * @param icuCanonicalName the ICU canonical name, stored in {@link #icuCanonicalName}
     * @param canonicalName the canonical name handed to the {@link Charset} constructor
     * @param aliases the aliases handed to the {@link Charset} constructor
     * @throws IllegalCharsetNameException if <code>canonicalName</code> is empty
     * @stable ICU 3.6
     */
    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
        super(canonicalName,aliases);
        if(canonicalName.length() == 0){
            throw new IllegalCharsetNameException(canonicalName);
        }
        this.icuCanonicalName  = icuCanonicalName;
    }

    /**
     * Ascertains if a charset is a sub set of this charset.
     * Implements the abstract method of super class.
     * This implementation only reports containment for a charset that is
     * {@link Charset#equals equal} to this one; <code>null</code> is never contained.
     * @param cs charset to test
     * @return true if the given charset is a subset of this charset
     * @stable ICU 3.6
     */
    @Override
    public boolean contains(Charset cs){
        return cs != null && this.equals(cs);
    }

    /**
     * Maps ICU canonical converter names to the fully qualified name of the class
     * implementing them. Names that are not listed here are handled by
     * <code>CharsetMBCS</code> (see {@link #getCharset}).
     */
    private static final HashMap<String, String> algorithmicCharsets = new HashMap<String, String>();
    static{
        algorithmicCharsets.put("LMBCS-1",               "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-2",               "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-3",               "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-4",               "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-5",               "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-6",               "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-8",               "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-11",              "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-16",              "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-17",              "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-18",              "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("LMBCS-19",              "com.ibm.icu.charset.CharsetLMBCS");
        algorithmicCharsets.put("BOCU-1",                "com.ibm.icu.charset.CharsetBOCU1" );
        algorithmicCharsets.put("SCSU",                  "com.ibm.icu.charset.CharsetSCSU" );
        algorithmicCharsets.put("US-ASCII",              "com.ibm.icu.charset.CharsetASCII" );
        algorithmicCharsets.put("ISO-8859-1",            "com.ibm.icu.charset.Charset88591" );
        algorithmicCharsets.put("UTF-16",                "com.ibm.icu.charset.CharsetUTF16" );
        algorithmicCharsets.put("UTF-16BE",              "com.ibm.icu.charset.CharsetUTF16BE" );
        algorithmicCharsets.put("UTF-16BE,version=1",    "com.ibm.icu.charset.CharsetUTF16BE" );
        algorithmicCharsets.put("UTF-16LE",              "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF-16LE,version=1",    "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF16" );
        algorithmicCharsets.put("UTF-32",                "com.ibm.icu.charset.CharsetUTF32" );
        algorithmicCharsets.put("UTF-32BE",              "com.ibm.icu.charset.CharsetUTF32BE" );
        algorithmicCharsets.put("UTF-32LE",              "com.ibm.icu.charset.CharsetUTF32LE" );
        algorithmicCharsets.put("UTF32_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF32LE" );
        algorithmicCharsets.put("UTF32_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF32" );
        algorithmicCharsets.put("UTF-8",                 "com.ibm.icu.charset.CharsetUTF8" );
        algorithmicCharsets.put("CESU-8",                "com.ibm.icu.charset.CharsetCESU8" );
        algorithmicCharsets.put("UTF-7",                 "com.ibm.icu.charset.CharsetUTF7" );
        algorithmicCharsets.put("ISCII,version=0",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=1",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=2",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=3",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=4",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=5",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=6",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=7",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("ISCII,version=8",       "com.ibm.icu.charset.CharsetISCII" );
        algorithmicCharsets.put("IMAP-mailbox-name",     "com.ibm.icu.charset.CharsetUTF7" );
        algorithmicCharsets.put("HZ",                    "com.ibm.icu.charset.CharsetHZ" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=2",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=3",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ja,version=4",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=zh,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=zh,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=zh,version=2",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ko,version=0",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("ISO_2022,locale=ko,version=1",               "com.ibm.icu.charset.CharsetISO2022" );
        algorithmicCharsets.put("x11-compound-text",                          "com.ibm.icu.charset.CharsetCompoundText" );
    }

    /**
     * Instantiates, via reflection, the charset implementation registered for an
     * ICU canonical name. The implementation class is looked up in
     * {@link #algorithmicCharsets}; any other name is loaded as
     * <code>com.ibm.icu.charset.CharsetMBCS</code>. The class must expose a public
     * <code>(String, String, String[])</code> constructor.
     *
     * @param icuCanonicalName the ICU canonical name, also used to select the implementation class
     * @param javaCanonicalName the canonical name passed to the implementation's constructor
     * @param aliases the aliases passed to the implementation's constructor
     * @return the newly constructed charset
     * @throws UnsupportedCharsetException if the implementation class cannot be found,
     *         lacks the expected constructor, cannot be instantiated, or its constructor
     *         throws; the underlying failure is attached as the cause
     */
    /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        String className = algorithmicCharsets.get(icuCanonicalName);
        if(className==null){
            //all the cnv files are loaded as MBCS
            className = "com.ibm.icu.charset.CharsetMBCS";
        }
        final String loadMessage = icuCanonicalName + ": " + "Could not load " + className;
        try{
            Class<? extends CharsetICU> cs = Class.forName(className).asSubclass(CharsetICU.class);
            Class<?>[] paramTypes = new Class<?>[]{ String.class, String.class, String[].class };
            final Constructor<? extends CharsetICU> c = cs.getConstructor(paramTypes);
            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases };

            // Run constructor
            try {
                // newInstance never returns null, so the result can be returned directly.
                return c.newInstance(params);
            }catch (InvocationTargetException e) {
                // The constructor itself failed: report what it threw.
                Throwable cause = e.getCause();
                throw loadFailure(loadMessage + ". Exception: " + cause, cause);
            }
        }catch(ClassNotFoundException ex){
            throw loadFailure(loadMessage, ex);
        }catch(NoSuchMethodException ex){
            throw loadFailure(loadMessage, ex);
        }catch (IllegalAccessException ex){
            throw loadFailure(loadMessage, ex);
        }catch (InstantiationException ex){
            throw loadFailure(loadMessage, ex);
        }
    }

    /**
     * Builds the exception reported when a charset implementation cannot be loaded,
     * keeping the underlying failure as its cause.
     */
    private static UnsupportedCharsetException loadFailure(String message, Throwable cause) {
        UnsupportedCharsetException failure = new UnsupportedCharsetException(message);
        failure.initCause(cause);
        return failure;
    }

    /**
     * Tests whether a code point lies in the surrogate range U+D800..U+DFFF.
     *
     * @param c the code point to test
     * @return true if <code>c</code> is a lead or trail surrogate
     */
    static final boolean isSurrogate(int c){
        // Masking off the low 11 bits leaves 0xd800 exactly for 0xd800..0xdfff.
        return (((c)&0xfffff800)==0xd800);
    }

    /*
     * Returns the default charset name
     */
//    static final String getDefaultCharsetName(){
//        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
//        return defaultEncoding;
//    }

    /**
     * Returns a charset object for the named charset.
     * This method guarantees that the ICU charset is returned when
     * available.  If the ICU charset provider does not support
     * the specified charset, then try other charset providers
     * including the standard Java charset provider.
     *
     * @param charsetName The name of the requested charset,
     * may be either a canonical name or an alias
     * @return A charset object for the named charset
     * @throws IllegalCharsetNameException If the given charset name
     * is illegal
     * @throws UnsupportedCharsetException If no support for the
     * named charset is available in this instance of the Java
     * virtual machine
     * @stable ICU 3.6
     */
    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
        CharsetProviderICU icuProvider = new CharsetProviderICU();
        CharsetICU cs = (CharsetICU) icuProvider.charsetForName(charsetName);
        if (cs != null) {
            return cs;
        }
        // Not provided by ICU: fall back to the regular charset lookup.
        return Charset.forName(charsetName);
    }

//    /**
//     * @see java.lang.Comparable#compareTo(java.lang.Object)
//     * @stable 3.8
//     */
//    public int compareTo(Object otherObj) {
//        if (!(otherObj instanceof CharsetICU)) {
//            return -1;
//        }
//        return icuCanonicalName.compareTo(((CharsetICU)otherObj).icuCanonicalName);
//    }

    /*
     * (Plain block comment on purpose: the method it describes is commented out, and a
     * doc comment here would be attached to the next declaration instead.)
     *
     * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
     * start of the stream for example U+FEFF (the Unicode BOM/signature
     * character) that can be ignored.
     *
     * Detects Unicode signature byte sequences at the start of the byte stream
     * and returns number of bytes of the BOM of the indicated Unicode charset.
     * 0 is returned when no Unicode signature is recognized.
     *
     */
    // TODO This should be proposed as CharsetDecoderICU API.
//    static String detectUnicodeSignature(ByteBuffer source) {
//        int signatureLength = 0; // number of bytes of the signature
//        final int SIG_MAX_LEN = 5;
//        String sigUniCharset = null; // states what unicode charset is the BOM
//        int i = 0;
//
//        /*
//         * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
//         * don't misdetect something
//         */
//        byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
//                (byte) 0xa5 };
//
//        while (i < source.remaining() && i < SIG_MAX_LEN) {
//            start[i] = source.get(i);
//            i++;
//        }
//
//        if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
//            signatureLength = 2;
//            sigUniCharset = "UTF-16BE";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
//            if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
//                signatureLength = 4;
//                sigUniCharset = "UTF-32LE";
//                source.position(signatureLength);
//                return sigUniCharset;
//            } else {
//                signatureLength = 2;
//                sigUniCharset = "UTF-16LE";
//                source.position(signatureLength);
//                return sigUniCharset;
//            }
//        } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
//                && start[2] == (byte) 0xBF) {
//            signatureLength = 3;
//            sigUniCharset = "UTF-8";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
//                && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
//            signatureLength = 4;
//            sigUniCharset = "UTF-32BE";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
//                && start[2] == (byte) 0xFF) {
//            signatureLength = 3;
//            sigUniCharset = "SCSU";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
//                && start[2] == (byte) 0x28) {
//            signatureLength = 3;
//            sigUniCharset = "BOCU-1";
//            source.position(signatureLength);
//            return sigUniCharset;
//        } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
//                && start[2] == (byte) 0x76) {
//
//            if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
//                signatureLength = 5;
//                sigUniCharset = "UTF-7";
//                source.position(signatureLength);
//                return sigUniCharset;
//            } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
//                    || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
//                signatureLength = 4;
//                sigUniCharset = "UTF-7";
//                source.position(signatureLength);
//                return sigUniCharset;
//            }
//        } else if (start[0] == (byte) 0xDD && start[1] == (byte) 0x73
//                && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
//            signatureLength = 4;
//            sigUniCharset = "UTF-EBCDIC";
//            source.position(signatureLength);
//            return sigUniCharset;
//        }
//
//        /* no known Unicode signature byte sequence recognized */
//        return null;
//    }


    /**
     * Subclass hook behind {@link #getUnicodeSet}: called with the already validated
     * arguments after <code>setFillIn</code> has been cleared.
     *
     * @param setFillIn the set to fill in
     * @param which the set selector that was passed to {@link #getUnicodeSet}
     */
    abstract void getUnicodeSetImpl(UnicodeSet setFillIn, int which);

    /**
     * Returns the set of Unicode code points that can be converted by an ICU Converter.
     *
     * <p>The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET): The set of all Unicode code points that can be
     * roundtrip-converted (converted without any data loss) with the converter. This set will not include code points that have fallback
     * mappings or are only the result of reverse fallback mappings.  See UTR #22 "Character Mapping Markup Language" at  <a href="http://www.unicode.org/reports/tr22/">http://www.unicode.org/reports/tr22/</a>
     *
     * <p>In the future, there may be more UConverterUnicodeSet choices to select sets with different properties.
     *
     * <p>This is useful for example for
     * <ul><li>checking that a string or document can be roundtrip-converted with a converter,
     *   without/before actually performing the conversion</li>
     * <li>testing if a converter can be used for text for typical text for a certain locale,
     *   by comparing its roundtrip set with the set of ExemplarCharacters from
     *   ICU's locale data or other sources</li></ul>
     *
     * @param setFillIn A valid UnicodeSet. It will be cleared by this function before
     *                   the converter's specific set is filled in.
     * @param which A selector; currently ROUNDTRIP_SET is the only supported value.
     * @throws IllegalArgumentException if <code>setFillIn</code> is null or
     *         <code>which</code> is not {@link #ROUNDTRIP_SET}.
     * @stable ICU 4.0
     */
    public void getUnicodeSet(UnicodeSet setFillIn, int which){
        if (setFillIn == null) {
            throw new IllegalArgumentException("setFillIn must not be null");
        }
        if (which != ROUNDTRIP_SET) {
            throw new IllegalArgumentException("Unsupported set selector: " + which);
        }
        setFillIn.clear();
        getUnicodeSetImpl(setFillIn, which);
    }

    /**
     * Returns whether or not the charset of the converter has a fixed number of bytes
     * per charset character.
     * An example of this are converters that are of the type UCNV_SBCS or UCNV_DBCS.
     * Another example is UTF-32 which is always 4 bytes per character.  A UTF-32 code point
     * may represent more than one UTF-8 or UTF-16 code units but always have size of 4 bytes.
     * Note: This method is not intended to be used to determine whether the charset has a
     * fixed ratio of bytes to Unicode codes units for any particular Unicode encoding form.
     *
     * <p>This implementation answers true for <code>CharsetASCII</code> and
     * <code>CharsetUTF32</code> instances, and for <code>CharsetMBCS</code> instances whose
     * static data has equal minimum and maximum bytes per character.
     *
     * @return true if the converter is fixed-width
     * @stable ICU 4.8
     */
    public boolean isFixedWidth() {
        if (this instanceof CharsetASCII || this instanceof CharsetUTF32) {
            return true;
        }

        if (this instanceof CharsetMBCS) {
            CharsetMBCS mbcs = (CharsetMBCS) this;
            return mbcs.sharedData.staticData.maxBytesPerChar == mbcs.sharedData.staticData.minBytesPerChar;
        }

        return false;
    }

    /**
     * Adds every code point except the surrogates U+D800..U+DFFF to the given set.
     *
     * @param setFillIn the set to add to; existing contents are kept
     */
    static void getNonSurrogateUnicodeSet(UnicodeSet setFillIn){
        setFillIn.add(0, 0xd7ff);
        setFillIn.add(0xe000, 0x10ffff);
    }

    /**
     * Adds the complete code point range U+0000..U+10FFFF to the given set.
     *
     * @param setFillIn the set to add to
     */
    static void getCompleteUnicodeSet(UnicodeSet setFillIn){
        setFillIn.add(0, 0x10ffff);
    }
}