1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. Oracle designates this 9 * particular file as subject to the "Classpath" exception as provided 10 * by Oracle in the LICENSE file that accompanied this code. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 25 */ 26 27 package java.nio.charset; 28 29 import java.io.UnsupportedEncodingException; 30 import libcore.icu.NativeConverter; 31 import java.nio.ByteBuffer; 32 import java.nio.CharBuffer; 33 import java.nio.charset.spi.CharsetProvider; 34 import java.security.AccessController; 35 import java.security.AccessControlException; 36 import java.security.PrivilegedAction; 37 import java.util.AbstractMap; 38 import java.util.Collections; 39 import java.util.HashMap; 40 import java.util.HashSet; 41 import java.util.Iterator; 42 import java.util.Locale; 43 import java.util.Map; 44 import java.util.NoSuchElementException; 45 import java.util.Set; 46 import java.util.ServiceLoader; 47 import java.util.ServiceConfigurationError; 48 import java.util.SortedMap; 49 import java.util.TreeMap; 50 import sun.misc.ASCIICaseInsensitiveComparator; 51 import sun.nio.cs.ThreadLocalCoders; 52 import sun.security.action.GetPropertyAction; 53 54 55 /** 56 * A named mapping between sequences of sixteen-bit Unicode <a 57 * href="../../lang/Character.html#unicode">code units</a> and sequences of 58 * bytes. This class defines methods for creating decoders and encoders and 59 * for retrieving the various names associated with a charset. Instances of 60 * this class are immutable. 61 * 62 * <p> This class also defines static methods for testing whether a particular 63 * charset is supported, for locating charset instances by name, and for 64 * constructing a map that contains every charset for which support is 65 * available in the current Java virtual machine. Support for new charsets can 66 * be added via the service-provider interface defined in the {@link 67 * java.nio.charset.spi.CharsetProvider} class. 68 * 69 * <p> All of the methods defined in this class are safe for use by multiple 70 * concurrent threads. 71 * 72 * 73 * <a name="names"></a><a name="charenc"></a> 74 * <h2>Charset names</h2> 75 * 76 * <p> Charsets are named by strings composed of the following characters: 77 * 78 * <ul> 79 * 80 * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt> 81 * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>), 82 * 83 * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt> 84 * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>), 85 * 86 * <li> The digits <tt>'0'</tt> through <tt>'9'</tt> 87 * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>), 88 * 89 * <li> The dash character <tt>'-'</tt> 90 * (<tt>'\u002d'</tt>, <small>HYPHEN-MINUS</small>), 91 * 92 * <li> The plus character <tt>'+'</tt> 93 * (<tt>'\u002b'</tt>, <small>PLUS SIGN</small>), 94 * 95 * <li> The period character <tt>'.'</tt> 96 * (<tt>'\u002e'</tt>, <small>FULL STOP</small>), 97 * 98 * <li> The colon character <tt>':'</tt> 99 * (<tt>'\u003a'</tt>, <small>COLON</small>), and 100 * 101 * <li> The underscore character <tt>'_'</tt> 102 * (<tt>'\u005f'</tt>, <small>LOW LINE</small>). 103 * 104 * </ul> 105 * 106 * A charset name must begin with either a letter or a digit. The empty string 107 * is not a legal charset name. Charset names are not case-sensitive; that is, 108 * case is always ignored when comparing charset names. Charset names 109 * generally follow the conventions documented in <a 110 * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278: IANA Charset 111 * Registration Procedures</i></a>. 112 * 113 * <p> Every charset has a <i>canonical name</i> and may also have one or more 114 * <i>aliases</i>. The canonical name is returned by the {@link #name() name} method 115 * of this class. Canonical names are, by convention, usually in upper case. 116 * The aliases of a charset are returned by the {@link #aliases() aliases} 117 * method. 118 * 119 * <p><a name="hn">Some charsets have an <i>historical name</i> that is defined for 120 * compatibility with previous versions of the Java platform.</a> A charset's 121 * historical name is either its canonical name or one of its aliases. The 122 * historical name is returned by the <tt>getEncoding()</tt> methods of the 123 * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link 124 * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes. 125 * 126 * <p><a name="iana"> </a>If a charset listed in the <a 127 * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset 128 * Registry</i></a> is supported by an implementation of the Java platform then 129 * its canonical name must be the name listed in the registry. Many charsets 130 * are given more than one name in the registry, in which case the registry 131 * identifies one of the names as <i>MIME-preferred</i>. If a charset has more 132 * than one registry name then its canonical name must be the MIME-preferred 133 * name and the other names in the registry must be valid aliases. If a 134 * supported charset is not listed in the IANA registry then its canonical name 135 * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>. 136 * 137 * <p> The IANA charset registry does change over time, and so the canonical 138 * name and the aliases of a particular charset may also change over time. To 139 * ensure compatibility it is recommended that no alias ever be removed from a 140 * charset, and that if the canonical name of a charset is changed then its 141 * previous canonical name be made into an alias. 142 * 143 * 144 * <h2>Standard charsets</h2> 145 * 146 * 147 * 148 * <p><a name="standard">Every implementation of the Java platform is required to support the 149 * following standard charsets.</a> Consult the release documentation for your 150 * implementation to see if any other charsets are supported. The behavior 151 * of such optional charsets may differ between implementations. 152 * 153 * <blockquote><table width="80%" summary="Description of standard charsets"> 154 * <tr><th align="left">Charset</th><th align="left">Description</th></tr> 155 * <tr><td valign=top><tt>US-ASCII</tt></td> 156 * <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>, 157 * a.k.a. the Basic Latin block of the Unicode character set</td></tr> 158 * <tr><td valign=top><tt>ISO-8859-1 </tt></td> 159 * <td>ISO Latin Alphabet No. 1, a.k.a. <tt>ISO-LATIN-1</tt></td></tr> 160 * <tr><td valign=top><tt>UTF-8</tt></td> 161 * <td>Eight-bit UCS Transformation Format</td></tr> 162 * <tr><td valign=top><tt>UTF-16BE</tt></td> 163 * <td>Sixteen-bit UCS Transformation Format, 164 * big-endian byte order</td></tr> 165 * <tr><td valign=top><tt>UTF-16LE</tt></td> 166 * <td>Sixteen-bit UCS Transformation Format, 167 * little-endian byte order</td></tr> 168 * <tr><td valign=top><tt>UTF-16</tt></td> 169 * <td>Sixteen-bit UCS Transformation Format, 170 * byte order identified by an optional byte-order mark</td></tr> 171 * </table></blockquote> 172 * 173 * <p> The <tt>UTF-8</tt> charset is specified by <a 174 * href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279</i></a>; the 175 * transformation format upon which it is based is specified in 176 * Amendment 2 of ISO 10646-1 and is also described in the <a 177 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode 178 * Standard</i></a>. 179 * 180 * <p> The <tt>UTF-16</tt> charsets are specified by <a 181 * href="http://www.ietf.org/rfc/rfc2781.txt"><i>RFC 2781</i></a>; the 182 * transformation formats upon which they are based are specified in 183 * Amendment 1 of ISO 10646-1 and are also described in the <a 184 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode 185 * Standard</i></a>. 186 * 187 * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are 188 * therefore sensitive to byte order. In these encodings the byte order of a 189 * stream may be indicated by an initial <i>byte-order mark</i> represented by 190 * the Unicode character <tt>'\uFEFF'</tt>. Byte-order marks are handled 191 * as follows: 192 * 193 * <ul> 194 * 195 * <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt> 196 * charsets interpret the initial byte-order marks as a <small>ZERO-WIDTH 197 * NON-BREAKING SPACE</small>; when encoding, they do not write 198 * byte-order marks. </p></li> 199 200 * 201 * <li><p> When decoding, the <tt>UTF-16</tt> charset interprets the 202 * byte-order mark at the beginning of the input stream to indicate the 203 * byte-order of the stream but defaults to big-endian if there is no 204 * byte-order mark; when encoding, it uses big-endian byte order and writes 205 * a big-endian byte-order mark. </p></li> 206 * 207 * </ul> 208 * 209 * In any case, byte order marks occurring after the first element of an 210 * input sequence are not omitted since the same code is used to represent 211 * <small>ZERO-WIDTH NON-BREAKING SPACE</small>. 212 * 213 * <p>Android note: The Android platform default is always UTF-8. 214 * 215 * <p>The {@link StandardCharsets} class defines constants for each of the 216 * standard charsets. 217 * 218 * <h2>Terminology</h2> 219 * 220 * <p> The name of this class is taken from the terms used in 221 * <a href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278</i></a>. 222 * In that document a <i>charset</i> is defined as the combination of 223 * one or more coded character sets and a character-encoding scheme. 224 * (This definition is confusing; some other software systems define 225 * <i>charset</i> as a synonym for <i>coded character set</i>.) 226 * 227 * <p> A <i>coded character set</i> is a mapping between a set of abstract 228 * characters and a set of integers. US-ASCII, ISO 8859-1, 229 * JIS X 0201, and Unicode are examples of coded character sets. 230 * 231 * <p> Some standards have defined a <i>character set</i> to be simply a 232 * set of abstract characters without an associated assigned numbering. 233 * An alphabet is an example of such a character set. However, the subtle 234 * distinction between <i>character set</i> and <i>coded character set</i> 235 * is rarely used in practice; the former has become a short form for the 236 * latter, including in the Java API specification. 237 * 238 * <p> A <i>character-encoding scheme</i> is a mapping between one or more 239 * coded character sets and a set of octet (eight-bit byte) sequences. 240 * UTF-8, UTF-16, ISO 2022, and EUC are examples of 241 * character-encoding schemes. Encoding schemes are often associated with 242 * a particular coded character set; UTF-8, for example, is used only to 243 * encode Unicode. Some schemes, however, are associated with multiple 244 * coded character sets; EUC, for example, can be used to encode 245 * characters in a variety of Asian coded character sets. 246 * 247 * <p> When a coded character set is used exclusively with a single 248 * character-encoding scheme then the corresponding charset is usually 249 * named for the coded character set; otherwise a charset is usually named 250 * for the encoding scheme and, possibly, the locale of the coded 251 * character sets that it supports. Hence <tt>US-ASCII</tt> is both the 252 * name of a coded character set and of the charset that encodes it, while 253 * <tt>EUC-JP</tt> is the name of the charset that encodes the 254 * JIS X 0201, JIS X 0208, and JIS X 0212 255 * coded character sets for the Japanese language. 256 * 257 * <p> The native character encoding of the Java programming language is 258 * UTF-16. A charset in the Java platform therefore defines a mapping 259 * between sequences of sixteen-bit UTF-16 code units (that is, sequences 260 * of chars) and sequences of bytes. </p> 261 * 262 * 263 * @author Mark Reinhold 264 * @author JSR-51 Expert Group 265 * @since 1.4 266 * 267 * @see CharsetDecoder 268 * @see CharsetEncoder 269 * @see java.nio.charset.spi.CharsetProvider 270 * @see java.lang.Character 271 */ 272 273 public abstract class Charset 274 implements Comparable<Charset> 275 { 276 277 /* -- Static methods -- */ 278 279 private static volatile String bugLevel = null; 280 atBugLevel(String bl)281 static boolean atBugLevel(String bl) { // package-private 282 String level = bugLevel; 283 if (level == null) { 284 if (!sun.misc.VM.isBooted()) 285 return false; 286 bugLevel = level = AccessController.doPrivileged( 287 new GetPropertyAction("sun.nio.cs.bugLevel", "")); 288 } 289 return level.equals(bl); 290 } 291 292 /** 293 * Checks that the given string is a legal charset name. </p> 294 * 295 * @param s 296 * A purported charset name 297 * 298 * @throws IllegalCharsetNameException 299 * If the given name is not a legal charset name 300 */ checkName(String s)301 private static void checkName(String s) { 302 int n = s.length(); 303 if (!atBugLevel("1.4")) { 304 if (n == 0) 305 throw new IllegalCharsetNameException(s); 306 } 307 for (int i = 0; i < n; i++) { 308 char c = s.charAt(i); 309 if (c >= 'A' && c <= 'Z') continue; 310 if (c >= 'a' && c <= 'z') continue; 311 if (c >= '0' && c <= '9') continue; 312 if (c == '-' && i != 0) continue; 313 if (c == '+' && i != 0) continue; 314 if (c == ':' && i != 0) continue; 315 if (c == '_' && i != 0) continue; 316 if (c == '.' && i != 0) continue; 317 throw new IllegalCharsetNameException(s); 318 } 319 } 320 321 /* The standard set of charsets */ 322 // Android-removed: We use ICU's list of standard charsets. 323 // private static CharsetProvider standardProvider = new StandardCharsets(); 324 325 // Cache of the most-recently-returned charsets, 326 // along with the names that were used to find them 327 // 328 // cache1/2 usage is explained in the lookup method 329 // 330 private static volatile Map.Entry<String, Charset> cache1 = null; // "Level 1" cache 331 private static final HashMap<String, Charset> cache2 = new HashMap<>(); // "Level 2" cache 332 cache(String charsetName, Charset cs)333 private static void cache(String charsetName, Charset cs) { 334 synchronized(cache2) { 335 String canonicalName = cs.name(); 336 Charset canonicalCharset = cache2.get(canonicalName); 337 338 if (canonicalCharset != null) { 339 cs = canonicalCharset; 340 } else { 341 cache2.put(canonicalName, cs); 342 343 for (String alias : cs.aliases()) { 344 cache2.put(alias, cs); 345 } 346 } 347 348 cache2.put(charsetName, cs); 349 } 350 351 cache1 = new AbstractMap.SimpleImmutableEntry<>(charsetName, cs); 352 } 353 354 // Creates an iterator that walks over the available providers, ignoring 355 // those whose lookup or instantiation causes a security exception to be 356 // thrown. Should be invoked with full privileges. 357 // providers()358 private static Iterator<CharsetProvider> providers() { 359 return new Iterator<CharsetProvider>() { 360 361 ServiceLoader<CharsetProvider> sl = 362 ServiceLoader.load(CharsetProvider.class); 363 Iterator<CharsetProvider> i = sl.iterator(); 364 365 CharsetProvider next = null; 366 367 private boolean getNext() { 368 while (next == null) { 369 try { 370 if (!i.hasNext()) 371 return false; 372 next = i.next(); 373 } catch (ServiceConfigurationError sce) { 374 if (sce.getCause() instanceof SecurityException) { 375 // Ignore security exceptions 376 continue; 377 } 378 throw sce; 379 } 380 } 381 return true; 382 } 383 384 public boolean hasNext() { 385 return getNext(); 386 } 387 388 public CharsetProvider next() { 389 if (!getNext()) 390 throw new NoSuchElementException(); 391 CharsetProvider n = next; 392 next = null; 393 return n; 394 } 395 396 public void remove() { 397 throw new UnsupportedOperationException(); 398 } 399 400 }; 401 } 402 403 // Thread-local gate to prevent recursive provider lookups 404 private static ThreadLocal<ThreadLocal<?>> gate = 405 new ThreadLocal<ThreadLocal<?>>(); 406 lookupViaProviders(final String charsetName)407 private static Charset lookupViaProviders(final String charsetName) { 408 409 // The runtime startup sequence looks up standard charsets as a 410 // consequence of the VM's invocation of System.initializeSystemClass 411 // in order to, e.g., set system properties and encode filenames. At 412 // that point the application class loader has not been initialized, 413 // however, so we can't look for providers because doing so will cause 414 // that loader to be prematurely initialized with incomplete 415 // information. 416 // 417 if (!sun.misc.VM.isBooted()) 418 return null; 419 420 if (gate.get() != null) 421 // Avoid recursive provider lookups 422 return null; 423 try { 424 gate.set(gate); 425 426 return AccessController.doPrivileged( 427 new PrivilegedAction<Charset>() { 428 public Charset run() { 429 for (Iterator<CharsetProvider> i = providers(); 430 i.hasNext();) { 431 CharsetProvider cp = i.next(); 432 Charset cs = cp.charsetForName(charsetName); 433 if (cs != null) 434 return cs; 435 } 436 return null; 437 } 438 }); 439 440 } finally { 441 gate.set(null); 442 } 443 } 444 445 // Android-removed: Remove support for the extended charset provider. 446 // 447 /* The extended set of charsets */ 448 // private static Object extendedProviderLock = new Object(); 449 // private static boolean extendedProviderProbed = false; 450 // private static CharsetProvider extendedProvider = null; 451 // 452 // private static void probeExtendedProvider() { 453 // AccessController.doPrivileged(new PrivilegedAction<Object>() { 454 // public Object run() { 455 // try { 456 // Class epc 457 // = Class.forName("sun.nio.cs.ext.ExtendedCharsets"); 458 // extendedProvider = (CharsetProvider)epc.newInstance(); 459 // } catch (ClassNotFoundException x) { 460 // // Extended charsets not available 461 // // (charsets.jar not present) 462 // } catch (InstantiationException x) { 463 // throw new Error(x); 464 // } catch (IllegalAccessException x) { 465 // throw new Error(x); 466 // } 467 // return null; 468 // } 469 // }); 470 // } 471 // 472 // private static Charset lookupExtendedCharset(String charsetName) { 473 // CharsetProvider ecp = null; 474 // synchronized (extendedProviderLock) { 475 // if (!extendedProviderProbed) { 476 // probeExtendedProvider(); 477 // extendedProviderProbed = true; 478 // } 479 // ecp = extendedProvider; 480 // } 481 // return (ecp != null) ? ecp.charsetForName(charsetName) : null; 482 // } 483 484 // We expect most programs to use one Charset repeatedly, so the most recently used Charset 485 // instance is stored in the level 1 cache. We convey a hint to this effect to the VM by putting 486 // the level 1 cache miss code in a separate method. Since charsetName is not necessarily in 487 // canonical form, we store the mapping from both the canonical name and the aliases to the 488 // instance in a map for level 2 cache. 489 private static Charset lookup(String charsetName) { 490 if (charsetName == null) 491 throw new IllegalArgumentException("Null charset name"); 492 493 494 final Map.Entry<String, Charset> cached = cache1; 495 if (cached != null && charsetName.equals(cached.getKey())) 496 return cached.getValue(); 497 return lookup2(charsetName); 498 } 499 500 private static Charset lookup2(String charsetName) { 501 Charset cs; 502 synchronized (cache2) { 503 if ((cs = cache2.get(charsetName)) != null) { 504 cache1 = new AbstractMap.SimpleImmutableEntry<>(charsetName, cs); 505 return cs; 506 } 507 } 508 509 // Android-changed: Drop support for "standard" and "extended" 510 // providers. 511 if ((cs = NativeConverter.charsetForName(charsetName)) != null || 512 (cs = lookupViaProviders(charsetName)) != null) 513 { 514 cache(charsetName, cs); 515 return cs; 516 } 517 518 /* Only need to check the name if we didn't find a charset for it */ 519 checkName(charsetName); 520 return null; 521 } 522 523 /** 524 * Tells whether the named charset is supported. 525 * 526 * @param charsetName 527 * The name of the requested charset; may be either 528 * a canonical name or an alias 529 * 530 * @return <tt>true</tt> if, and only if, support for the named charset 531 * is available in the current Java virtual machine 532 * 533 * @throws IllegalCharsetNameException 534 * If the given charset name is illegal 535 * 536 * @throws IllegalArgumentException 537 * If the given <tt>charsetName</tt> is null 538 */ 539 public static boolean isSupported(String charsetName) { 540 return (lookup(charsetName) != null); 541 } 542 543 /** 544 * Returns a charset object for the named charset. 545 * 546 * @param charsetName 547 * The name of the requested charset; may be either 548 * a canonical name or an alias 549 * 550 * @return A charset object for the named charset 551 * 552 * @throws IllegalCharsetNameException 553 * If the given charset name is illegal 554 * 555 * @throws IllegalArgumentException 556 * If the given <tt>charsetName</tt> is null 557 * 558 * @throws UnsupportedCharsetException 559 * If no support for the named charset is available 560 * in this instance of the Java virtual machine 561 */ 562 public static Charset forName(String charsetName) { 563 Charset cs = lookup(charsetName); 564 if (cs != null) 565 return cs; 566 throw new UnsupportedCharsetException(charsetName); 567 } 568 569 570 /** 571 * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException}, 572 * which is all pre-nio code claims to throw. 573 * 574 * @hide internal use only 575 */ 576 public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException { 577 try { 578 return Charset.forName(charsetName); 579 } catch (Exception cause) { 580 UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName); 581 ex.initCause(cause); 582 throw ex; 583 } 584 } 585 586 587 // Fold charsets from the given iterator into the given map, ignoring 588 // charsets whose names already have entries in the map. 589 // 590 private static void put(Iterator<Charset> i, Map<String,Charset> m) { 591 while (i.hasNext()) { 592 Charset cs = i.next(); 593 if (!m.containsKey(cs.name())) 594 m.put(cs.name(), cs); 595 } 596 } 597 598 /** 599 * Constructs a sorted map from canonical charset names to charset objects. 600 * 601 * <p> The map returned by this method will have one entry for each charset 602 * for which support is available in the current Java virtual machine. If 603 * two or more supported charsets have the same canonical name then the 604 * resulting map will contain just one of them; which one it will contain 605 * is not specified. </p> 606 * 607 * <p> The invocation of this method, and the subsequent use of the 608 * resulting map, may cause time-consuming disk or network I/O operations 609 * to occur. This method is provided for applications that need to 610 * enumerate all of the available charsets, for example to allow user 611 * charset selection. This method is not used by the {@link #forName 612 * forName} method, which instead employs an efficient incremental lookup 613 * algorithm. 614 * 615 * <p> This method may return different results at different times if new 616 * charset providers are dynamically made available to the current Java 617 * virtual machine. In the absence of such changes, the charsets returned 618 * by this method are exactly those that can be retrieved via the {@link 619 * #forName forName} method. </p> 620 * 621 * @return An immutable, case-insensitive map from canonical charset names 622 * to charset objects 623 */ 624 public static SortedMap<String,Charset> availableCharsets() { 625 return AccessController.doPrivileged( 626 new PrivilegedAction<SortedMap<String,Charset>>() { 627 public SortedMap<String,Charset> run() { 628 TreeMap<String,Charset> m = 629 new TreeMap<String,Charset>( 630 ASCIICaseInsensitiveComparator.CASE_INSENSITIVE_ORDER); 631 for (String charsetName : NativeConverter.getAvailableCharsetNames()) { 632 Charset charset = NativeConverter.charsetForName(charsetName); 633 m.put(charset.name(), charset); 634 } 635 // Android-changed: No more "standard" provider. 636 // put(standardProvider.charsets(), m); 637 for (Iterator i = providers(); i.hasNext();) { 638 CharsetProvider cp = (CharsetProvider)i.next(); 639 put(cp.charsets(), m); 640 } 641 return Collections.unmodifiableSortedMap(m); 642 } 643 }); 644 } 645 646 private static Charset defaultCharset; 647 648 /** 649 * Returns the default charset of this Java virtual machine. 650 * 651 * <p>Android note: The Android platform default is always UTF-8. 652 * 653 * @return A charset object for the default charset 654 * 655 * @since 1.5 656 */ 657 public static Charset defaultCharset() { 658 // Android-changed: Use UTF_8 unconditionally. 659 synchronized (Charset.class) { 660 if (defaultCharset == null) { 661 defaultCharset = java.nio.charset.StandardCharsets.UTF_8; 662 } 663 664 return defaultCharset; 665 } 666 } 667 668 669 /* -- Instance fields and methods -- */ 670 671 private final String name; // tickles a bug in oldjavac 672 private final String[] aliases; // tickles a bug in oldjavac 673 private Set<String> aliasSet = null; 674 675 /** 676 * Initializes a new charset with the given canonical name and alias 677 * set. 678 * 679 * @param canonicalName 680 * The canonical name of this charset 681 * 682 * @param aliases 683 * An array of this charset's aliases, or null if it has no aliases 684 * 685 * @throws IllegalCharsetNameException 686 * If the canonical name or any of the aliases are illegal 687 */ 688 protected Charset(String canonicalName, String[] aliases) { 689 checkName(canonicalName); 690 String[] as = (aliases == null) ? new String[0] : aliases; 691 for (int i = 0; i < as.length; i++) 692 checkName(as[i]); 693 this.name = canonicalName; 694 this.aliases = as; 695 } 696 697 /** 698 * Returns this charset's canonical name. 699 * 700 * @return The canonical name of this charset 701 */ 702 public final String name() { 703 return name; 704 } 705 706 /** 707 * Returns a set containing this charset's aliases. 708 * 709 * @return An immutable set of this charset's aliases 710 */ 711 public final Set<String> aliases() { 712 if (aliasSet != null) 713 return aliasSet; 714 int n = aliases.length; 715 HashSet<String> hs = new HashSet<String>(n); 716 for (int i = 0; i < n; i++) 717 hs.add(aliases[i]); 718 aliasSet = Collections.unmodifiableSet(hs); 719 return aliasSet; 720 } 721 722 /** 723 * Returns this charset's human-readable name for the default locale. 724 * 725 * <p> The default implementation of this method simply returns this 726 * charset's canonical name. Concrete subclasses of this class may 727 * override this method in order to provide a localized display name. </p> 728 * 729 * @return The display name of this charset in the default locale 730 */ 731 public String displayName() { 732 return name; 733 } 734 735 /** 736 * Tells whether or not this charset is registered in the <a 737 * href="http://www.iana.org/assignments/character-sets">IANA Charset 738 * Registry</a>. 739 * 740 * @return <tt>true</tt> if, and only if, this charset is known by its 741 * implementor to be registered with the IANA 742 */ 743 public final boolean isRegistered() { 744 return !name.startsWith("X-") && !name.startsWith("x-"); 745 } 746 747 /** 748 * Returns this charset's human-readable name for the given locale. 749 * 750 * <p> The default implementation of this method simply returns this 751 * charset's canonical name. Concrete subclasses of this class may 752 * override this method in order to provide a localized display name. </p> 753 * 754 * @param locale 755 * The locale for which the display name is to be retrieved 756 * 757 * @return The display name of this charset in the given locale 758 */ 759 public String displayName(Locale locale) { 760 return name; 761 } 762 763 /** 764 * Tells whether or not this charset contains the given charset. 765 * 766 * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if, 767 * and only if, every character representable in <i>D</i> is also 768 * representable in <i>C</i>. If this relationship holds then it is 769 * guaranteed that every string that can be encoded in <i>D</i> can also be 770 * encoded in <i>C</i> without performing any replacements. 771 * 772 * <p> That <i>C</i> contains <i>D</i> does not imply that each character 773 * representable in <i>C</i> by a particular byte sequence is represented 774 * in <i>D</i> by the same byte sequence, although sometimes this is the 775 * case. 776 * 777 * <p> Every charset contains itself. 778 * 779 * <p> This method computes an approximation of the containment relation: 780 * If it returns <tt>true</tt> then the given charset is known to be 781 * contained by this charset; if it returns <tt>false</tt>, however, then 782 * it is not necessarily the case that the given charset is not contained 783 * in this charset. 784 * 785 * @param cs 786 * The given charset 787 * 788 * @return <tt>true</tt> if the given charset is contained in this charset 789 */ 790 public abstract boolean contains(Charset cs); 791 792 /** 793 * Constructs a new decoder for this charset. 794 * 795 * @return A new decoder for this charset 796 */ 797 public abstract CharsetDecoder newDecoder(); 798 799 /** 800 * Constructs a new encoder for this charset. 801 * 802 * @return A new encoder for this charset 803 * 804 * @throws UnsupportedOperationException 805 * If this charset does not support encoding 806 */ 807 public abstract CharsetEncoder newEncoder(); 808 809 /** 810 * Tells whether or not this charset supports encoding. 811 * 812 * <p> Nearly all charsets support encoding. The primary exceptions are 813 * special-purpose <i>auto-detect</i> charsets whose decoders can determine 814 * which of several possible encoding schemes is in use by examining the 815 * input byte sequence. Such charsets do not support encoding because 816 * there is no way to determine which encoding should be used on output. 817 * Implementations of such charsets should override this method to return 818 * <tt>false</tt>. </p> 819 * 820 * @return <tt>true</tt> if, and only if, this charset supports encoding 821 */ 822 public boolean canEncode() { 823 return true; 824 } 825 826 /** 827 * Convenience method that decodes bytes in this charset into Unicode 828 * characters. 829 * 830 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the 831 * same result as the expression 832 * 833 * <pre> 834 * cs.newDecoder() 835 * .onMalformedInput(CodingErrorAction.REPLACE) 836 * .onUnmappableCharacter(CodingErrorAction.REPLACE) 837 * .decode(bb); </pre> 838 * 839 * except that it is potentially more efficient because it can cache 840 * decoders between successive invocations. 841 * 842 * <p> This method always replaces malformed-input and unmappable-character 843 * sequences with this charset's default replacement byte array. In order 844 * to detect such sequences, use the {@link 845 * CharsetDecoder#decode(java.nio.ByteBuffer)} method directly. </p> 846 * 847 * @param bb The byte buffer to be decoded 848 * 849 * @return A char buffer containing the decoded characters 850 */ 851 public final CharBuffer decode(ByteBuffer bb) { 852 try { 853 return ThreadLocalCoders.decoderFor(this) 854 .onMalformedInput(CodingErrorAction.REPLACE) 855 .onUnmappableCharacter(CodingErrorAction.REPLACE) 856 .decode(bb); 857 } catch (CharacterCodingException x) { 858 throw new Error(x); // Can't happen 859 } 860 } 861 862 /** 863 * Convenience method that encodes Unicode characters into bytes in this 864 * charset. 865 * 866 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the 867 * same result as the expression 868 * 869 * <pre> 870 * cs.newEncoder() 871 * .onMalformedInput(CodingErrorAction.REPLACE) 872 * .onUnmappableCharacter(CodingErrorAction.REPLACE) 873 * .encode(bb); </pre> 874 * 875 * except that it is potentially more efficient because it can cache 876 * encoders between successive invocations. 877 * 878 * <p> This method always replaces malformed-input and unmappable-character 879 * sequences with this charset's default replacement string. In order to 880 * detect such sequences, use the {@link 881 * CharsetEncoder#encode(java.nio.CharBuffer)} method directly. </p> 882 * 883 * @param cb The char buffer to be encoded 884 * 885 * @return A byte buffer containing the encoded characters 886 */ 887 public final ByteBuffer encode(CharBuffer cb) { 888 try { 889 return ThreadLocalCoders.encoderFor(this) 890 .onMalformedInput(CodingErrorAction.REPLACE) 891 .onUnmappableCharacter(CodingErrorAction.REPLACE) 892 .encode(cb); 893 } catch (CharacterCodingException x) { 894 throw new Error(x); // Can't happen 895 } 896 } 897 898 /** 899 * Convenience method that encodes a string into bytes in this charset. 900 * 901 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the 902 * same result as the expression 903 * 904 * <pre> 905 * cs.encode(CharBuffer.wrap(s)); </pre> 906 * 907 * @param str The string to be encoded 908 * 909 * @return A byte buffer containing the encoded characters 910 */ 911 public final ByteBuffer encode(String str) { 912 return encode(CharBuffer.wrap(str)); 913 } 914 915 /** 916 * Compares this charset to another. 917 * 918 * <p> Charsets are ordered by their canonical names, without regard to 919 * case. </p> 920 * 921 * @param that 922 * The charset to which this charset is to be compared 923 * 924 * @return A negative integer, zero, or a positive integer as this charset 925 * is less than, equal to, or greater than the specified charset 926 */ 927 public final int compareTo(Charset that) { 928 return (name().compareToIgnoreCase(that.name())); 929 } 930 931 /** 932 * Computes a hashcode for this charset. 933 * 934 * @return An integer hashcode 935 */ 936 public final int hashCode() { 937 return name().hashCode(); 938 } 939 940 /** 941 * Tells whether or not this object is equal to another. 942 * 943 * <p> Two charsets are equal if, and only if, they have the same canonical 944 * names. A charset is never equal to any other type of object. </p> 945 * 946 * @return <tt>true</tt> if, and only if, this charset is equal to the 947 * given object 948 */ 949 public final boolean equals(Object ob) { 950 if (!(ob instanceof Charset)) 951 return false; 952 if (this == ob) 953 return true; 954 return name.equals(((Charset)ob).name()); 955 } 956 957 /** 958 * Returns a string describing this charset. 959 * 960 * @return A string describing this charset 961 */ 962 public final String toString() { 963 return name(); 964 } 965 966 } 967