• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This code is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 only, as
8  * published by the Free Software Foundation.  Oracle designates this
9  * particular file as subject to the "Classpath" exception as provided
10  * by Oracle in the LICENSE file that accompanied this code.
11  *
12  * This code is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * version 2 for more details (a copy is included in the LICENSE file that
16  * accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License version
19  * 2 along with this work; if not, write to the Free Software Foundation,
20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23  * or visit www.oracle.com if you need additional information or have any
24  * questions.
25  */
26 
27 package java.nio.charset;
28 
29 import java.io.UnsupportedEncodingException;
30 import libcore.icu.NativeConverter;
31 import java.nio.ByteBuffer;
32 import java.nio.CharBuffer;
33 import java.nio.charset.spi.CharsetProvider;
34 import java.security.AccessController;
35 import java.security.AccessControlException;
36 import java.security.PrivilegedAction;
37 import java.util.AbstractMap;
38 import java.util.Collections;
39 import java.util.HashMap;
40 import java.util.HashSet;
41 import java.util.Iterator;
42 import java.util.Locale;
43 import java.util.Map;
44 import java.util.NoSuchElementException;
45 import java.util.Set;
46 import java.util.ServiceLoader;
47 import java.util.ServiceConfigurationError;
48 import java.util.SortedMap;
49 import java.util.TreeMap;
50 import sun.misc.ASCIICaseInsensitiveComparator;
51 import sun.nio.cs.ThreadLocalCoders;
52 import sun.security.action.GetPropertyAction;
53 
54 
55 /**
56  * A named mapping between sequences of sixteen-bit Unicode <a
57  * href="../../lang/Character.html#unicode">code units</a> and sequences of
58  * bytes.  This class defines methods for creating decoders and encoders and
59  * for retrieving the various names associated with a charset.  Instances of
60  * this class are immutable.
61  *
62  * <p> This class also defines static methods for testing whether a particular
63  * charset is supported, for locating charset instances by name, and for
64  * constructing a map that contains every charset for which support is
65  * available in the current Java virtual machine.  Support for new charsets can
66  * be added via the service-provider interface defined in the {@link
67  * java.nio.charset.spi.CharsetProvider} class.
68  *
69  * <p> All of the methods defined in this class are safe for use by multiple
70  * concurrent threads.
71  *
72  *
73  * <a name="names"></a><a name="charenc"></a>
74  * <h2>Charset names</h2>
75  *
76  * <p> Charsets are named by strings composed of the following characters:
77  *
78  * <ul>
79  *
80  *   <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
81  *        (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>),
82  *
83  *   <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
84  *        (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>),
85  *
86  *   <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
87  *        (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>),
88  *
89  *   <li> The dash character <tt>'-'</tt>
90  *        (<tt>'&#92;u002d'</tt>,&nbsp;<small>HYPHEN-MINUS</small>),
91  *
92  *   <li> The plus character <tt>'+'</tt>
93  *        (<tt>'&#92;u002b'</tt>,&nbsp;<small>PLUS SIGN</small>),
94  *
95  *   <li> The period character <tt>'.'</tt>
96  *        (<tt>'&#92;u002e'</tt>,&nbsp;<small>FULL STOP</small>),
97  *
98  *   <li> The colon character <tt>':'</tt>
99  *        (<tt>'&#92;u003a'</tt>,&nbsp;<small>COLON</small>), and
100  *
101  *   <li> The underscore character <tt>'_'</tt>
102  *        (<tt>'&#92;u005f'</tt>,&nbsp;<small>LOW&nbsp;LINE</small>).
103  *
104  * </ul>
105  *
106  * A charset name must begin with either a letter or a digit.  The empty string
107  * is not a legal charset name.  Charset names are not case-sensitive; that is,
108  * case is always ignored when comparing charset names.  Charset names
109  * generally follow the conventions documented in <a
110  * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC&nbsp;2278:&nbsp;IANA Charset
111  * Registration Procedures</i></a>.
112  *
113  * <p> Every charset has a <i>canonical name</i> and may also have one or more
114  * <i>aliases</i>.  The canonical name is returned by the {@link #name() name} method
115  * of this class.  Canonical names are, by convention, usually in upper case.
116  * The aliases of a charset are returned by the {@link #aliases() aliases}
117  * method.
118  *
119  * <p><a name="hn">Some charsets have an <i>historical name</i> that is defined for
120  * compatibility with previous versions of the Java platform.</a>  A charset's
121  * historical name is either its canonical name or one of its aliases.  The
122  * historical name is returned by the <tt>getEncoding()</tt> methods of the
123  * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link
124  * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes.
125  *
126  * <p><a name="iana"> </a>If a charset listed in the <a
127  * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset
128  * Registry</i></a> is supported by an implementation of the Java platform then
129  * its canonical name must be the name listed in the registry. Many charsets
130  * are given more than one name in the registry, in which case the registry
131  * identifies one of the names as <i>MIME-preferred</i>.  If a charset has more
132  * than one registry name then its canonical name must be the MIME-preferred
133  * name and the other names in the registry must be valid aliases.  If a
134  * supported charset is not listed in the IANA registry then its canonical name
135  * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>.
136  *
137  * <p> The IANA charset registry does change over time, and so the canonical
138  * name and the aliases of a particular charset may also change over time.  To
139  * ensure compatibility it is recommended that no alias ever be removed from a
140  * charset, and that if the canonical name of a charset is changed then its
141  * previous canonical name be made into an alias.
142  *
143  *
144  * <h2>Standard charsets</h2>
145  *
146  *
147  *
148  * <p><a name="standard">Every implementation of the Java platform is required to support the
149  * following standard charsets.</a>  Consult the release documentation for your
150  * implementation to see if any other charsets are supported.  The behavior
151  * of such optional charsets may differ between implementations.
152  *
153  * <blockquote><table width="80%" summary="Description of standard charsets">
154  * <tr><th align="left">Charset</th><th align="left">Description</th></tr>
155  * <tr><td valign=top><tt>US-ASCII</tt></td>
156  *     <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>,
157  *         a.k.a. the Basic Latin block of the Unicode character set</td></tr>
158  * <tr><td valign=top><tt>ISO-8859-1&nbsp;&nbsp;</tt></td>
159  *     <td>ISO Latin Alphabet No. 1, a.k.a. <tt>ISO-LATIN-1</tt></td></tr>
160  * <tr><td valign=top><tt>UTF-8</tt></td>
161  *     <td>Eight-bit UCS Transformation Format</td></tr>
162  * <tr><td valign=top><tt>UTF-16BE</tt></td>
163  *     <td>Sixteen-bit UCS Transformation Format,
164  *         big-endian byte&nbsp;order</td></tr>
165  * <tr><td valign=top><tt>UTF-16LE</tt></td>
166  *     <td>Sixteen-bit UCS Transformation Format,
167  *         little-endian byte&nbsp;order</td></tr>
168  * <tr><td valign=top><tt>UTF-16</tt></td>
169  *     <td>Sixteen-bit UCS Transformation Format,
170  *         byte&nbsp;order identified by an optional byte-order mark</td></tr>
171  * </table></blockquote>
172  *
173  * <p> The <tt>UTF-8</tt> charset is specified by <a
174  * href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279</i></a>; the
175  * transformation format upon which it is based is specified in
176  * Amendment&nbsp;2 of ISO&nbsp;10646-1 and is also described in the <a
177  * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode
178  * Standard</i></a>.
179  *
180  * <p> The <tt>UTF-16</tt> charsets are specified by <a
181  * href="http://www.ietf.org/rfc/rfc2781.txt"><i>RFC&nbsp;2781</i></a>; the
182  * transformation formats upon which they are based are specified in
183  * Amendment&nbsp;1 of ISO&nbsp;10646-1 and are also described in the <a
184  * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode
185  * Standard</i></a>.
186  *
187  * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are
188  * therefore sensitive to byte order.  In these encodings the byte order of a
189  * stream may be indicated by an initial <i>byte-order mark</i> represented by
190  * the Unicode character <tt>'&#92;uFEFF'</tt>.  Byte-order marks are handled
191  * as follows:
192  *
193  * <ul>
194  *
195  *   <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt>
196  *   charsets interpret the initial byte-order marks as a <small>ZERO-WIDTH
197  *   NON-BREAKING SPACE</small>; when encoding, they do not write
198  *   byte-order marks. </p></li>
199 
200  *
201  *   <li><p> When decoding, the <tt>UTF-16</tt> charset interprets the
202  *   byte-order mark at the beginning of the input stream to indicate the
203  *   byte-order of the stream but defaults to big-endian if there is no
204  *   byte-order mark; when encoding, it uses big-endian byte order and writes
205  *   a big-endian byte-order mark. </p></li>
206  *
207  * </ul>
208  *
209  * In any case, byte order marks occurring after the first element of an
210  * input sequence are not omitted since the same code is used to represent
211  * <small>ZERO-WIDTH NON-BREAKING SPACE</small>.
212  *
213  * <p>Android note: The Android platform default is always UTF-8.
214  *
215  * <p>The {@link StandardCharsets} class defines constants for each of the
216  * standard charsets.
217  *
218  * <h2>Terminology</h2>
219  *
220  * <p> The name of this class is taken from the terms used in
221  * <a href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC&nbsp;2278</i></a>.
222  * In that document a <i>charset</i> is defined as the combination of
223  * one or more coded character sets and a character-encoding scheme.
224  * (This definition is confusing; some other software systems define
225  * <i>charset</i> as a synonym for <i>coded character set</i>.)
226  *
227  * <p> A <i>coded character set</i> is a mapping between a set of abstract
228  * characters and a set of integers.  US-ASCII, ISO&nbsp;8859-1,
229  * JIS&nbsp;X&nbsp;0201, and Unicode are examples of coded character sets.
230  *
231  * <p> Some standards have defined a <i>character set</i> to be simply a
232  * set of abstract characters without an associated assigned numbering.
233  * An alphabet is an example of such a character set.  However, the subtle
234  * distinction between <i>character set</i> and <i>coded character set</i>
235  * is rarely used in practice; the former has become a short form for the
236  * latter, including in the Java API specification.
237  *
238  * <p> A <i>character-encoding scheme</i> is a mapping between one or more
239  * coded character sets and a set of octet (eight-bit byte) sequences.
240  * UTF-8, UTF-16, ISO&nbsp;2022, and EUC are examples of
241  * character-encoding schemes.  Encoding schemes are often associated with
242  * a particular coded character set; UTF-8, for example, is used only to
243  * encode Unicode.  Some schemes, however, are associated with multiple
244  * coded character sets; EUC, for example, can be used to encode
245  * characters in a variety of Asian coded character sets.
246  *
247  * <p> When a coded character set is used exclusively with a single
248  * character-encoding scheme then the corresponding charset is usually
249  * named for the coded character set; otherwise a charset is usually named
250  * for the encoding scheme and, possibly, the locale of the coded
251  * character sets that it supports.  Hence <tt>US-ASCII</tt> is both the
252  * name of a coded character set and of the charset that encodes it, while
253  * <tt>EUC-JP</tt> is the name of the charset that encodes the
254  * JIS&nbsp;X&nbsp;0201, JIS&nbsp;X&nbsp;0208, and JIS&nbsp;X&nbsp;0212
255  * coded character sets for the Japanese language.
256  *
257  * <p> The native character encoding of the Java programming language is
258  * UTF-16.  A charset in the Java platform therefore defines a mapping
259  * between sequences of sixteen-bit UTF-16 code units (that is, sequences
260  * of chars) and sequences of bytes. </p>
261  *
262  *
263  * @author Mark Reinhold
264  * @author JSR-51 Expert Group
265  * @since 1.4
266  *
267  * @see CharsetDecoder
268  * @see CharsetEncoder
269  * @see java.nio.charset.spi.CharsetProvider
270  * @see java.lang.Character
271  */
272 
273 public abstract class Charset
274     implements Comparable<Charset>
275 {
276 
277     /* -- Static methods -- */
278 
279     private static volatile String bugLevel = null;
280 
atBugLevel(String bl)281     static boolean atBugLevel(String bl) {              // package-private
282         String level = bugLevel;
283         if (level == null) {
284             if (!sun.misc.VM.isBooted())
285                 return false;
286             bugLevel = level = AccessController.doPrivileged(
287                 new GetPropertyAction("sun.nio.cs.bugLevel", ""));
288         }
289         return level.equals(bl);
290     }
291 
292     /**
293      * Checks that the given string is a legal charset name. </p>
294      *
295      * @param  s
296      *         A purported charset name
297      *
298      * @throws  IllegalCharsetNameException
299      *          If the given name is not a legal charset name
300      */
checkName(String s)301     private static void checkName(String s) {
302         int n = s.length();
303         if (!atBugLevel("1.4")) {
304             if (n == 0)
305                 throw new IllegalCharsetNameException(s);
306         }
307         for (int i = 0; i < n; i++) {
308             char c = s.charAt(i);
309             if (c >= 'A' && c <= 'Z') continue;
310             if (c >= 'a' && c <= 'z') continue;
311             if (c >= '0' && c <= '9') continue;
312             if (c == '-' && i != 0) continue;
313             if (c == '+' && i != 0) continue;
314             if (c == ':' && i != 0) continue;
315             if (c == '_' && i != 0) continue;
316             if (c == '.' && i != 0) continue;
317             throw new IllegalCharsetNameException(s);
318         }
319     }
320 
321     /* The standard set of charsets */
322     // Android-removed: We use ICU's list of standard charsets.
323     // private static CharsetProvider standardProvider = new StandardCharsets();
324 
325     // Cache of the most-recently-returned charsets,
326     // along with the names that were used to find them
327     //
328     // cache1/2 usage is explained in the lookup method
329     //
330     private static volatile Map.Entry<String, Charset> cache1 = null; // "Level 1" cache
331     private static final HashMap<String, Charset> cache2 = new HashMap<>(); // "Level 2" cache
332 
cache(String charsetName, Charset cs)333     private static void cache(String charsetName, Charset cs) {
334         synchronized(cache2) {
335             String canonicalName = cs.name();
336             Charset canonicalCharset = cache2.get(canonicalName);
337 
338             if (canonicalCharset != null) {
339                 cs = canonicalCharset;
340             } else {
341                 cache2.put(canonicalName, cs);
342 
343                 for (String alias : cs.aliases()) {
344                     cache2.put(alias, cs);
345                 }
346             }
347 
348             cache2.put(charsetName, cs);
349         }
350 
351         cache1 = new AbstractMap.SimpleImmutableEntry<>(charsetName, cs);
352     }
353 
354     // Creates an iterator that walks over the available providers, ignoring
355     // those whose lookup or instantiation causes a security exception to be
356     // thrown.  Should be invoked with full privileges.
357     //
providers()358     private static Iterator<CharsetProvider> providers() {
359         return new Iterator<CharsetProvider>() {
360 
361                 ServiceLoader<CharsetProvider> sl =
362                     ServiceLoader.load(CharsetProvider.class);
363                 Iterator<CharsetProvider> i = sl.iterator();
364 
365                 CharsetProvider next = null;
366 
367                 private boolean getNext() {
368                     while (next == null) {
369                         try {
370                             if (!i.hasNext())
371                                 return false;
372                             next = i.next();
373                         } catch (ServiceConfigurationError sce) {
374                             if (sce.getCause() instanceof SecurityException) {
375                                 // Ignore security exceptions
376                                 continue;
377                             }
378                             throw sce;
379                         }
380                     }
381                     return true;
382                 }
383 
384                 public boolean hasNext() {
385                     return getNext();
386                 }
387 
388                 public CharsetProvider next() {
389                     if (!getNext())
390                         throw new NoSuchElementException();
391                     CharsetProvider n = next;
392                     next = null;
393                     return n;
394                 }
395 
396                 public void remove() {
397                     throw new UnsupportedOperationException();
398                 }
399 
400             };
401     }
402 
403     // Thread-local gate to prevent recursive provider lookups
404     private static ThreadLocal<ThreadLocal<?>> gate =
405             new ThreadLocal<ThreadLocal<?>>();
406 
lookupViaProviders(final String charsetName)407     private static Charset lookupViaProviders(final String charsetName) {
408 
409         // The runtime startup sequence looks up standard charsets as a
410         // consequence of the VM's invocation of System.initializeSystemClass
411         // in order to, e.g., set system properties and encode filenames.  At
412         // that point the application class loader has not been initialized,
413         // however, so we can't look for providers because doing so will cause
414         // that loader to be prematurely initialized with incomplete
415         // information.
416         //
417         if (!sun.misc.VM.isBooted())
418             return null;
419 
420         if (gate.get() != null)
421             // Avoid recursive provider lookups
422             return null;
423         try {
424             gate.set(gate);
425 
426             return AccessController.doPrivileged(
427                 new PrivilegedAction<Charset>() {
428                     public Charset run() {
429                         for (Iterator<CharsetProvider> i = providers();
430                              i.hasNext();) {
431                             CharsetProvider cp = i.next();
432                             Charset cs = cp.charsetForName(charsetName);
433                             if (cs != null)
434                                 return cs;
435                         }
436                         return null;
437                     }
438                 });
439 
440         } finally {
441             gate.set(null);
442         }
443     }
444 
445     // Android-removed: Remove support for the extended charset provider.
446     //
447     /* The extended set of charsets */
448     // private static Object extendedProviderLock = new Object();
449     // private static boolean extendedProviderProbed = false;
450     // private static CharsetProvider extendedProvider = null;
451     //
452     // private static void probeExtendedProvider() {
453     //     AccessController.doPrivileged(new PrivilegedAction<Object>() {
454     //            public Object run() {
455     //                 try {
456     //                     Class epc
457     //                         = Class.forName("sun.nio.cs.ext.ExtendedCharsets");
458     //                     extendedProvider = (CharsetProvider)epc.newInstance();
459     //                 } catch (ClassNotFoundException x) {
460     //                     // Extended charsets not available
461     //                     // (charsets.jar not present)
462     //                 } catch (InstantiationException x) {
463     //                     throw new Error(x);
464     //                 } catch (IllegalAccessException x) {
465     //                     throw new Error(x);
466     //                }
467     //                 return null;
468     //             }
469     //         });
470     // }
471     //
472     // private static Charset lookupExtendedCharset(String charsetName) {
473     //     CharsetProvider ecp = null;
474     //     synchronized (extendedProviderLock) {
475     //         if (!extendedProviderProbed) {
476     //             probeExtendedProvider();
477     //             extendedProviderProbed = true;
478     //         }
479     //         ecp = extendedProvider;
480     //     }
481     //     return (ecp != null) ? ecp.charsetForName(charsetName) : null;
482     // }
483 
484     // We expect most programs to use one Charset repeatedly, so the most recently used Charset
485     // instance is stored in the level 1 cache. We convey a hint to this effect to the VM by putting
486     // the level 1 cache miss code in a separate method. Since charsetName is not necessarily in
487     // canonical form, we store the mapping from both the canonical name and the aliases to the
488     // instance in a map for level 2 cache.
489     private static Charset lookup(String charsetName) {
490         if (charsetName == null)
491             throw new IllegalArgumentException("Null charset name");
492 
493 
494         final Map.Entry<String, Charset> cached = cache1;
495         if (cached != null && charsetName.equals(cached.getKey()))
496             return cached.getValue();
497         return lookup2(charsetName);
498     }
499 
500     private static Charset lookup2(String charsetName) {
501         Charset cs;
502         synchronized (cache2) {
503             if ((cs = cache2.get(charsetName)) != null) {
504                 cache1 = new AbstractMap.SimpleImmutableEntry<>(charsetName, cs);
505                 return cs;
506             }
507         }
508 
509         // Android-changed: Drop support for "standard" and "extended"
510         // providers.
511         if ((cs = NativeConverter.charsetForName(charsetName))  != null ||
512             (cs = lookupViaProviders(charsetName))              != null)
513         {
514             cache(charsetName, cs);
515             return cs;
516         }
517 
518         /* Only need to check the name if we didn't find a charset for it */
519         checkName(charsetName);
520         return null;
521     }
522 
523     /**
524      * Tells whether the named charset is supported.
525      *
526      * @param  charsetName
527      *         The name of the requested charset; may be either
528      *         a canonical name or an alias
529      *
530      * @return  <tt>true</tt> if, and only if, support for the named charset
531      *          is available in the current Java virtual machine
532      *
533      * @throws IllegalCharsetNameException
534      *         If the given charset name is illegal
535      *
536      * @throws  IllegalArgumentException
537      *          If the given <tt>charsetName</tt> is null
538      */
539     public static boolean isSupported(String charsetName) {
540         return (lookup(charsetName) != null);
541     }
542 
543     /**
544      * Returns a charset object for the named charset.
545      *
546      * @param  charsetName
547      *         The name of the requested charset; may be either
548      *         a canonical name or an alias
549      *
550      * @return  A charset object for the named charset
551      *
552      * @throws  IllegalCharsetNameException
553      *          If the given charset name is illegal
554      *
555      * @throws  IllegalArgumentException
556      *          If the given <tt>charsetName</tt> is null
557      *
558      * @throws  UnsupportedCharsetException
559      *          If no support for the named charset is available
560      *          in this instance of the Java virtual machine
561      */
562     public static Charset forName(String charsetName) {
563         Charset cs = lookup(charsetName);
564         if (cs != null)
565             return cs;
566         throw new UnsupportedCharsetException(charsetName);
567     }
568 
569 
570     /**
571      * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException},
572      * which is all pre-nio code claims to throw.
573      *
574      * @hide internal use only
575      */
576     public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException {
577         try {
578             return Charset.forName(charsetName);
579         } catch (Exception cause) {
580             UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
581             ex.initCause(cause);
582             throw ex;
583         }
584     }
585 
586 
587     // Fold charsets from the given iterator into the given map, ignoring
588     // charsets whose names already have entries in the map.
589     //
590     private static void put(Iterator<Charset> i, Map<String,Charset> m) {
591         while (i.hasNext()) {
592             Charset cs = i.next();
593             if (!m.containsKey(cs.name()))
594                 m.put(cs.name(), cs);
595         }
596     }
597 
598     /**
599      * Constructs a sorted map from canonical charset names to charset objects.
600      *
601      * <p> The map returned by this method will have one entry for each charset
602      * for which support is available in the current Java virtual machine.  If
603      * two or more supported charsets have the same canonical name then the
604      * resulting map will contain just one of them; which one it will contain
605      * is not specified. </p>
606      *
607      * <p> The invocation of this method, and the subsequent use of the
608      * resulting map, may cause time-consuming disk or network I/O operations
609      * to occur.  This method is provided for applications that need to
610      * enumerate all of the available charsets, for example to allow user
611      * charset selection.  This method is not used by the {@link #forName
612      * forName} method, which instead employs an efficient incremental lookup
613      * algorithm.
614      *
615      * <p> This method may return different results at different times if new
616      * charset providers are dynamically made available to the current Java
617      * virtual machine.  In the absence of such changes, the charsets returned
618      * by this method are exactly those that can be retrieved via the {@link
619      * #forName forName} method.  </p>
620      *
621      * @return An immutable, case-insensitive map from canonical charset names
622      *         to charset objects
623      */
624     public static SortedMap<String,Charset> availableCharsets() {
625         return AccessController.doPrivileged(
626             new PrivilegedAction<SortedMap<String,Charset>>() {
627                 public SortedMap<String,Charset> run() {
628                     TreeMap<String,Charset> m =
629                         new TreeMap<String,Charset>(
630                             ASCIICaseInsensitiveComparator.CASE_INSENSITIVE_ORDER);
631                     for (String charsetName : NativeConverter.getAvailableCharsetNames()) {
632                         Charset charset = NativeConverter.charsetForName(charsetName);
633                         m.put(charset.name(), charset);
634                     }
635                     // Android-changed: No more "standard" provider.
636                     // put(standardProvider.charsets(), m);
637                     for (Iterator i = providers(); i.hasNext();) {
638                         CharsetProvider cp = (CharsetProvider)i.next();
639                         put(cp.charsets(), m);
640                     }
641                     return Collections.unmodifiableSortedMap(m);
642                 }
643             });
644     }
645 
646     private static Charset defaultCharset;
647 
648     /**
649      * Returns the default charset of this Java virtual machine.
650      *
651      * <p>Android note: The Android platform default is always UTF-8.
652      *
653      * @return  A charset object for the default charset
654      *
655      * @since 1.5
656      */
657     public static Charset defaultCharset() {
658         // Android-changed: Use UTF_8 unconditionally.
659         synchronized (Charset.class) {
660             if (defaultCharset == null) {
661                 defaultCharset = java.nio.charset.StandardCharsets.UTF_8;
662             }
663 
664             return defaultCharset;
665         }
666     }
667 
668 
669     /* -- Instance fields and methods -- */
670 
671     private final String name;          // tickles a bug in oldjavac
672     private final String[] aliases;     // tickles a bug in oldjavac
673     private Set<String> aliasSet = null;
674 
675     /**
676      * Initializes a new charset with the given canonical name and alias
677      * set.
678      *
679      * @param  canonicalName
680      *         The canonical name of this charset
681      *
682      * @param  aliases
683      *         An array of this charset's aliases, or null if it has no aliases
684      *
685      * @throws IllegalCharsetNameException
686      *         If the canonical name or any of the aliases are illegal
687      */
688     protected Charset(String canonicalName, String[] aliases) {
689         checkName(canonicalName);
690         String[] as = (aliases == null) ? new String[0] : aliases;
691         for (int i = 0; i < as.length; i++)
692             checkName(as[i]);
693         this.name = canonicalName;
694         this.aliases = as;
695     }
696 
697     /**
698      * Returns this charset's canonical name.
699      *
700      * @return  The canonical name of this charset
701      */
702     public final String name() {
703         return name;
704     }
705 
706     /**
707      * Returns a set containing this charset's aliases.
708      *
709      * @return  An immutable set of this charset's aliases
710      */
711     public final Set<String> aliases() {
712         if (aliasSet != null)
713             return aliasSet;
714         int n = aliases.length;
715         HashSet<String> hs = new HashSet<String>(n);
716         for (int i = 0; i < n; i++)
717             hs.add(aliases[i]);
718         aliasSet = Collections.unmodifiableSet(hs);
719         return aliasSet;
720     }
721 
722     /**
723      * Returns this charset's human-readable name for the default locale.
724      *
725      * <p> The default implementation of this method simply returns this
726      * charset's canonical name.  Concrete subclasses of this class may
727      * override this method in order to provide a localized display name. </p>
728      *
729      * @return  The display name of this charset in the default locale
730      */
731     public String displayName() {
732         return name;
733     }
734 
735     /**
736      * Tells whether or not this charset is registered in the <a
737      * href="http://www.iana.org/assignments/character-sets">IANA Charset
738      * Registry</a>.
739      *
740      * @return  <tt>true</tt> if, and only if, this charset is known by its
741      *          implementor to be registered with the IANA
742      */
743     public final boolean isRegistered() {
744         return !name.startsWith("X-") && !name.startsWith("x-");
745     }
746 
747     /**
748      * Returns this charset's human-readable name for the given locale.
749      *
750      * <p> The default implementation of this method simply returns this
751      * charset's canonical name.  Concrete subclasses of this class may
752      * override this method in order to provide a localized display name. </p>
753      *
754      * @param  locale
755      *         The locale for which the display name is to be retrieved
756      *
757      * @return  The display name of this charset in the given locale
758      */
759     public String displayName(Locale locale) {
760         return name;
761     }
762 
/**
 * Tells whether or not this charset contains the given charset.
 *
 * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
 * and only if, every character representable in <i>D</i> is also
 * representable in <i>C</i>.  If this relationship holds then it is
 * guaranteed that every string that can be encoded in <i>D</i> can also be
 * encoded in <i>C</i> without performing any replacements.
 *
 * <p> That <i>C</i> contains <i>D</i> does not imply that each character
 * representable in <i>C</i> by a particular byte sequence is represented
 * in <i>D</i> by the same byte sequence, although sometimes this is the
 * case.
 *
 * <p> Every charset contains itself.
 *
 * <p> This method computes an approximation of the containment relation:
 * If it returns {@code true} then the given charset is known to be
 * contained by this charset; if it returns {@code false}, however, then
 * it is not necessarily the case that the given charset is not contained
 * in this charset.  In other words, {@code false} means "not known to be
 * contained", not "known not to be contained".
 *
 * @param   cs
 *          The given charset
 *
 * @return  {@code true} if the given charset is contained in this charset
 */
public abstract boolean contains(Charset cs);
791 
/**
 * Constructs a new decoder for this charset.
 *
 * <p> Abstract: every concrete charset supplies its own decoder. </p>
 *
 * @return  A new decoder for this charset
 */
public abstract CharsetDecoder newDecoder();
798 
/**
 * Constructs a new encoder for this charset.
 *
 * <p> Abstract: every concrete charset supplies its own encoder. </p>
 *
 * @return  A new encoder for this charset
 *
 * @throws  UnsupportedOperationException
 *          If this charset does not support encoding
 *
 * @see     #canEncode()
 */
public abstract CharsetEncoder newEncoder();
808 
809     /**
810      * Tells whether or not this charset supports encoding.
811      *
812      * <p> Nearly all charsets support encoding.  The primary exceptions are
813      * special-purpose <i>auto-detect</i> charsets whose decoders can determine
814      * which of several possible encoding schemes is in use by examining the
815      * input byte sequence.  Such charsets do not support encoding because
816      * there is no way to determine which encoding should be used on output.
817      * Implementations of such charsets should override this method to return
818      * <tt>false</tt>. </p>
819      *
820      * @return  <tt>true</tt> if, and only if, this charset supports encoding
821      */
822     public boolean canEncode() {
823         return true;
824     }
825 
826     /**
827      * Convenience method that decodes bytes in this charset into Unicode
828      * characters.
829      *
830      * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
831      * same result as the expression
832      *
833      * <pre>
834      *     cs.newDecoder()
835      *       .onMalformedInput(CodingErrorAction.REPLACE)
836      *       .onUnmappableCharacter(CodingErrorAction.REPLACE)
837      *       .decode(bb); </pre>
838      *
839      * except that it is potentially more efficient because it can cache
840      * decoders between successive invocations.
841      *
842      * <p> This method always replaces malformed-input and unmappable-character
843      * sequences with this charset's default replacement byte array.  In order
844      * to detect such sequences, use the {@link
845      * CharsetDecoder#decode(java.nio.ByteBuffer)} method directly.  </p>
846      *
847      * @param  bb  The byte buffer to be decoded
848      *
849      * @return  A char buffer containing the decoded characters
850      */
851     public final CharBuffer decode(ByteBuffer bb) {
852         try {
853             return ThreadLocalCoders.decoderFor(this)
854                 .onMalformedInput(CodingErrorAction.REPLACE)
855                 .onUnmappableCharacter(CodingErrorAction.REPLACE)
856                 .decode(bb);
857         } catch (CharacterCodingException x) {
858             throw new Error(x);         // Can't happen
859         }
860     }
861 
862     /**
863      * Convenience method that encodes Unicode characters into bytes in this
864      * charset.
865      *
866      * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
867      * same result as the expression
868      *
869      * <pre>
870      *     cs.newEncoder()
871      *       .onMalformedInput(CodingErrorAction.REPLACE)
872      *       .onUnmappableCharacter(CodingErrorAction.REPLACE)
873      *       .encode(bb); </pre>
874      *
875      * except that it is potentially more efficient because it can cache
876      * encoders between successive invocations.
877      *
878      * <p> This method always replaces malformed-input and unmappable-character
879      * sequences with this charset's default replacement string.  In order to
880      * detect such sequences, use the {@link
881      * CharsetEncoder#encode(java.nio.CharBuffer)} method directly.  </p>
882      *
883      * @param  cb  The char buffer to be encoded
884      *
885      * @return  A byte buffer containing the encoded characters
886      */
887     public final ByteBuffer encode(CharBuffer cb) {
888         try {
889             return ThreadLocalCoders.encoderFor(this)
890                 .onMalformedInput(CodingErrorAction.REPLACE)
891                 .onUnmappableCharacter(CodingErrorAction.REPLACE)
892                 .encode(cb);
893         } catch (CharacterCodingException x) {
894             throw new Error(x);         // Can't happen
895         }
896     }
897 
898     /**
899      * Convenience method that encodes a string into bytes in this charset.
900      *
901      * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
902      * same result as the expression
903      *
904      * <pre>
905      *     cs.encode(CharBuffer.wrap(s)); </pre>
906      *
907      * @param  str  The string to be encoded
908      *
909      * @return  A byte buffer containing the encoded characters
910      */
911     public final ByteBuffer encode(String str) {
912         return encode(CharBuffer.wrap(str));
913     }
914 
915     /**
916      * Compares this charset to another.
917      *
918      * <p> Charsets are ordered by their canonical names, without regard to
919      * case. </p>
920      *
921      * @param  that
922      *         The charset to which this charset is to be compared
923      *
924      * @return A negative integer, zero, or a positive integer as this charset
925      *         is less than, equal to, or greater than the specified charset
926      */
927     public final int compareTo(Charset that) {
928         return (name().compareToIgnoreCase(that.name()));
929     }
930 
931     /**
932      * Computes a hashcode for this charset.
933      *
934      * @return  An integer hashcode
935      */
936     public final int hashCode() {
937         return name().hashCode();
938     }
939 
940     /**
941      * Tells whether or not this object is equal to another.
942      *
943      * <p> Two charsets are equal if, and only if, they have the same canonical
944      * names.  A charset is never equal to any other type of object.  </p>
945      *
946      * @return  <tt>true</tt> if, and only if, this charset is equal to the
947      *          given object
948      */
949     public final boolean equals(Object ob) {
950         if (!(ob instanceof Charset))
951             return false;
952         if (this == ob)
953             return true;
954         return name.equals(((Charset)ob).name());
955     }
956 
957     /**
958      * Returns a string describing this charset.
959      *
960      * @return  A string describing this charset
961      */
962     public final String toString() {
963         return name();
964     }
965 
966 }
967