• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
17 
18 package java.nio.charset;
19 
20 import java.io.UTFDataFormatException;
21 import java.nio.ByteOrder;
22 import libcore.io.Memory;
23 import libcore.io.SizeOf;
24 
/**
 * Static helpers for converting between {@link String}s and <i>modified UTF-8</i>.
 *
 * <p>In this encoding U+0000 is written as the two bytes {@code 0xC0 0x80}, and every
 * {@code char} (surrogates included) is encoded on its own in one, two or three bytes.
 *
 * @hide internal use only
 */
public final class ModifiedUtf8 {
    /** Largest byte count that fits in an unsigned 16-bit length prefix. */
    private static final int MAX_SHORT_LENGTH = 65535;

    /** Size in bytes of the length prefix written by {@link #encode(String)}. */
    private static final int LENGTH_PREFIX_SIZE = 2;

    /**
     * Decodes a byte array containing <i>modified UTF-8</i> bytes into a string.
     *
     * <p>Note that although this method decodes the (supposedly impossible) zero byte to U+0000,
     * that's what the RI does too.
     *
     * @param in the array holding the encoded bytes
     * @param out scratch buffer that receives the decoded chars; it must have room for every
     *     decoded char ({@code utfSize} chars is always enough, since each char consumes at
     *     least one byte)
     * @param offset index in {@code in} of the first encoded byte
     * @param utfSize number of encoded bytes to consume, starting at {@code offset}
     * @return a new string made of the decoded chars
     * @throws UTFDataFormatException if a multi-byte sequence is truncated, a continuation byte
     *     is not of the form {@code 10xxxxxx}, or a lead byte starts neither a one-, two- nor
     *     three-byte sequence; positions in the message are relative to {@code offset}
     */
    public static String decode(byte[] in, char[] out, int offset, int utfSize) throws UTFDataFormatException {
        int count = 0; // encoded bytes consumed so far
        int s = 0;     // chars written to 'out' so far
        while (count < utfSize) {
            // Java bytes are signed, so a byte with its high bit set reads as a negative int.
            // The masks below only look at the low eight bits, so the sign extension is harmless.
            final int a = in[offset + count++];
            if (a >= 0) {
                // 0xxxxxxx: a single-byte char.
                out[s++] = (char) a;
            } else if ((a & 0xe0) == 0xc0) {
                // 110xxxxx 10xxxxxx: a two-byte char.
                if (count >= utfSize) {
                    throw new UTFDataFormatException("bad second byte at " + count);
                }
                final int b = in[offset + count++];
                if ((b & 0xC0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + (count - 1));
                }
                out[s++] = (char) (((a & 0x1F) << 6) | (b & 0x3F));
            } else if ((a & 0xf0) == 0xe0) {
                // 1110xxxx 10xxxxxx 10xxxxxx: a three-byte char.
                if (count + 1 >= utfSize) {
                    throw new UTFDataFormatException("bad third byte at " + (count + 1));
                }
                final int b = in[offset + count++];
                final int c = in[offset + count++];
                if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) {
                    throw new UTFDataFormatException("bad second or third byte at " + (count - 2));
                }
                out[s++] = (char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F));
            } else {
                // A stray continuation byte (10xxxxxx) or a lead byte of a longer sequence (1111xxxx).
                throw new UTFDataFormatException("bad byte at " + (count - 1));
            }
        }
        return new String(out, 0, s);
    }

    /**
     * Returns the number of bytes the modified UTF-8 representation of 's' would take. Note
     * that this is just the space for the bytes representing the characters, not the length
     * which precedes those bytes, because different callers represent the length differently,
     * as two, four, or even eight bytes.
     *
     * @param s the string to measure
     * @param shortLength if true, fail as soon as the running total no longer fits in an
     *     unsigned 16-bit length
     * @return the number of encoded bytes
     * @throws UTFDataFormatException if {@code shortLength} is true and the encoded form is
     *     longer than 65535 bytes
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long result = 0;
        final int length = s.length();
        for (int i = 0; i < length; ++i) {
            final char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                ++result;
            } else if (ch <= 2047) {
                result += 2;
            } else {
                result += 3;
            }
            // Checked on every char so an over-long string is rejected without scanning all of it.
            if (shortLength && result > MAX_SHORT_LENGTH) {
                throw new UTFDataFormatException("String more than 65535 UTF bytes long");
            }
        }
        return result;
    }

    /**
     * Encodes the <i>modified UTF-8</i> bytes corresponding to string {@code s} into the
     * byte array {@code dst}, starting at the given {@code offset}. No length prefix is
     * written.
     *
     * @param dst the destination array; it must have at least
     *     {@code countBytes(s, false)} bytes available from {@code offset}
     * @param offset index in {@code dst} at which the first encoded byte is stored
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        final int length = s.length();
        for (int i = 0; i < length; i++) {
            final char ch = s.charAt(i);
            if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
                dst[offset++] = (byte) ch;
            } else if (ch <= 2047) {
                dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            } else {
                dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
                dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6)));
                dst[offset++] = (byte) (0x80 | (0x3f & ch));
            }
        }
    }

    /**
     * Returns an array containing the <i>modified UTF-8</i> form of {@code s}, using a
     * big-endian 16-bit length.
     *
     * @param s the string to encode
     * @return a new array holding the two-byte length followed by the encoded bytes
     * @throws UTFDataFormatException if {@code s} is too long for a two-byte length
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes(s, true) has already rejected anything above 65535, so the cast is lossless.
        final int utfCount = (int) countBytes(s, true);
        final byte[] result = new byte[LENGTH_PREFIX_SIZE + utfCount];
        // Big-endian 16-bit length: most significant byte first.
        result[0] = (byte) (utfCount >> 8);
        result[1] = (byte) utfCount;
        encode(result, LENGTH_PREFIX_SIZE, s);
        return result;
    }

    private ModifiedUtf8() {
    }
}
129