• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * As per the Apache license requirements, this file has been modified
19  * from its original state.
20  *
21  * Such modifications are Copyright (C) 2010 Ben Gruver, and are released
22  * under the original license
23  */
24 
25 package org.jf.util;
26 
27 import javax.annotation.Nonnull;
28 import javax.annotation.Nullable;
29 
/**
 * Utility methods for converting between strings and Java-style UTF-8 bytes,
 * the encoding used by <code>CONSTANT_Utf8_info</code> constants.
 */
33 public final class Utf8Utils {
34     /**
35      * Converts a string into its Java-style UTF-8 form. Java-style UTF-8
36      * differs from normal UTF-8 in the handling of character '\0' and
37      * surrogate pairs.
38      *
39      * @param string non-null; the string to convert
40      * @return non-null; the UTF-8 bytes for it
41      */
stringToUtf8Bytes(String string)42     public static byte[] stringToUtf8Bytes(String string) {
43         int len = string.length();
44         byte[] bytes = new byte[len * 3]; // Avoid having to reallocate.
45         int outAt = 0;
46 
47         for (int i = 0; i < len; i++) {
48             char c = string.charAt(i);
49             if ((c != 0) && (c < 0x80)) {
50                 bytes[outAt] = (byte) c;
51                 outAt++;
52             } else if (c < 0x800) {
53                 bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0);
54                 bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80);
55                 outAt += 2;
56             } else {
57                 bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0);
58                 bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80);
59                 bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80);
60                 outAt += 3;
61             }
62         }
63 
64         byte[] result = new byte[outAt];
65         System.arraycopy(bytes, 0, result, 0, outAt);
66         return result;
67     }
68 
69     private static final ThreadLocal<char[]> localBuffer =
70             new ThreadLocal<char[]> () {
71                 @Override protected char[] initialValue() {
72                     // A reasonably sized initial value
73                     return new char[256];
74                 }
75             };
76 
77     /**
78      * Converts an array of UTF-8 bytes into a string.
79      *
80      * @param bytes non-null; the bytes to convert
81      * @param start the start index of the utf8 string to convert
82      * @param length the length of the utf8 string to convert, not including any null-terminator that might be present
83      * @return non-null; the converted string
84      */
utf8BytesToString(byte[] bytes, int start, int length)85     public static String utf8BytesToString(byte[] bytes, int start, int length) {
86         char[] chars = localBuffer.get();
87         if (chars == null || chars.length < length) {
88             chars = new char[length];
89             localBuffer.set(chars);
90         }
91         int outAt = 0;
92 
93         for (int at = start; length > 0; /*at*/) {
94             int v0 = bytes[at] & 0xFF;
95             char out;
96             switch (v0 >> 4) {
97                 case 0x00: case 0x01: case 0x02: case 0x03:
98                 case 0x04: case 0x05: case 0x06: case 0x07: {
99                     // 0XXXXXXX -- single-byte encoding
100                     length--;
101                     if (v0 == 0) {
102                         // A single zero byte is illegal.
103                         return throwBadUtf8(v0, at);
104                     }
105                     out = (char) v0;
106                     at++;
107                     break;
108                 }
109                 case 0x0c: case 0x0d: {
110                     // 110XXXXX -- two-byte encoding
111                     length -= 2;
112                     if (length < 0) {
113                         return throwBadUtf8(v0, at);
114                     }
115                     int v1 = bytes[at + 1] & 0xFF;
116                     if ((v1 & 0xc0) != 0x80) {
117                         return throwBadUtf8(v1, at + 1);
118                     }
119                     int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
120                     if ((value != 0) && (value < 0x80)) {
121                         /*
122                          * This should have been represented with
123                          * one-byte encoding.
124                          */
125                         return throwBadUtf8(v1, at + 1);
126                     }
127                     out = (char) value;
128                     at += 2;
129                     break;
130                 }
131                 case 0x0e: {
132                     // 1110XXXX -- three-byte encoding
133                     length -= 3;
134                     if (length < 0) {
135                         return throwBadUtf8(v0, at);
136                     }
137                     int v1 = bytes[at + 1] & 0xFF;
138                     if ((v1 & 0xc0) != 0x80) {
139                         return throwBadUtf8(v1, at + 1);
140                     }
141                     int v2 = bytes[at + 2] & 0xFF;
142                     if ((v2 & 0xc0) != 0x80) {
143                         return throwBadUtf8(v2, at + 2);
144                     }
145                     int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
146                             (v2 & 0x3f);
147                     if (value < 0x800) {
148                         /*
149                          * This should have been represented with one- or
150                          * two-byte encoding.
151                          */
152                         return throwBadUtf8(v2, at + 2);
153                     }
154                     out = (char) value;
155                     at += 3;
156                     break;
157                 }
158                 default: {
159                     // 10XXXXXX, 1111XXXX -- illegal
160                     return throwBadUtf8(v0, at);
161                 }
162             }
163             chars[outAt] = out;
164             outAt++;
165         }
166 
167         return new String(chars, 0, outAt);
168     }
169 
170     /**
171      * Converts an array of UTF-8 bytes into a string.
172      *
173      * @param bytes non-null; the bytes to convert
174      * @param start the start index of the utf8 string to convert
175      * @param utf16Length the number of utf16 characters in the string to decode
176      * @return non-null; the converted string
177      */
utf8BytesWithUtf16LengthToString(@onnull byte[] bytes, int start, int utf16Length)178     public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length) {
179         return utf8BytesWithUtf16LengthToString(bytes, start, utf16Length, null);
180     }
181 
182     /**
183      * Converts an array of UTF-8 bytes into a string.
184      *
185      * @param bytes non-null; the bytes to convert
186      * @param start the start index of the utf8 string to convert
187      * @param utf16Length the number of utf16 characters in the string to decode
188      * @param readLength If non-null, the first element will contain the number of bytes read after the method exits
189      * @return non-null; the converted string
190      */
utf8BytesWithUtf16LengthToString(@onnull byte[] bytes, int start, int utf16Length, @Nullable int[] readLength)191     public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length,
192                                                           @Nullable int[] readLength) {
193         char[] chars = localBuffer.get();
194         if (chars == null || chars.length < utf16Length) {
195             chars = new char[utf16Length];
196             localBuffer.set(chars);
197         }
198         int outAt = 0;
199 
200         int at = 0;
201         for (at = start; utf16Length > 0; utf16Length--) {
202             int v0 = bytes[at] & 0xFF;
203             char out;
204             switch (v0 >> 4) {
205                 case 0x00: case 0x01: case 0x02: case 0x03:
206                 case 0x04: case 0x05: case 0x06: case 0x07: {
207                     // 0XXXXXXX -- single-byte encoding
208                     if (v0 == 0) {
209                         // A single zero byte is illegal.
210                         return throwBadUtf8(v0, at);
211                     }
212                     out = (char) v0;
213                     at++;
214                     break;
215                 }
216                 case 0x0c: case 0x0d: {
217                     // 110XXXXX -- two-byte encoding
218                     int v1 = bytes[at + 1] & 0xFF;
219                     if ((v1 & 0xc0) != 0x80) {
220                         return throwBadUtf8(v1, at + 1);
221                     }
222                     int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
223                     if ((value != 0) && (value < 0x80)) {
224                         /*
225                          * This should have been represented with
226                          * one-byte encoding.
227                          */
228                         return throwBadUtf8(v1, at + 1);
229                     }
230                     out = (char) value;
231                     at += 2;
232                     break;
233                 }
234                 case 0x0e: {
235                     // 1110XXXX -- three-byte encoding
236                     int v1 = bytes[at + 1] & 0xFF;
237                     if ((v1 & 0xc0) != 0x80) {
238                         return throwBadUtf8(v1, at + 1);
239                     }
240                     int v2 = bytes[at + 2] & 0xFF;
241                     if ((v2 & 0xc0) != 0x80) {
242                         return throwBadUtf8(v2, at + 2);
243                     }
244                     int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
245                         (v2 & 0x3f);
246                     if (value < 0x800) {
247                         /*
248                          * This should have been represented with one- or
249                          * two-byte encoding.
250                          */
251                         return throwBadUtf8(v2, at + 2);
252                     }
253                     out = (char) value;
254                     at += 3;
255                     break;
256                 }
257                 default: {
258                     // 10XXXXXX, 1111XXXX -- illegal
259                     return throwBadUtf8(v0, at);
260                 }
261             }
262             chars[outAt] = out;
263             outAt++;
264         }
265 
266         if (readLength != null && readLength.length > 0) {
267             readLength[0] = at - start;
268             readLength[0] = at - start;
269         }
270         return new String(chars, 0, outAt);
271     }
272 
273     /**
274      * Helper for {@link #utf8BytesToString}, which throws the right
275      * exception for a bogus utf-8 byte.
276      *
277      * @param value the byte value
278      * @param offset the file offset
279      * @return never
280      * @throws IllegalArgumentException always thrown
281      */
throwBadUtf8(int value, int offset)282     private static String throwBadUtf8(int value, int offset) {
283         throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) +
284                                            " at offset " + Hex.u4(offset));
285     }
286 }
287