/*
 * Copyright 2014 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package com.google.flatbuffers;
18 
19 import java.nio.ByteBuffer;
20 
21 import static java.lang.Character.MIN_HIGH_SURROGATE;
22 import static java.lang.Character.MIN_LOW_SURROGATE;
23 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
24 
25 public abstract class Utf8 {
26 
27   /**
28    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
29    * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
30    * both time and space.
31    *
32    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
33    *     surrogates)
34    */
encodedLength(CharSequence sequence)35   public abstract int encodedLength(CharSequence sequence);
36 
37   /**
38    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
39    *
40    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
41    * and the capabilities of the platform.
42    *
43    * @param in the source string to be encoded
44    * @param out the target buffer to receive the encoded string.
45    */
encodeUtf8(CharSequence in, ByteBuffer out)46   public abstract void encodeUtf8(CharSequence in, ByteBuffer out);
47 
48   /**
49    * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
50    *
51    * @throws IllegalArgumentException if the input is not valid UTF-8.
52    */
decodeUtf8(ByteBuffer buffer, int offset, int length)53   public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length);
54 
55   private static Utf8 DEFAULT;
56 
57   /**
58    * Get the default UTF-8 processor.
59    * @return the default processor
60    */
getDefault()61   public static Utf8 getDefault() {
62     if (DEFAULT == null) {
63       DEFAULT = new Utf8Safe();
64     }
65     return DEFAULT;
66   }
67 
68   /**
69    * Set the default instance of the UTF-8 processor.
70    * @param instance the new instance to use
71    */
setDefault(Utf8 instance)72   public static void setDefault(Utf8 instance) {
73     DEFAULT = instance;
74   }
75 
76   /**
77    * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
78    * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
79    * checks and codepoint conversion happen in this class.
80    */
81   static class DecodeUtil {
82 
83     /**
84      * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
85      */
isOneByte(byte b)86     static boolean isOneByte(byte b) {
87       return b >= 0;
88     }
89 
90     /**
91      * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
92      */
isTwoBytes(byte b)93     static boolean isTwoBytes(byte b) {
94       return b < (byte) 0xE0;
95     }
96 
97     /**
98      * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
99      */
isThreeBytes(byte b)100     static boolean isThreeBytes(byte b) {
101       return b < (byte) 0xF0;
102     }
103 
handleOneByte(byte byte1, char[] resultArr, int resultPos)104     static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
105       resultArr[resultPos] = (char) byte1;
106     }
107 
handleTwoBytes( byte byte1, byte byte2, char[] resultArr, int resultPos)108     static void handleTwoBytes(
109         byte byte1, byte byte2, char[] resultArr, int resultPos)
110         throws IllegalArgumentException {
111       // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
112       // overlong 2-byte, '11000001'.
113       if (byte1 < (byte) 0xC2
114               || isNotTrailingByte(byte2)) {
115         throw new IllegalArgumentException("Invalid UTF-8");
116       }
117       resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
118     }
119 
handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)120     static void handleThreeBytes(
121         byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
122         throws IllegalArgumentException {
123       if (isNotTrailingByte(byte2)
124               // overlong? 5 most significant bits must not all be zero
125               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
126               // check for illegal surrogate codepoints
127               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
128               || isNotTrailingByte(byte3)) {
129         throw new IllegalArgumentException("Invalid UTF-8");
130       }
131       resultArr[resultPos] = (char)
132                                  (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
133     }
134 
handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)135     static void handleFourBytes(
136         byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
137         throws IllegalArgumentException{
138       if (isNotTrailingByte(byte2)
139               // Check that 1 <= plane <= 16.  Tricky optimized form of:
140               //   valid 4-byte leading byte?
141               // if (byte1 > (byte) 0xF4 ||
142               //   overlong? 4 most significant bits must not all be zero
143               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
144               //   codepoint larger than the highest code point (U+10FFFF)?
145               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
146               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
147               || isNotTrailingByte(byte3)
148               || isNotTrailingByte(byte4)) {
149         throw new IllegalArgumentException("Invalid UTF-8");
150       }
151       int codepoint = ((byte1 & 0x07) << 18)
152                           | (trailingByteValue(byte2) << 12)
153                           | (trailingByteValue(byte3) << 6)
154                           | trailingByteValue(byte4);
155       resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
156       resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
157     }
158 
159     /**
160      * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
161      */
isNotTrailingByte(byte b)162     private static boolean isNotTrailingByte(byte b) {
163       return b > (byte) 0xBF;
164     }
165 
166     /**
167      * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
168      */
trailingByteValue(byte b)169     private static int trailingByteValue(byte b) {
170       return b & 0x3F;
171     }
172 
highSurrogate(int codePoint)173     private static char highSurrogate(int codePoint) {
174       return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
175                          + (codePoint >>> 10));
176     }
177 
lowSurrogate(int codePoint)178     private static char lowSurrogate(int codePoint) {
179       return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
180     }
181   }
182 
183   // These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw
184   // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
185   // fallback to more lenient behavior.
186   static class UnpairedSurrogateException extends IllegalArgumentException {
UnpairedSurrogateException(int index, int length)187     UnpairedSurrogateException(int index, int length) {
188       super("Unpaired surrogate at index " + index + " of " + length);
189     }
190   }
191 }
192