1 /* 2 * Copyright 2014 Google Inc. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.flatbuffers; 18 19 import java.nio.ByteBuffer; 20 21 import static java.lang.Character.MIN_HIGH_SURROGATE; 22 import static java.lang.Character.MIN_LOW_SURROGATE; 23 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; 24 25 public abstract class Utf8 { 26 27 /** 28 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 29 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 30 * both time and space. 31 * 32 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 33 * surrogates) 34 */ encodedLength(CharSequence sequence)35 public abstract int encodedLength(CharSequence sequence); 36 37 /** 38 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding. 39 * 40 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 41 * and the capabilities of the platform. 42 * 43 * @param in the source string to be encoded 44 * @param out the target buffer to receive the encoded string. 45 */ encodeUtf8(CharSequence in, ByteBuffer out)46 public abstract void encodeUtf8(CharSequence in, ByteBuffer out); 47 48 /** 49 * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}. 50 * 51 * @throws IllegalArgumentException if the input is not valid UTF-8. 52 */ decodeUtf8(ByteBuffer buffer, int offset, int length)53 public abstract String decodeUtf8(ByteBuffer buffer, int offset, int length); 54 55 private static Utf8 DEFAULT; 56 57 /** 58 * Get the default UTF-8 processor. 59 * @return the default processor 60 */ getDefault()61 public static Utf8 getDefault() { 62 if (DEFAULT == null) { 63 DEFAULT = new Utf8Safe(); 64 } 65 return DEFAULT; 66 } 67 68 /** 69 * Set the default instance of the UTF-8 processor. 70 * @param instance the new instance to use 71 */ setDefault(Utf8 instance)72 public static void setDefault(Utf8 instance) { 73 DEFAULT = instance; 74 } 75 76 /** 77 * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting 78 * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity 79 * checks and codepoint conversion happen in this class. 80 */ 81 static class DecodeUtil { 82 83 /** 84 * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. 85 */ isOneByte(byte b)86 static boolean isOneByte(byte b) { 87 return b >= 0; 88 } 89 90 /** 91 * Returns whether this is a two-byte codepoint with the form '10XXXXXX'. 92 */ isTwoBytes(byte b)93 static boolean isTwoBytes(byte b) { 94 return b < (byte) 0xE0; 95 } 96 97 /** 98 * Returns whether this is a three-byte codepoint with the form '110XXXXX'. 99 */ isThreeBytes(byte b)100 static boolean isThreeBytes(byte b) { 101 return b < (byte) 0xF0; 102 } 103 handleOneByte(byte byte1, char[] resultArr, int resultPos)104 static void handleOneByte(byte byte1, char[] resultArr, int resultPos) { 105 resultArr[resultPos] = (char) byte1; 106 } 107 handleTwoBytes( byte byte1, byte byte2, char[] resultArr, int resultPos)108 static void handleTwoBytes( 109 byte byte1, byte byte2, char[] resultArr, int resultPos) 110 throws IllegalArgumentException { 111 // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and 112 // overlong 2-byte, '11000001'. 113 if (byte1 < (byte) 0xC2 114 || isNotTrailingByte(byte2)) { 115 throw new IllegalArgumentException("Invalid UTF-8"); 116 } 117 resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2)); 118 } 119 handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)120 static void handleThreeBytes( 121 byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos) 122 throws IllegalArgumentException { 123 if (isNotTrailingByte(byte2) 124 // overlong? 5 most significant bits must not all be zero 125 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 126 // check for illegal surrogate codepoints 127 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 128 || isNotTrailingByte(byte3)) { 129 throw new IllegalArgumentException("Invalid UTF-8"); 130 } 131 resultArr[resultPos] = (char) 132 (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3)); 133 } 134 handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)135 static void handleFourBytes( 136 byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos) 137 throws IllegalArgumentException{ 138 if (isNotTrailingByte(byte2) 139 // Check that 1 <= plane <= 16. Tricky optimized form of: 140 // valid 4-byte leading byte? 141 // if (byte1 > (byte) 0xF4 || 142 // overlong? 4 most significant bits must not all be zero 143 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 144 // codepoint larger than the highest code point (U+10FFFF)? 145 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 146 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 147 || isNotTrailingByte(byte3) 148 || isNotTrailingByte(byte4)) { 149 throw new IllegalArgumentException("Invalid UTF-8"); 150 } 151 int codepoint = ((byte1 & 0x07) << 18) 152 | (trailingByteValue(byte2) << 12) 153 | (trailingByteValue(byte3) << 6) 154 | trailingByteValue(byte4); 155 resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint); 156 resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint); 157 } 158 159 /** 160 * Returns whether the byte is not a valid continuation of the form '10XXXXXX'. 161 */ isNotTrailingByte(byte b)162 private static boolean isNotTrailingByte(byte b) { 163 return b > (byte) 0xBF; 164 } 165 166 /** 167 * Returns the actual value of the trailing byte (removes the prefix '10') for composition. 168 */ trailingByteValue(byte b)169 private static int trailingByteValue(byte b) { 170 return b & 0x3F; 171 } 172 highSurrogate(int codePoint)173 private static char highSurrogate(int codePoint) { 174 return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) 175 + (codePoint >>> 10)); 176 } 177 lowSurrogate(int codePoint)178 private static char lowSurrogate(int codePoint) { 179 return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff)); 180 } 181 } 182 183 // These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with a modification to throw 184 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 185 // fallback to more lenient behavior. 186 static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length)187 UnpairedSurrogateException(int index, int length) { 188 super("Unpaired surrogate at index " + index + " of " + length); 189 } 190 } 191 } 192