/*
 * Copyright 2021 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
@file:Suppress("NOTHING_TO_INLINE")
package com.google.flatbuffers.kotlin

/**
 * UTF-8 <-> UTF-16 conversion helpers.
 *
 * All validation failures are reported through [error], i.e. as [IllegalStateException].
 */
public object Utf8 {
  /**
   * Returns the number of bytes in the UTF-8-encoded form of `sequence`. For a string,
   * this method is equivalent to `string.getBytes(UTF_8).length`, but is more efficient in
   * both time and space.
   *
   * @throws IllegalStateException if `sequence` contains ill-formed UTF-16 (unpaired
   * surrogates) or if the encoded length does not fit in an [Int]
   */
  private fun computeEncodedLength(sequence: CharSequence): Int {
    // Warning to maintainers: this implementation is highly optimized.
    val utf16Length = sequence.length
    // Start at one byte per UTF-16 unit; the loops below only add the *extra* bytes.
    var utf8Length = utf16Length
    var i = 0

    // This loop optimizes for pure ASCII.
    while (i < utf16Length && sequence[i].toInt() < 0x80) {
      i++
    }

    // This loop optimizes for chars less than 0x800.
    while (i < utf16Length) {
      val c = sequence[i]
      if (c.toInt() < 0x800) {
        // (0x7f - c) is negative exactly when c >= 0x80, so its sign bit is the one extra byte.
        utf8Length += (0x7f - c.toInt()) ushr 31 // branch free!
      } else {
        utf8Length += encodedLengthGeneral(sequence, i)
        break
      }
      i++
    }
    if (utf8Length < utf16Length) {
      // Necessary and sufficient condition for overflow because of maximum 3x expansion
      error("UTF-8 length does not fit in int: ${(utf8Length + (1L shl 32))}")
    }
    return utf8Length
  }

  /**
   * Returns the number of bytes, beyond one byte per UTF-16 unit, needed to encode the chars of
   * `sequence` from index `start` to its end.
   *
   * @throws IllegalStateException if an unpaired surrogate is found
   */
  private fun encodedLengthGeneral(sequence: CharSequence, start: Int): Int {
    val utf16Length = sequence.length
    var utf8Length = 0
    var i = start
    while (i < utf16Length) {
      val c = sequence[i]
      if (c.toInt() < 0x800) {
        utf8Length += (0x7f - c.toInt()) ushr 31 // branch free!
      } else {
        // Three-byte char: 2 extra bytes. Surrogate pair: 2 chars -> 4 bytes, also 2 extra.
        utf8Length += 2
        if (c.isSurrogate()) {
          // Check that we have a well-formed surrogate pair.
          val cp: Int = codePointAt(sequence, i)
          if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
            errorSurrogate(i, utf16Length)
          }
          // Skip the low surrogate that was consumed as part of the pair.
          i++
        }
      }
      i++
    }
    return utf8Length
  }

  /**
   * Returns the number of bytes in the UTF-8-encoded form of `sequence`. For a string,
   * this method is equivalent to `string.getBytes(UTF_8).length`, but is more efficient in
   * both time and space.
   *
   * @throws IllegalStateException if `sequence` contains ill-formed UTF-16 (unpaired
   * surrogates) or if the encoded length does not fit in an [Int]
   */
  public fun encodedLength(sequence: CharSequence): Int = computeEncodedLength(sequence)

  /**
   * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
   */
  public inline fun isOneByte(b: Byte): Boolean = b >= 0

  /**
   * Returns whether this is a two-byte codepoint with the form 110xxxxx 0xC0..0xDF.
   *
   * The comparison is signed, so this is only meaningful after [isOneByte] has been ruled out;
   * it is also true for continuation bytes (0x80..0xBF), which [handleTwoBytes] rejects.
   */
  public inline fun isTwoBytes(b: Byte): Boolean = b < 0xE0.toByte()

  /**
   * Returns whether this is a three-byte codepoint with the form 1110xxxx 0xE0..0xEF.
   *
   * Only meaningful after [isOneByte] and [isTwoBytes] have been ruled out.
   */
  public inline fun isThreeBytes(b: Byte): Boolean = b < 0xF0.toByte()

  /**
   * Returns whether this is a four-byte codepoint with the form 11110xxx 0xF0..0xF4.
   *
   * Only meaningful after [isOneByte], [isTwoBytes] and [isThreeBytes] have been ruled out.
   */
  public inline fun isFourByte(b: Byte): Boolean = b < 0xF8.toByte()

  /**
   * Writes the single-byte (ASCII) character `byte1` into `resultArr[resultPos]`.
   */
  public fun handleOneByte(byte1: Byte, resultArr: CharArray, resultPos: Int) {
    resultArr[resultPos] = byte1.toChar()
  }

  /**
   * Validates and decodes a two-byte UTF-8 sequence into `resultArr[resultPos]`.
   *
   * @throws IllegalStateException if the leading or trailing byte is illegal
   */
  public fun handleTwoBytes(
    byte1: Byte,
    byte2: Byte,
    resultArr: CharArray,
    resultPos: Int
  ) {
    // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
    // overlong 2-byte, '11000001'.
    if (byte1 < 0xC2.toByte()) {
      error("Invalid UTF-8: Illegal leading byte in 2 bytes utf")
    }
    if (isNotTrailingByte(byte2)) {
      error("Invalid UTF-8: Illegal trailing byte in 2 bytes utf")
    }
    resultArr[resultPos] = (((byte1.toInt() and 0x1F) shl 6) or trailingByteValue(byte2)).toChar()
  }

  /**
   * Validates and decodes a three-byte UTF-8 sequence into `resultArr[resultPos]`.
   *
   * @throws IllegalStateException if a trailing byte is illegal, the sequence is overlong, or it
   * encodes a surrogate code point
   */
  public fun handleThreeBytes(
    byte1: Byte,
    byte2: Byte,
    byte3: Byte,
    resultArr: CharArray,
    resultPos: Int
  ) {
    if (isNotTrailingByte(byte2) ||
      // overlong? 5 most significant bits must not all be zero
      (byte1 == 0xE0.toByte() && byte2 < 0xA0.toByte()) ||
      // check for illegal surrogate codepoints
      (byte1 == 0xED.toByte() && byte2 >= 0xA0.toByte()) ||
      isNotTrailingByte(byte3)
    ) {
      error("Invalid UTF-8")
    }
    resultArr[resultPos] = (
      ((byte1.toInt() and 0x0F) shl 12) or
        (trailingByteValue(byte2) shl 6) or
        trailingByteValue(byte3)
      ).toChar()
  }

  /**
   * Validates and decodes a four-byte UTF-8 sequence into a surrogate pair stored at
   * `resultArr[resultPos]` (high) and `resultArr[resultPos + 1]` (low).
   *
   * @throws IllegalStateException if the sequence is not a valid four-byte encoding
   */
  public fun handleFourBytes(
    byte1: Byte,
    byte2: Byte,
    byte3: Byte,
    byte4: Byte,
    resultArr: CharArray,
    resultPos: Int
  ) {
    if (isNotTrailingByte(byte2) ||
      // Check that 1 <= plane <= 16. Tricky optimized form of:
      //   valid 4-byte leading byte?
      // if (byte1 > (byte) 0xF4 ||
      //   overlong? 4 most significant bits must not all be zero
      //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
      //   codepoint larger than the highest code point (U+10FFFF)?
      //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
      (((byte1.toInt() shl 28) + (byte2 - 0x90.toByte())) shr 30) != 0 ||
      isNotTrailingByte(byte3) ||
      isNotTrailingByte(byte4)
    ) {
      error("Invalid UTF-8")
    }
    val codepoint: Int =
      ((byte1.toInt() and 0x07) shl 18) or
        (trailingByteValue(byte2) shl 12) or
        (trailingByteValue(byte3) shl 6) or
        trailingByteValue(byte4)
    resultArr[resultPos] = highSurrogate(codepoint)
    resultArr[resultPos + 1] = lowSurrogate(codepoint)
  }

  /**
   * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
   */
  private fun isNotTrailingByte(b: Byte): Boolean = b > 0xBF.toByte()

  /**
   * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
   */
  private fun trailingByteValue(b: Byte): Int = b.toInt() and 0x3F

  /** Returns the high (leading) surrogate for the supplementary code point `codePoint`. */
  private fun highSurrogate(codePoint: Int): Char =
    Char.MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT ushr 10) + (codePoint ushr 10)

  /** Returns the low (trailing) surrogate for the supplementary code point `codePoint`. */
  private fun lowSurrogate(codePoint: Int): Char = Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff)

  /**
   * Encodes the code point that starts at `input[start]` as UTF-8 into the beginning of `out`.
   *
   * @param input CharSequence to be encoded
   * @param start start position of the first char in the codepoint
   * @param out byte array to be filled starting at index 0; a code point needs at most 4 bytes
   * @return the amount of bytes occupied by the codepoint, or 0 if `start` is at or past the end
   * of `input`
   * @throws IllegalStateException if the char at `start` is a surrogate that is not the high half
   * of a well-formed surrogate pair
   */
  public fun encodeUtf8CodePoint(input: CharSequence, start: Int, out: ByteArray): Int {
    val inLength = input.length
    if (start >= inLength) {
      return 0
    }
    val c = input[start]
    val code = c.toInt()
    return when {
      code < 0x80 -> {
        // One byte (0xxx xxxx)
        out[0] = c.toByte()
        1
      }
      code < 0x800 -> {
        // Two bytes (110x xxxx 10xx xxxx)
        out[0] = (0xC0 or (code ushr 6)).toByte()
        out[1] = (0x80 or (0x3F and code)).toByte()
        2
      }
      c < Char.MIN_SURROGATE || Char.MAX_SURROGATE < c -> {
        // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
        // Maximum single-char code point is 0xFFFF, 16 bits.
        out[0] = (0xE0 or (code ushr 12)).toByte()
        out[1] = (0x80 or (0x3F and (code ushr 6))).toByte()
        out[2] = (0x80 or (0x3F and code)).toByte()
        3
      }
      else -> {
        // Four bytes (1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx)
        // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
        // bytes.
        // The bounds check must come first: reading input[start + 1] for a trailing lone
        // surrogate would otherwise fail with an index error instead of the surrogate error.
        if (start + 1 >= inLength || !isSurrogatePair(c, input[start + 1])) {
          errorSurrogate(start, inLength)
        }
        val codePoint: Int = toCodePoint(c, input[start + 1])
        out[0] = (0xF0 or (codePoint ushr 18)).toByte()
        out[1] = (0x80 or (0x3F and (codePoint ushr 12))).toByte()
        out[2] = (0x80 or (0x3F and (codePoint ushr 6))).toByte()
        out[3] = (0x80 or (0x3F and codePoint)).toByte()
        4
      }
    }
  }

  /**
   * Decodes the UTF-8 code point starting at `bytes[index]` into `out`, starting at `out[0]`.
   * `out` should have room for at least 2 chars, because a four-byte sequence decodes to a
   * surrogate pair.
   *
   * No bounds check is done here on the continuation bytes read from `bytes`.
   *
   * @throws IllegalStateException if the bytes do not form a valid UTF-8 sequence
   */
  public fun decodeUtf8CodePoint(bytes: ReadBuffer, index: Int, out: CharArray) {
    val b1 = bytes[index]
    // The predicates are order-dependent: each one assumes the shorter forms were excluded.
    when {
      isOneByte(b1) -> handleOneByte(b1, out, 0)
      isTwoBytes(b1) -> handleTwoBytes(b1, bytes[index + 1], out, 0)
      isThreeBytes(b1) -> handleThreeBytes(b1, bytes[index + 1], bytes[index + 2], out, 0)
      else -> handleFourBytes(b1, bytes[index + 1], bytes[index + 2], bytes[index + 3], out, 0)
    }
  }

  /**
   * Decodes `size` bytes of UTF-8 from `bytes`, starting at `index`, into a [String].
   *
   * @param bytes source of UTF-8 encoded data
   * @param index position of the first byte to decode
   * @param size number of bytes to decode
   * @return the decoded string
   * @throws IllegalStateException if the range is invalid or the bytes are not valid UTF-8
   */
  public fun decodeUtf8Array(bytes: ByteArray, index: Int = 0, size: Int = bytes.size): String {
    // Bitwise OR combines the sign bits so any negative value fails the check.
    if ((index or size or (bytes.size - index - size)) < 0) {
      error("buffer length=${bytes.size}, index=$index, size=$size")
    }
    var offset = index
    val limit = offset + size

    // The longest possible resulting String is the same as the number of input bytes, when it is
    // all ASCII. For other cases, this over-allocates and we will truncate in the end.
    val resultArr = CharArray(size)
    var resultPos = 0

    // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
    // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
    while (offset < limit && isOneByte(bytes[offset])) {
      handleOneByte(bytes[offset++], resultArr, resultPos++)
    }

    while (offset < limit) {
      val byte1 = bytes[offset++]
      when {
        isOneByte(byte1) -> {
          handleOneByte(byte1, resultArr, resultPos++)
          // It's common for there to be multiple ASCII characters in a run mixed in, so add an
          // extra optimized loop to take care of these runs.
          while (offset < limit && isOneByte(bytes[offset])) {
            handleOneByte(bytes[offset++], resultArr, resultPos++)
          }
        }
        isTwoBytes(byte1) -> {
          // One continuation byte must remain.
          if (offset >= limit) {
            error("Invalid UTF-8")
          }
          handleTwoBytes(byte1, bytes[offset++], resultArr, resultPos++)
        }
        isThreeBytes(byte1) -> {
          // Two continuation bytes must remain.
          if (offset >= limit - 1) {
            error("Invalid UTF-8")
          }
          handleThreeBytes(byte1, bytes[offset++], bytes[offset++], resultArr, resultPos++)
        }
        else -> {
          // Three continuation bytes must remain.
          if (offset >= limit - 2) {
            error("Invalid UTF-8")
          }
          handleFourBytes(
            byte1,
            bytes[offset++],
            bytes[offset++],
            bytes[offset++],
            resultArr,
            resultPos++
          )
          // 4-byte case requires two chars.
          resultPos++
        }
      }
    }
    return resultArr.concatToString(0, resultPos)
  }

  /**
   * Encodes `input` as UTF-8 into `out`, writing into the window `[offset, offset + length)`.
   *
   * @param input chars to encode
   * @param out destination array
   * @param offset index in `out` of the first byte to write
   * @param length maximum number of bytes that may be written
   * @return the index in `out` just past the last byte written (`offset` for an empty `input`)
   * @throws IllegalStateException if `input` contains an unpaired surrogate or the encoded form
   * does not fit in the given window
   */
  public fun encodeUtf8Array(input: CharSequence, out: ByteArray, offset: Int = 0, length: Int = out.size - offset): Int {
    val utf16Length = input.length
    val limit = offset + length
    var j = offset
    var i = 0

    // Designed to take advantage of
    // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
    // ASCII fast path: copy the leading run of one-byte chars.
    while (i < utf16Length && j < limit && input[i].toInt() < 0x80) {
      out[j++] = input[i++].toByte()
    }
    if (i == utf16Length) {
      // Everything (possibly nothing) was ASCII; j is the end position, like every other return.
      return j
    }

    while (i < utf16Length) {
      val c = input[i]
      val code = c.toInt()
      when {
        code < 0x80 && j < limit -> {
          out[j++] = c.toByte()
        }
        code < 0x800 && j <= limit - 2 -> {
          // 11 bits, two UTF-8 bytes
          out[j++] = (0xC0 or (code ushr 6)).toByte()
          out[j++] = (0x80 or (0x3F and code)).toByte()
        }
        (c < Char.MIN_SURROGATE || Char.MAX_SURROGATE < c) && j <= limit - 3 -> {
          // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
          out[j++] = (0xE0 or (code ushr 12)).toByte()
          out[j++] = (0x80 or (0x3F and (code ushr 6))).toByte()
          out[j++] = (0x80 or (0x3F and code)).toByte()
        }
        j <= limit - 4 -> {
          // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
          // four UTF-8 bytes
          if (i + 1 == utf16Length || !isSurrogatePair(c, input[i + 1])) {
            // Always report the index of the offending surrogate itself.
            errorSurrogate(i, utf16Length)
          }
          val codePoint: Int = toCodePoint(c, input[i + 1])
          // Consume the low surrogate as well.
          i++
          out[j++] = (0xF0 or (codePoint ushr 18)).toByte()
          out[j++] = (0x80 or (0x3F and (codePoint ushr 12))).toByte()
          out[j++] = (0x80 or (0x3F and (codePoint ushr 6))).toByte()
          out[j++] = (0x80 or (0x3F and codePoint)).toByte()
        }
        else -> {
          // Not enough room left. If we are surrogates and we're not a surrogate pair, always
          // report the unpaired surrogate instead of the out-of-space failure.
          if (Char.MIN_SURROGATE <= c && c <= Char.MAX_SURROGATE &&
            (i + 1 == utf16Length || !isSurrogatePair(c, input[i + 1]))
          ) {
            errorSurrogate(i, utf16Length)
          }
          // Format via Int so chars >= 0x8000 are not printed as negative hex values.
          error("Failed writing character ${code.toString(radix = 16)} at index $j")
        }
      }
      i++
    }
    return j
  }

  /**
   * Returns the Unicode code point at `position` in `seq`.
   *
   * If the char at `position` is a high surrogate followed by a low surrogate, the combined
   * supplementary code point is returned; otherwise the char's own value is returned.
   */
  public fun codePointAt(seq: CharSequence, position: Int): Int {
    val c1 = seq[position]
    val next = position + 1
    if (c1.isHighSurrogate() && next < seq.length) {
      val c2 = seq[next]
      if (c2.isLowSurrogate()) {
        return toCodePoint(c1, c2)
      }
    }
    return c1.toInt()
  }

  /** Returns whether `high` followed by `low` forms a well-formed surrogate pair. */
  private fun isSurrogatePair(high: Char, low: Char) = high.isHighSurrogate() and low.isLowSurrogate()

  /** Combines a surrogate pair into its supplementary code point. */
  private fun toCodePoint(high: Char, low: Char): Int = (high.toInt() shl 10) + low.toInt() +
    (MIN_SUPPLEMENTARY_CODE_POINT - (Char.MIN_HIGH_SURROGATE.toInt() shl 10) - Char.MIN_LOW_SURROGATE.toInt())

  /** Always throws [IllegalStateException] reporting an unpaired surrogate at index `i`. */
  private fun errorSurrogate(i: Int, utf16Length: Int): Nothing =
    error("Unpaired surrogate at index $i of $utf16Length length")

  // The minimum value of Unicode supplementary code point, constant `U+10000`.
  private const val MIN_SUPPLEMENTARY_CODE_POINT = 0x010000
}