android-14.0.0_r21/s

/*
 * Copyright 2021 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
@file:Suppress("NOTHING_TO_INLINE")
package com.google.flatbuffers.kotlin

public object Utf8 {
  /**
   * Returns the number of bytes in the UTF-8-encoded form of `sequence`. For a string,
   * this method is equivalent to `string.getBytes(UTF_8).length`, but is more efficient in
   * both time and space.
   *
   * @throws IllegalArgumentException if `sequence` contains ill-formed UTF-16 (unpaired
   * surrogates)
   */
  private fun computeEncodedLength(sequence: CharSequence): Int {
    // Warning to maintainers: this implementation is highly optimized.
    val utf16Length = sequence.length
    var utf8Length = utf16Length
    var i = 0

    // This loop optimizes for pure ASCII.
    while (i < utf16Length && sequence[i].toInt() < 0x80) {
      i++
    }

    // This loop optimizes for chars less than 0x800.
    while (i < utf16Length) {
      val c = sequence[i]
      if (c.toInt() < 0x800) {
        utf8Length += 0x7f - c.toInt() ushr 31 // branch free!
      } else {
        utf8Length += encodedLengthGeneral(sequence, i)
        break
      }
      i++
    }
    if (utf8Length < utf16Length) {
      // Necessary and sufficient condition for overflow because of maximum 3x expansion
      error("UTF-8 length does not fit in int: ${(utf8Length + (1L shl 32))}")
    }
    return utf8Length
  }

  private fun encodedLengthGeneral(sequence: CharSequence, start: Int): Int {
    val utf16Length = sequence.length
    var utf8Length = 0
    var i = start
    while (i < utf16Length) {
      val c = sequence[i]
      if (c.toInt() < 0x800) {
        utf8Length += 0x7f - c.toInt() ushr 31 // branch free!
      } else {
        utf8Length += 2
        if (c.isSurrogate()) {
          // Check that we have a well-formed surrogate pair.
          val cp: Int = codePointAt(sequence, i)
          if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
            errorSurrogate(i, utf16Length)
          }
          i++
        }
      }
      i++
    }
    return utf8Length
  }

  /**
   * Returns the number of bytes in the UTF-8-encoded form of `sequence`. For a string,
   * this method is equivalent to `string.getBytes(UTF_8).length`, but is more efficient in
   * both time and space.
   *
   * @throws IllegalArgumentException if `sequence` contains ill-formed UTF-16 (unpaired
   * surrogates)
   */
  public fun encodedLength(sequence: CharSequence): Int = computeEncodedLength(sequence)

  /**
   * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
   */
  public inline fun isOneByte(b: Byte): Boolean = b >= 0

  /**
   * Returns whether this is a two-byte codepoint with the form 110xxxxx  0xC0..0xDF.
   */
  public inline fun isTwoBytes(b: Byte): Boolean = b < 0xE0.toByte()

  /**
   * Returns whether this is a three-byte codepoint with the form 1110xxxx  0xE0..0xEF.
   */
  public inline fun isThreeBytes(b: Byte): Boolean = b < 0xF0.toByte()

  /**
   * Returns whether this is a four-byte codepoint with the form 11110xxx  0xF0..0xF4.
   */
  public inline fun isFourByte(b: Byte): Boolean = b < 0xF8.toByte()

  public fun handleOneByte(byte1: Byte, resultArr: CharArray, resultPos: Int) {
    resultArr[resultPos] = byte1.toChar()
  }

  public fun handleTwoBytes(
    byte1: Byte,
    byte2: Byte,
    resultArr: CharArray,
    resultPos: Int
  ) {
    // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
    // overlong 2-byte, '11000001'.
    if (byte1 < 0xC2.toByte()) {
      error("Invalid UTF-8: Illegal leading byte in 2 bytes utf")
    }
    if (isNotTrailingByte(byte2)) {
      error("Invalid UTF-8: Illegal trailing byte in 2 bytes utf")
    }
    resultArr[resultPos] = (byte1.toInt() and 0x1F shl 6 or trailingByteValue(byte2)).toChar()
  }

  public fun handleThreeBytes(
    byte1: Byte,
    byte2: Byte,
    byte3: Byte,
    resultArr: CharArray,
    resultPos: Int
  ) {
    if (isNotTrailingByte(byte2) || // overlong? 5 most significant bits must not all be zero
      byte1 == 0xE0.toByte() && byte2 < 0xA0.toByte() || // check for illegal surrogate codepoints
      byte1 == 0xED.toByte() && byte2 >= 0xA0.toByte() ||
      isNotTrailingByte(byte3)
    ) {
      error("Invalid UTF-8")
    }
    resultArr[resultPos] =
      (byte1.toInt() and 0x0F shl 12 or (trailingByteValue(byte2) shl 6) or trailingByteValue(byte3)).toChar()
  }

  public fun handleFourBytes(
    byte1: Byte,
    byte2: Byte,
    byte3: Byte,
    byte4: Byte,
    resultArr: CharArray,
    resultPos: Int
  ) {
    if (isNotTrailingByte(byte2) || // Check that 1 <= plane <= 16.  Tricky optimized form of:
      //   valid 4-byte leading byte?
      // if (byte1 > (byte) 0xF4 ||
      //   overlong? 4 most significant bits must not all be zero
      //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
      //   codepoint larger than the highest code point (U+10FFFF)?
      //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
      (byte1.toInt() shl 28) + (byte2 - 0x90.toByte()) shr 30 != 0 || isNotTrailingByte(byte3) ||
      isNotTrailingByte(byte4)
    ) {
      error("Invalid UTF-8")
    }
    val codepoint: Int = (
      byte1.toInt() and 0x07 shl 18
        or (trailingByteValue(byte2) shl 12)
        or (trailingByteValue(byte3) shl 6)
        or trailingByteValue(byte4)
      )
    resultArr[resultPos] = highSurrogate(codepoint)
    resultArr[resultPos + 1] = lowSurrogate(codepoint)
  }

  /**
   * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
   */
  private fun isNotTrailingByte(b: Byte): Boolean = b > 0xBF.toByte()

  /**
   * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
   */
  private fun trailingByteValue(b: Byte): Int = b.toInt() and 0x3F

  private fun highSurrogate(codePoint: Int): Char =
    (
      Char.MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT ushr 10) +
        (codePoint ushr 10)
      )

  private fun lowSurrogate(codePoint: Int): Char = (Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff))

  /**
   * Encode a [CharSequence] UTF8 codepoint into a byte array.
   * @param `in` CharSequence to be encoded
   * @param start start position of the first char in the codepoint
   * @param out byte array of 4 bytes to be filled
   * @return return the amount of bytes occupied by the codepoint
   */
  public fun encodeUtf8CodePoint(input: CharSequence, start: Int, out: ByteArray): Int {
    // utf8 codepoint needs at least 4 bytes
    val inLength = input.length
    if (start >= inLength) {
      return 0
    }
    val c = input[start]
    return if (c.toInt() < 0x80) {
      // One byte (0xxx xxxx)
      out[0] = c.toByte()
      1
    } else if (c.toInt() < 0x800) {
      // Two bytes (110x xxxx 10xx xxxx)
      out[0] = (0xC0 or (c.toInt() ushr 6)).toByte()
      out[1] = (0x80 or (0x3F and c.toInt())).toByte()
      2
    } else if (c < Char.MIN_SURROGATE || Char.MAX_SURROGATE < c) {
      // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
      // Maximum single-char code point is 0xFFFF, 16 bits.
      out[0] = (0xE0 or (c.toInt() ushr 12)).toByte()
      out[1] = (0x80 or (0x3F and (c.toInt() ushr 6))).toByte()
      out[2] = (0x80 or (0x3F and c.toInt())).toByte()
      3
    } else {
      // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
      // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
      // bytes
      val low: Char = input[start + 1]
      if (start + 1 == inLength || !(c.isHighSurrogate() and low.isLowSurrogate())) {
        errorSurrogate(start, inLength)
      }
      val codePoint: Int = toCodePoint(c, low)
      out[0] = (0xF shl 4 or (codePoint ushr 18)).toByte()
      out[1] = (0x80 or (0x3F and (codePoint ushr 12))).toByte()
      out[2] = (0x80 or (0x3F and (codePoint ushr 6))).toByte()
      out[3] = (0x80 or (0x3F and codePoint)).toByte()
      4
    }
  }

  // Decodes a code point starting at index into out. Out parameter
  // should have at least 2 chars.
  public fun decodeUtf8CodePoint(bytes: ReadBuffer, index: Int, out: CharArray) {
    // Bitwise OR combines the sign bits so any negative value fails the check.
    val b1 = bytes[index]
    when {
      isOneByte(b1) -> handleOneByte(b1, out, 0)
      isTwoBytes(b1) -> handleTwoBytes(b1, bytes[index + 1], out, 0)
      isThreeBytes(b1) -> handleThreeBytes(b1, bytes[index + 1], bytes[index + 2], out, 0)
      else -> handleFourBytes(b1, bytes[index + 1], bytes[index + 2], bytes[index + 3], out, 0)
    }
  }

  public fun decodeUtf8Array(bytes: ByteArray, index: Int = 0, size: Int = bytes.size): String {
    // Bitwise OR combines the sign bits so any negative value fails the check.
    if (index or size or bytes.size - index - size < 0) {
      error("buffer length=${bytes.size}, index=$index, size=$size")
    }
    var offset = index
    val limit = offset + size

    // The longest possible resulting String is the same as the number of input bytes, when it is
    // all ASCII. For other cases, this over-allocates and we will truncate in the end.
    val resultArr = CharArray(size)
    var resultPos = 0

    // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
    // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
    while (offset < limit) {
      val b = bytes[offset]
      if (!isOneByte(b)) {
        break
      }
      offset++
      handleOneByte(b, resultArr, resultPos++)
    }
    while (offset < limit) {
      val byte1 = bytes[offset++]
      if (isOneByte(byte1)) {
        handleOneByte(byte1, resultArr, resultPos++)
        // It's common for there to be multiple ASCII characters in a run mixed in, so add an
        // extra optimized loop to take care of these runs.
        while (offset < limit) {
          val b = bytes[offset]
          if (!isOneByte(b)) {
            break
          }
          offset++
          handleOneByte(b, resultArr, resultPos++)
        }
      } else if (isTwoBytes(byte1)) {
        if (offset >= limit) {
          error("Invalid UTF-8")
        }
        handleTwoBytes(
          byte1, /* byte2 */
          bytes[offset++], resultArr, resultPos++
        )
      } else if (isThreeBytes(byte1)) {
        if (offset >= limit - 1) {
          error("Invalid UTF-8")
        }
        handleThreeBytes(
          byte1, /* byte2 */
          bytes[offset++], /* byte3 */
          bytes[offset++],
          resultArr,
          resultPos++
        )
      } else {
        if (offset >= limit - 2) {
          error("Invalid UTF-8")
        }
        handleFourBytes(
          byte1, /* byte2 */
          bytes[offset++], /* byte3 */
          bytes[offset++], /* byte4 */
          bytes[offset++],
          resultArr,
          resultPos++
        )
        // 4-byte case requires two chars.
        resultPos++
      }
    }
    return resultArr.concatToString(0, resultPos)
  }

  public fun encodeUtf8Array(input: CharSequence, out: ByteArray, offset: Int = 0, length: Int = out.size - offset): Int {
    val utf16Length = input.length
    var j = offset
    var i = 0
    val limit = offset + length
    // Designed to take advantage of
    // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination

    if (utf16Length == 0)
      return 0
    var cc: Char = input[i]
    while (i < utf16Length && i + j < limit && input[i].also { cc = it }.toInt() < 0x80) {
      out[j + i] = cc.toByte()
      i++
    }
    if (i == utf16Length) {
      return j + utf16Length
    }
    j += i
    var c: Char
    while (i < utf16Length) {
      c = input[i]
      if (c.toInt() < 0x80 && j < limit) {
        out[j++] = c.toByte()
      } else if (c.toInt() < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes
        out[j++] = (0xF shl 6 or (c.toInt() ushr 6)).toByte()
        out[j++] = (0x80 or (0x3F and c.toInt())).toByte()
      } else if ((c < Char.MIN_SURROGATE || Char.MAX_SURROGATE < c) && j <= limit - 3) {
        // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
        out[j++] = (0xF shl 5 or (c.toInt() ushr 12)).toByte()
        out[j++] = (0x80 or (0x3F and (c.toInt() ushr 6))).toByte()
        out[j++] = (0x80 or (0x3F and c.toInt())).toByte()
      } else if (j <= limit - 4) {
        // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
        // four UTF-8 bytes
        var low: Char = Char.MIN_VALUE
        if (i + 1 == input.length ||
          !isSurrogatePair(c, input[++i].also { low = it })
        ) {
          errorSurrogate(i - 1, utf16Length)
        }
        val codePoint: Int = toCodePoint(c, low)
        out[j++] = (0xF shl 4 or (codePoint ushr 18)).toByte()
        out[j++] = (0x80 or (0x3F and (codePoint ushr 12))).toByte()
        out[j++] = (0x80 or (0x3F and (codePoint ushr 6))).toByte()
        out[j++] = (0x80 or (0x3F and codePoint)).toByte()
      } else {
        // If we are surrogates and we're not a surrogate pair, always throw an
        // UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
        if (Char.MIN_SURROGATE <= c && c <= Char.MAX_SURROGATE &&
          (i + 1 == input.length || !isSurrogatePair(c, input[i + 1]))
        ) {
          errorSurrogate(i, utf16Length)
        }
        error("Failed writing character ${c.toShort().toString(radix = 16)} at index $j")
      }
      i++
    }
    return j
  }

  public fun codePointAt(seq: CharSequence, position: Int): Int {
    var index = position
    val c1 = seq[index]
    if (c1.isHighSurrogate() && ++index < seq.length) {
      val c2 = seq[index]
      if (c2.isLowSurrogate()) {
        return toCodePoint(c1, c2)
      }
    }
    return c1.toInt()
  }

  private fun isSurrogatePair(high: Char, low: Char) = high.isHighSurrogate() and low.isLowSurrogate()

  private fun toCodePoint(high: Char, low: Char): Int = (high.toInt() shl 10) + low.toInt() +
    (MIN_SUPPLEMENTARY_CODE_POINT - (Char.MIN_HIGH_SURROGATE.toInt() shl 10) - Char.MIN_LOW_SURROGATE.toInt())

  private fun errorSurrogate(i: Int, utf16Length: Int): Unit =
    error("Unpaired surrogate at index $i of $utf16Length length")

  // The minimum value of Unicode supplementary code point, constant `U+10000`.
  private const val MIN_SUPPLEMENTARY_CODE_POINT = 0x010000
}