• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2021 Google Inc. All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 @file:Suppress("NOTHING_TO_INLINE")
17 package com.google.flatbuffers.kotlin
18 
19 public object Utf8 {
20   /**
21    * Returns the number of bytes in the UTF-8-encoded form of `sequence`. For a string,
22    * this method is equivalent to `string.getBytes(UTF_8).length`, but is more efficient in
23    * both time and space.
24    *
25    * @throws IllegalArgumentException if `sequence` contains ill-formed UTF-16 (unpaired
26    * surrogates)
27    */
computeEncodedLengthnull28   private fun computeEncodedLength(sequence: CharSequence): Int {
29     // Warning to maintainers: this implementation is highly optimized.
30     val utf16Length = sequence.length
31     var utf8Length = utf16Length
32     var i = 0
33 
34     // This loop optimizes for pure ASCII.
35     while (i < utf16Length && sequence[i].toInt() < 0x80) {
36       i++
37     }
38 
39     // This loop optimizes for chars less than 0x800.
40     while (i < utf16Length) {
41       val c = sequence[i]
42       if (c.toInt() < 0x800) {
43         utf8Length += 0x7f - c.toInt() ushr 31 // branch free!
44       } else {
45         utf8Length += encodedLengthGeneral(sequence, i)
46         break
47       }
48       i++
49     }
50     if (utf8Length < utf16Length) {
51       // Necessary and sufficient condition for overflow because of maximum 3x expansion
52       error("UTF-8 length does not fit in int: ${(utf8Length + (1L shl 32))}")
53     }
54     return utf8Length
55   }
56 
encodedLengthGeneralnull57   private fun encodedLengthGeneral(sequence: CharSequence, start: Int): Int {
58     val utf16Length = sequence.length
59     var utf8Length = 0
60     var i = start
61     while (i < utf16Length) {
62       val c = sequence[i]
63       if (c.toInt() < 0x800) {
64         utf8Length += 0x7f - c.toInt() ushr 31 // branch free!
65       } else {
66         utf8Length += 2
67         if (c.isSurrogate()) {
68           // Check that we have a well-formed surrogate pair.
69           val cp: Int = codePointAt(sequence, i)
70           if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
71             errorSurrogate(i, utf16Length)
72           }
73           i++
74         }
75       }
76       i++
77     }
78     return utf8Length
79   }
80 
81   /**
82    * Returns the number of bytes in the UTF-8-encoded form of `sequence`. For a string,
83    * this method is equivalent to `string.getBytes(UTF_8).length`, but is more efficient in
84    * both time and space.
85    *
86    * @throws IllegalArgumentException if `sequence` contains ill-formed UTF-16 (unpaired
87    * surrogates)
88    */
encodedLengthnull89   public fun encodedLength(sequence: CharSequence): Int = computeEncodedLength(sequence)
90 
91   /**
92    * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
93    */
94   public inline fun isOneByte(b: Byte): Boolean = b >= 0
95 
96   /**
97    * Returns whether this is a two-byte codepoint with the form 110xxxxx  0xC0..0xDF.
98    */
99   public inline fun isTwoBytes(b: Byte): Boolean = b < 0xE0.toByte()
100 
101   /**
102    * Returns whether this is a three-byte codepoint with the form 1110xxxx  0xE0..0xEF.
103    */
104   public inline fun isThreeBytes(b: Byte): Boolean = b < 0xF0.toByte()
105 
106   /**
107    * Returns whether this is a four-byte codepoint with the form 11110xxx  0xF0..0xF4.
108    */
109   public inline fun isFourByte(b: Byte): Boolean = b < 0xF8.toByte()
110 
111   public fun handleOneByte(byte1: Byte, resultArr: CharArray, resultPos: Int) {
112     resultArr[resultPos] = byte1.toChar()
113   }
114 
handleTwoBytesnull115   public fun handleTwoBytes(
116     byte1: Byte,
117     byte2: Byte,
118     resultArr: CharArray,
119     resultPos: Int
120   ) {
121     // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
122     // overlong 2-byte, '11000001'.
123     if (byte1 < 0xC2.toByte()) {
124       error("Invalid UTF-8: Illegal leading byte in 2 bytes utf")
125     }
126     if (isNotTrailingByte(byte2)) {
127       error("Invalid UTF-8: Illegal trailing byte in 2 bytes utf")
128     }
129     resultArr[resultPos] = (byte1.toInt() and 0x1F shl 6 or trailingByteValue(byte2)).toChar()
130   }
131 
handleThreeBytesnull132   public fun handleThreeBytes(
133     byte1: Byte,
134     byte2: Byte,
135     byte3: Byte,
136     resultArr: CharArray,
137     resultPos: Int
138   ) {
139     if (isNotTrailingByte(byte2) || // overlong? 5 most significant bits must not all be zero
140       byte1 == 0xE0.toByte() && byte2 < 0xA0.toByte() || // check for illegal surrogate codepoints
141       byte1 == 0xED.toByte() && byte2 >= 0xA0.toByte() ||
142       isNotTrailingByte(byte3)
143     ) {
144       error("Invalid UTF-8")
145     }
146     resultArr[resultPos] =
147       (byte1.toInt() and 0x0F shl 12 or (trailingByteValue(byte2) shl 6) or trailingByteValue(byte3)).toChar()
148   }
149 
handleFourBytesnull150   public fun handleFourBytes(
151     byte1: Byte,
152     byte2: Byte,
153     byte3: Byte,
154     byte4: Byte,
155     resultArr: CharArray,
156     resultPos: Int
157   ) {
158     if (isNotTrailingByte(byte2) || // Check that 1 <= plane <= 16.  Tricky optimized form of:
159       //   valid 4-byte leading byte?
160       // if (byte1 > (byte) 0xF4 ||
161       //   overlong? 4 most significant bits must not all be zero
162       //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
163       //   codepoint larger than the highest code point (U+10FFFF)?
164       //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
165       (byte1.toInt() shl 28) + (byte2 - 0x90.toByte()) shr 30 != 0 || isNotTrailingByte(byte3) ||
166       isNotTrailingByte(byte4)
167     ) {
168       error("Invalid UTF-8")
169     }
170     val codepoint: Int = (
171       byte1.toInt() and 0x07 shl 18
172         or (trailingByteValue(byte2) shl 12)
173         or (trailingByteValue(byte3) shl 6)
174         or trailingByteValue(byte4)
175       )
176     resultArr[resultPos] = highSurrogate(codepoint)
177     resultArr[resultPos + 1] = lowSurrogate(codepoint)
178   }
179 
180   /**
181    * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
182    */
isNotTrailingBytenull183   private fun isNotTrailingByte(b: Byte): Boolean = b > 0xBF.toByte()
184 
185   /**
186    * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
187    */
188   private fun trailingByteValue(b: Byte): Int = b.toInt() and 0x3F
189 
190   private fun highSurrogate(codePoint: Int): Char =
191     (
192       Char.MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT ushr 10) +
193         (codePoint ushr 10)
194       )
195 
196   private fun lowSurrogate(codePoint: Int): Char = (Char.MIN_LOW_SURROGATE + (codePoint and 0x3ff))
197 
198   /**
199    * Encode a [CharSequence] UTF8 codepoint into a byte array.
200    * @param `in` CharSequence to be encoded
201    * @param start start position of the first char in the codepoint
202    * @param out byte array of 4 bytes to be filled
203    * @return return the amount of bytes occupied by the codepoint
204    */
205   public fun encodeUtf8CodePoint(input: CharSequence, start: Int, out: ByteArray): Int {
206     // utf8 codepoint needs at least 4 bytes
207     val inLength = input.length
208     if (start >= inLength) {
209       return 0
210     }
211     val c = input[start]
212     return if (c.toInt() < 0x80) {
213       // One byte (0xxx xxxx)
214       out[0] = c.toByte()
215       1
216     } else if (c.toInt() < 0x800) {
217       // Two bytes (110x xxxx 10xx xxxx)
218       out[0] = (0xC0 or (c.toInt() ushr 6)).toByte()
219       out[1] = (0x80 or (0x3F and c.toInt())).toByte()
220       2
221     } else if (c < Char.MIN_SURROGATE || Char.MAX_SURROGATE < c) {
222       // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
223       // Maximum single-char code point is 0xFFFF, 16 bits.
224       out[0] = (0xE0 or (c.toInt() ushr 12)).toByte()
225       out[1] = (0x80 or (0x3F and (c.toInt() ushr 6))).toByte()
226       out[2] = (0x80 or (0x3F and c.toInt())).toByte()
227       3
228     } else {
229       // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
230       // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
231       // bytes
232       val low: Char = input[start + 1]
233       if (start + 1 == inLength || !(c.isHighSurrogate() and low.isLowSurrogate())) {
234         errorSurrogate(start, inLength)
235       }
236       val codePoint: Int = toCodePoint(c, low)
237       out[0] = (0xF shl 4 or (codePoint ushr 18)).toByte()
238       out[1] = (0x80 or (0x3F and (codePoint ushr 12))).toByte()
239       out[2] = (0x80 or (0x3F and (codePoint ushr 6))).toByte()
240       out[3] = (0x80 or (0x3F and codePoint)).toByte()
241       4
242     }
243   }
244 
245   // Decodes a code point starting at index into out. Out parameter
246   // should have at least 2 chars.
decodeUtf8CodePointnull247   public fun decodeUtf8CodePoint(bytes: ReadBuffer, index: Int, out: CharArray) {
248     // Bitwise OR combines the sign bits so any negative value fails the check.
249     val b1 = bytes[index]
250     when {
251       isOneByte(b1) -> handleOneByte(b1, out, 0)
252       isTwoBytes(b1) -> handleTwoBytes(b1, bytes[index + 1], out, 0)
253       isThreeBytes(b1) -> handleThreeBytes(b1, bytes[index + 1], bytes[index + 2], out, 0)
254       else -> handleFourBytes(b1, bytes[index + 1], bytes[index + 2], bytes[index + 3], out, 0)
255     }
256   }
257 
decodeUtf8Arraynull258   public fun decodeUtf8Array(bytes: ByteArray, index: Int = 0, size: Int = bytes.size): String {
259     // Bitwise OR combines the sign bits so any negative value fails the check.
260     if (index or size or bytes.size - index - size < 0) {
261       error("buffer length=${bytes.size}, index=$index, size=$size")
262     }
263     var offset = index
264     val limit = offset + size
265 
266     // The longest possible resulting String is the same as the number of input bytes, when it is
267     // all ASCII. For other cases, this over-allocates and we will truncate in the end.
268     val resultArr = CharArray(size)
269     var resultPos = 0
270 
271     // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
272     // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
273     while (offset < limit) {
274       val b = bytes[offset]
275       if (!isOneByte(b)) {
276         break
277       }
278       offset++
279       handleOneByte(b, resultArr, resultPos++)
280     }
281     while (offset < limit) {
282       val byte1 = bytes[offset++]
283       if (isOneByte(byte1)) {
284         handleOneByte(byte1, resultArr, resultPos++)
285         // It's common for there to be multiple ASCII characters in a run mixed in, so add an
286         // extra optimized loop to take care of these runs.
287         while (offset < limit) {
288           val b = bytes[offset]
289           if (!isOneByte(b)) {
290             break
291           }
292           offset++
293           handleOneByte(b, resultArr, resultPos++)
294         }
295       } else if (isTwoBytes(byte1)) {
296         if (offset >= limit) {
297           error("Invalid UTF-8")
298         }
299         handleTwoBytes(
300           byte1, /* byte2 */
301           bytes[offset++], resultArr, resultPos++
302         )
303       } else if (isThreeBytes(byte1)) {
304         if (offset >= limit - 1) {
305           error("Invalid UTF-8")
306         }
307         handleThreeBytes(
308           byte1, /* byte2 */
309           bytes[offset++], /* byte3 */
310           bytes[offset++],
311           resultArr,
312           resultPos++
313         )
314       } else {
315         if (offset >= limit - 2) {
316           error("Invalid UTF-8")
317         }
318         handleFourBytes(
319           byte1, /* byte2 */
320           bytes[offset++], /* byte3 */
321           bytes[offset++], /* byte4 */
322           bytes[offset++],
323           resultArr,
324           resultPos++
325         )
326         // 4-byte case requires two chars.
327         resultPos++
328       }
329     }
330     return resultArr.concatToString(0, resultPos)
331   }
332 
encodeUtf8Arraynull333   public fun encodeUtf8Array(input: CharSequence, out: ByteArray, offset: Int = 0, length: Int = out.size - offset): Int {
334     val utf16Length = input.length
335     var j = offset
336     var i = 0
337     val limit = offset + length
338     // Designed to take advantage of
339     // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
340 
341     if (utf16Length == 0)
342       return 0
343     var cc: Char = input[i]
344     while (i < utf16Length && i + j < limit && input[i].also { cc = it }.toInt() < 0x80) {
345       out[j + i] = cc.toByte()
346       i++
347     }
348     if (i == utf16Length) {
349       return j + utf16Length
350     }
351     j += i
352     var c: Char
353     while (i < utf16Length) {
354       c = input[i]
355       if (c.toInt() < 0x80 && j < limit) {
356         out[j++] = c.toByte()
357       } else if (c.toInt() < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes
358         out[j++] = (0xF shl 6 or (c.toInt() ushr 6)).toByte()
359         out[j++] = (0x80 or (0x3F and c.toInt())).toByte()
360       } else if ((c < Char.MIN_SURROGATE || Char.MAX_SURROGATE < c) && j <= limit - 3) {
361         // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
362         out[j++] = (0xF shl 5 or (c.toInt() ushr 12)).toByte()
363         out[j++] = (0x80 or (0x3F and (c.toInt() ushr 6))).toByte()
364         out[j++] = (0x80 or (0x3F and c.toInt())).toByte()
365       } else if (j <= limit - 4) {
366         // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
367         // four UTF-8 bytes
368         var low: Char = Char.MIN_VALUE
369         if (i + 1 == input.length ||
370           !isSurrogatePair(c, input[++i].also { low = it })
371         ) {
372           errorSurrogate(i - 1, utf16Length)
373         }
374         val codePoint: Int = toCodePoint(c, low)
375         out[j++] = (0xF shl 4 or (codePoint ushr 18)).toByte()
376         out[j++] = (0x80 or (0x3F and (codePoint ushr 12))).toByte()
377         out[j++] = (0x80 or (0x3F and (codePoint ushr 6))).toByte()
378         out[j++] = (0x80 or (0x3F and codePoint)).toByte()
379       } else {
380         // If we are surrogates and we're not a surrogate pair, always throw an
381         // UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
382         if (Char.MIN_SURROGATE <= c && c <= Char.MAX_SURROGATE &&
383           (i + 1 == input.length || !isSurrogatePair(c, input[i + 1]))
384         ) {
385           errorSurrogate(i, utf16Length)
386         }
387         error("Failed writing character ${c.toShort().toString(radix = 16)} at index $j")
388       }
389       i++
390     }
391     return j
392   }
393 
codePointAtnull394   public fun codePointAt(seq: CharSequence, position: Int): Int {
395     var index = position
396     val c1 = seq[index]
397     if (c1.isHighSurrogate() && ++index < seq.length) {
398       val c2 = seq[index]
399       if (c2.isLowSurrogate()) {
400         return toCodePoint(c1, c2)
401       }
402     }
403     return c1.toInt()
404   }
405 
isSurrogatePairnull406   private fun isSurrogatePair(high: Char, low: Char) = high.isHighSurrogate() and low.isLowSurrogate()
407 
408   private fun toCodePoint(high: Char, low: Char): Int = (high.toInt() shl 10) + low.toInt() +
409     (MIN_SUPPLEMENTARY_CODE_POINT - (Char.MIN_HIGH_SURROGATE.toInt() shl 10) - Char.MIN_LOW_SURROGATE.toInt())
410 
411   private fun errorSurrogate(i: Int, utf16Length: Int): Unit =
412     error("Unpaired surrogate at index $i of $utf16Length length")
413 
414   // The minimum value of Unicode supplementary code point, constant `U+10000`.
415   private const val MIN_SUPPLEMENTARY_CODE_POINT = 0x010000
416 }
417