• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
<lambda>null2  * Copyright (C) 2017 Square, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * Okio assumes most applications use UTF-8 exclusively, and offers optimized implementations of
19  * common operations on UTF-8 strings.
20  *
21  * <table border="1" cellspacing="0" cellpadding="3" summary="">
22  * <tr>
23  * <th></th>
24  * <th>[ByteString]</th>
25  * <th>[Buffer], [BufferedSink], [BufferedSource]</th>
26  * </tr>
27  * <tr>
28  * <td>Encode a string</td>
29  * <td>[ByteString.encodeUtf8]</td>
30  * <td>[BufferedSink.writeUtf8]</td>
31  * </tr>
32  * <tr>
33  * <td>Encode a code point</td>
34  * <td></td>
35  * <td>[BufferedSink.writeUtf8CodePoint]</td>
36  * </tr>
37  * <tr>
38  * <td>Decode a string</td>
39  * <td>[ByteString.utf8]</td>
40  * <td>[BufferedSource.readUtf8], [BufferedSource.readUtf8]</td>
41  * </tr>
42  * <tr>
43  * <td>Decode a code point</td>
44  * <td></td>
45  * <td>[BufferedSource.readUtf8CodePoint]</td>
46  * </tr>
47  * <tr>
48  * <td>Decode until the next `\r\n` or `\n`</td>
49  * <td></td>
50  * <td>[BufferedSource.readUtf8LineStrict],
51  * [BufferedSource.readUtf8LineStrict]</td>
52  * </tr>
53  * <tr>
54  * <td>Decode until the next `\r\n`, `\n`, or `EOF`</td>
55  * <td></td>
56  * <td>[BufferedSource.readUtf8Line]</td>
57  * </tr>
58  * <tr>
59  * <td>Measure the bytes in a UTF-8 string</td>
60  * <td colspan="2">[Utf8.size], [Utf8.size]</td>
61  * </tr>
62  * </table>
63  */
64 @file:JvmName("Utf8")
65 
66 package okio
67 
68 import kotlin.jvm.JvmName
69 import kotlin.jvm.JvmOverloads
70 
/**
 * Returns the number of bytes that the chars of this string in the range
 * `[beginIndex, endIndex)` occupy when encoded as UTF-8 using [BufferedSink.writeUtf8].
 *
 * An unpaired surrogate counts as a single byte, the size of the `?` that stands in for it. A high
 * surrogate whose low surrogate sits at or beyond [endIndex] is treated as unpaired.
 *
 * @param beginIndex index of the first char to measure, inclusive. Defaults to `0`.
 * @param endIndex index one past the last char to measure. Defaults to this string's length.
 * @throws IllegalArgumentException if [beginIndex] is negative, [endIndex] is smaller than
 *     [beginIndex], or [endIndex] is larger than this string's length.
 */
@JvmOverloads
@JvmName("size")
fun String.utf8Size(beginIndex: Int = 0, endIndex: Int = length): Long {
  require(beginIndex >= 0) { "beginIndex < 0: $beginIndex" }
  require(endIndex >= beginIndex) { "endIndex < beginIndex: $endIndex < $beginIndex" }
  require(endIndex <= length) { "endIndex > string.length: $endIndex > $length" }

  var byteCount = 0L
  var position = beginIndex
  while (position < endIndex) {
    val unit = this[position].toInt()

    when {
      // 7 bits of payload: 1 byte.
      unit < 0x80 -> {
        byteCount += 1
        position += 1
      }

      // 11 bits of payload: 2 bytes.
      unit < 0x800 -> {
        byteCount += 2
        position += 1
      }

      // Any other non-surrogate UTF-16 unit: 3 bytes.
      unit !in 0xd800..0xdfff -> {
        byteCount += 3
        position += 1
      }

      else -> {
        // A surrogate. It only forms a 4-byte character when it is a high surrogate immediately
        // followed, inside the measured range, by a low surrogate.
        val following = if (position + 1 < endIndex) this[position + 1].toInt() else 0
        val isValidPair = unit <= 0xdbff && following in 0xdc00..0xdfff
        if (isValidPair) {
          byteCount += 4
          position += 2
        } else {
          // Malformed: counted as the single '?' byte.
          byteCount += 1
          position += 1
        }
      }
    }
  }

  return byteCount
}
115 
/** ASCII `?` (0x3f): the byte emitted in place of an unpaired UTF-16 surrogate when encoding. */
internal const val REPLACEMENT_BYTE: Byte = 0x3f

/** U+FFFD: the char emitted in place of malformed UTF-8 input when decoding to UTF-16. */
internal const val REPLACEMENT_CHARACTER: Char = '\uFFFD'

/** [REPLACEMENT_CHARACTER] as a code point, for decoders that emit `Int` code points. */
internal const val REPLACEMENT_CODE_POINT: Int = REPLACEMENT_CHARACTER.toInt()
119 
/**
 * Returns true if [codePoint] lies in one of the two ranges `0x00..0x1F` or `0x7F..0x9F`.
 */
@Suppress("NOTHING_TO_INLINE") // Syntactic sugar.
internal inline fun isIsoControl(codePoint: Int): Boolean =
  when (codePoint) {
    in 0x00..0x1F, in 0x7F..0x9F -> true
    else -> false
  }
123 
/**
 * Returns true if [byte] has the bit pattern `0b10xxxxxx`, i.e. its two highest bits are `10`.
 *
 * The byte is widened to an `Int` before masking, and the mask is parenthesized so the result
 * does not depend on the relative precedence of infix `and` and `==`.
 */
@Suppress("NOTHING_TO_INLINE") // Syntactic sugar.
internal inline fun isUtf8Continuation(byte: Byte): Boolean {
  // Sign extension only sets bits above bit 7, all of which the 0xc0 mask discards.
  return (byte.toInt() and 0xc0) == 0x80
}
129 
130 // TODO combine with Buffer.writeUtf8?
131 // TODO combine with Buffer.writeUtf8CodePoint?
/**
 * Transcodes the UTF-16 chars of this string in the range `[beginIndex, endIndex)` to UTF-8,
 * handing every encoded byte to [yield] in order.
 *
 * A surrogate that is not a high surrogate directly followed, before [endIndex], by a low
 * surrogate is invalid UTF-16; one [REPLACEMENT_BYTE] is emitted for it. The indices are not
 * validated here.
 */
internal inline fun String.processUtf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Byte) -> Unit
) {
  var cursor = beginIndex
  while (cursor < endIndex) {
    val unit = this[cursor].toInt()

    if (unit < 0x80) {
      // 1 byte: 0xxxxxxx
      yield(unit.toByte())
      cursor++

      // ASCII usually comes in runs, so drain the run without re-entering the outer dispatch.
      while (cursor < endIndex && this[cursor].toInt() < 0x80) {
        yield(this[cursor].toInt().toByte())
        cursor++
      }
    } else if (unit < 0x800) {
      // 2 bytes: 110xxxxx 10xxxxxx
      yield((0xc0 or (unit shr 6)).toByte())
      yield((0x80 or (unit and 0x3f)).toByte())
      cursor++
    } else if (unit < 0xd800 || unit > 0xdfff) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      yield((0xe0 or (unit shr 12)).toByte())
      yield((0x80 or ((unit shr 6) and 0x3f)).toByte())
      yield((0x80 or (unit and 0x3f)).toByte())
      cursor++
    } else {
      // A surrogate. The evaluation order matters: the next char is only read once we know this
      // is a high surrogate and that the next index is still inside the range.
      val isValidPair = unit <= 0xdbff &&
        cursor + 1 < endIndex &&
        this[cursor + 1] in '\udc00'..'\udfff'

      if (isValidPair) {
        // High surrogate: 110110xxxxxxxxxx, low surrogate: 110111yyyyyyyyyy.
        // Code point:     0x010000 + xxxxxxxxxxyyyyyyyyyy (21 bits).
        val low = this[cursor + 1].toInt()
        val codePoint = 0x010000 + ((unit - 0xd800) shl 10) + (low - 0xdc00)

        // 4 bytes: 11110xxx 10xxxxxx 10xxyyyy 10yyyyyy
        yield((0xf0 or (codePoint shr 18)).toByte())
        yield((0x80 or ((codePoint shr 12) and 0x3f)).toByte())
        yield((0x80 or ((codePoint shr 6) and 0x3f)).toByte())
        yield((0x80 or (codePoint and 0x3f)).toByte())
        cursor += 2
      } else {
        // Invalid UTF-16: substitute a single replacement byte and move on by one char.
        yield(REPLACEMENT_BYTE)
        cursor++
      }
    }
  }
}
205 
206 // TODO combine with Buffer.readUtf8CodePoint?
/**
 * Decodes the UTF-8 bytes of this array in the range `[beginIndex, endIndex)`, handing each
 * decoded code point to [yield] in order.
 *
 * Multi-byte sequences are delegated to [process2Utf8Bytes], [process3Utf8Bytes] and
 * [process4Utf8Bytes], whose return value is the number of bytes to advance by. A leading byte
 * that matches no known pattern produces one [REPLACEMENT_CODE_POINT] and is skipped.
 */
internal inline fun ByteArray.processUtf8CodePoints(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
) {
  var cursor = beginIndex
  while (cursor < endIndex) {
    val lead = this[cursor]

    if (lead >= 0) {
      // 0b0xxxxxxx: ASCII.
      yield(lead.toInt())
      cursor++

      // ASCII usually comes in runs, so drain the run here.
      while (cursor < endIndex && this[cursor] >= 0) {
        yield(this[cursor].toInt())
        cursor++
      }
    } else if ((lead shr 5) == -2) {
      // 0b110xxxxx: 2-byte sequence.
      cursor += process2Utf8Bytes(cursor, endIndex) { codePoint -> yield(codePoint) }
    } else if ((lead shr 4) == -2) {
      // 0b1110xxxx: 3-byte sequence.
      cursor += process3Utf8Bytes(cursor, endIndex) { codePoint -> yield(codePoint) }
    } else if ((lead shr 3) == -2) {
      // 0b11110xxx: 4-byte sequence.
      cursor += process4Utf8Bytes(cursor, endIndex) { codePoint -> yield(codePoint) }
    } else {
      // 0b10xxxxxx: unexpected continuation byte.
      // 0b11111xxx: unknown encoding.
      yield(REPLACEMENT_CODE_POINT)
      cursor++
    }
  }
}
247 
// Added to `codePoint ushr 10` to form the high UTF-16 surrogate. This is the 0xd800 surrogate
// prefix with the 0x010000 supplementary-plane offset (shifted down by 10) already subtracted.
internal const val HIGH_SURROGATE_HEADER: Int = 0xd800 - (0x010000 ushr 10)

// Added to the low 10 bits of the code point to form the low UTF-16 surrogate.
// NOTE(review): "LOG" is presumably a typo for "LOW"; the name is kept because callers use it.
internal const val LOG_SURROGATE_HEADER: Int = 0xdc00
252 
253 // TODO combine with Buffer.readUtf8?
/**
 * Decodes the UTF-8 bytes of this array in the range `[beginIndex, endIndex)` into UTF-16,
 * handing each resulting char to [yield] in order.
 *
 * Code points decoded from a 4-byte sequence are emitted as a high/low surrogate pair. Wherever a
 * decoder reports [REPLACEMENT_CODE_POINT] for a 4-byte sequence, or the leading byte matches no
 * known pattern, a single [REPLACEMENT_CHARACTER] is emitted instead.
 */
internal inline fun ByteArray.processUtf16Chars(
  beginIndex: Int,
  endIndex: Int,
  yield: (Char) -> Unit
) {
  var cursor = beginIndex
  while (cursor < endIndex) {
    val lead = this[cursor]

    if (lead >= 0) {
      // 0b0xxxxxxx: ASCII.
      yield(lead.toInt().toChar())
      cursor++

      // ASCII usually comes in runs; draining the run here is much cheaper than going through
      // the outer dispatch for every byte.
      while (cursor < endIndex && this[cursor] >= 0) {
        yield(this[cursor].toInt().toChar())
        cursor++
      }
    } else if ((lead shr 5) == -2) {
      // 0b110xxxxx: 2-byte sequence, always a single UTF-16 char.
      cursor += process2Utf8Bytes(cursor, endIndex) { codePoint -> yield(codePoint.toChar()) }
    } else if ((lead shr 4) == -2) {
      // 0b1110xxxx: 3-byte sequence, always a single UTF-16 char.
      cursor += process3Utf8Bytes(cursor, endIndex) { codePoint -> yield(codePoint.toChar()) }
    } else if ((lead shr 3) == -2) {
      // 0b11110xxx: 4-byte sequence.
      cursor += process4Utf8Bytes(cursor, endIndex) { codePoint ->
        if (codePoint == REPLACEMENT_CODE_POINT) {
          yield(REPLACEMENT_CHARACTER)
        } else {
          // Code point:     0x010000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
          // High surrogate: 110110xxxxxxxxxx (10 bits)
          // Low surrogate:  110111yyyyyyyyyy (10 bits)
          yield((HIGH_SURROGATE_HEADER + (codePoint ushr 10)).toChar())
          yield((LOG_SURROGATE_HEADER + (codePoint and 0x03ff)).toChar())
        }
      }
    } else {
      // 0b10xxxxxx: unexpected continuation byte.
      // 0b11111xxx: unknown encoding.
      yield(REPLACEMENT_CHARACTER)
      cursor++
    }
  }
}
307 
308 // ===== UTF-8 Encoding and Decoding ===== //
309 /*
The following 3 methods take advantage of using XOR on 2's complement stored
numbers to quickly and efficiently combine the important data of UTF-8 encoded
bytes. This will be best explained using an example, so let's take the following
313 encoded character '∇' = \u2207.
314 
315 Using the Unicode code point for this character, 0x2207, we will split the
316 binary representation into 3 sections as follows:
317 
318     0x2207 = 0b0010 0010 0000 0111
319                xxxx yyyy yyzz zzzz
320 
321 Now take each section of bits and add the appropriate header:
322 
323     utf8(0x2207) = 0b1110 xxxx 0b10yy yyyy 0b10zz zzzz
324                  = 0b1110 0010 0b1000 1000 0b1000 0111
325                  = 0xe2        0x88        0x87
326 
327 We have now just encoded this as a 3 byte UTF-8 character. More information
328 about different sizes of characters can be found here:
329     https://en.wikipedia.org/wiki/UTF-8
330 
331 Encoding was pretty easy, but decoding is a bit more complicated. We need to
332 first determine the number of bytes used to represent the character, strip all
333 the headers, and then combine all the bits into a single integer. Let's use the
334 character we just encoded and work backwards, taking advantage of 2's complement
335 integer representation and the XOR function.
336 
337 Let's look at the decimal representation of these bytes:
338 
339     0xe2, 0x88, 0x87 = -30, -120, -121
340 
341 The first interesting thing to notice is that UTF-8 headers all start with 1 -
342 except for ASCII which is encoded as a single byte - which means all UTF-8 bytes
343 will be negative. So converting these to integers results in a lot of 1's added
because they are stored as 2's complement:
345 
346     0xe2 =  -30 = 0xffff ffe2
347     0x88 = -120 = 0xffff ff88
348     0x87 = -121 = 0xffff ff87
349 
350 Now let's XOR these with their corresponding UTF-8 byte headers to see what
351 happens:
352 
353     0xffff ffe2 xor 0xffff ffe0 = 0x0000 0002
354     0xffff ff88 xor 0xffff ff80 = 0x0000 0008
355     0xffff ff87 xor 0xffff ff80 = 0x0000 0007
356 
357 ***This is why we must first convert the byte header mask to a byte and then
358 back to an integer, so it is properly converted to a 2's complement negative
359 number which can be applied to each byte.***
360 
361 Now let's look at the binary representation to see how we can combine these to
362 create the Unicode code point:
363 
364     0b0000 0010    0b0000 1000    0b0000 0111
365     0b1110 xxxx    0b10yy yyyy    0b10zz zzzz
366 
367 Combining each section will require some bit shifting, but then they can just
368 be OR'd together. They can also be XOR'd together which makes use of a single,
369 COMMUTATIVE, operator through the entire calculation.
370 
371       << 12 = 00000010
372       <<  6 =       00001000
373       <<  0 =             00000111
374         XOR = 00000010001000000111
375 
376  code point = 0b0010 0010 0000 0111
377             = 0x2207
378 
379 And there we have it! The decoded UTF-8 character '∇'! And because the XOR
380 operator is commutative, we can re-arrange all this XOR and shifting to create
381 a single mask that can be applied to 3-byte UTF-8 characters after their bytes
382 have been shifted and XOR'd together.
383  */
384 
// XOR mask that strips the byte headers from a 2-byte UTF-8 sequence after its sign-extended
// bytes have been shifted into place and XOR'd together.
// MASK_2BYTES =
//    (0xc0.toByte().toInt() shl 6) xor
//    (0x80.toByte().toInt())
internal const val MASK_2BYTES: Int = 0x0f80
390 
/**
 * Decodes the 2-byte UTF-8 sequence whose leading byte is at [beginIndex], handing exactly one
 * code point to [yield].
 *
 * [REPLACEMENT_CODE_POINT] is yielded when fewer than 2 bytes remain before [endIndex], when the
 * second byte is not a continuation byte, or when the decoded value is below `0x80` (overlong).
 *
 * @return the number of bytes consumed: 1 for a truncated or malformed sequence, otherwise 2.
 */
internal inline fun ByteArray.process2Utf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
): Int {
  // Underflow: the leading byte is the last byte in range.
  if (endIndex <= beginIndex + 1) {
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }

  val lead = this[beginIndex]
  val second = this[beginIndex + 1]
  if (!isUtf8Continuation(second)) {
    // Malformed: consume only the leading byte so the next byte is decoded on its own.
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }

  // The headers of the sign-extended bytes are cancelled by MASK_2BYTES.
  val codePoint = MASK_2BYTES xor second.toInt() xor (lead.toInt() shl 6)

  // Values below 0x80 fit in one byte, so a 2-byte form of them is overlong and rejected.
  if (codePoint < 0x80) {
    yield(REPLACEMENT_CODE_POINT)
  } else {
    yield(codePoint)
  }
  return 2
}
426 
// XOR mask that strips the byte headers from a 3-byte UTF-8 sequence after its sign-extended
// bytes have been shifted into place and XOR'd together.
// MASK_3BYTES =
//    (0xe0.toByte().toInt() shl 12) xor
//    (0x80.toByte().toInt() shl 6) xor
//    (0x80.toByte().toInt())
internal const val MASK_3BYTES: Int = -0x01e080
433 
/**
 * Decodes the 3-byte UTF-8 sequence whose leading byte is at [beginIndex], handing exactly one
 * code point to [yield].
 *
 * [REPLACEMENT_CODE_POINT] is yielded when fewer than 3 bytes remain before [endIndex], when the
 * second or third byte is not a continuation byte, when the decoded value is below `0x800`
 * (overlong), or when it falls in the surrogate range `0xd800..0xdfff`.
 *
 * @return the number of bytes consumed: the length of the valid prefix (1 or 2) for a truncated
 *     or malformed sequence, otherwise 3.
 */
internal inline fun ByteArray.process3Utf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
): Int {
  if (endIndex <= beginIndex + 2) {
    // Underflow: at most 2 bytes remain. Emit one replacement for the truncated sequence.
    yield(REPLACEMENT_CODE_POINT)
    val onlyLeadUsable =
      endIndex <= beginIndex + 1 || !isUtf8Continuation(this[beginIndex + 1])
    // Consume the second byte too only when it exists and is a continuation byte.
    return if (onlyLeadUsable) 1 else 2
  }

  val lead = this[beginIndex]
  val second = this[beginIndex + 1]
  if (!isUtf8Continuation(second)) {
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }
  val third = this[beginIndex + 2]
  if (!isUtf8Continuation(third)) {
    yield(REPLACEMENT_CODE_POINT)
    return 2
  }

  // The headers of the sign-extended bytes are cancelled by MASK_3BYTES.
  val codePoint = MASK_3BYTES xor
    third.toInt() xor
    (second.toInt() shl 6) xor
    (lead.toInt() shl 12)

  // Reject overlong encodings (below 0x800) and encoded surrogates (0xd800..0xdfff).
  val isAcceptable = codePoint >= 0x800 && codePoint !in 0xd800..0xdfff
  if (isAcceptable) {
    yield(codePoint)
  } else {
    yield(REPLACEMENT_CODE_POINT)
  }
  return 3
}
485 
// XOR mask that strips the byte headers from a 4-byte UTF-8 sequence after its sign-extended
// bytes have been shifted into place and XOR'd together.
// MASK_4BYTES =
//    (0xf0.toByte().toInt() shl 18) xor
//    (0x80.toByte().toInt() shl 12) xor
//    (0x80.toByte().toInt() shl 6) xor
//    (0x80.toByte().toInt())
internal const val MASK_4BYTES: Int = 0x381f80
493 
/**
 * Decodes the 4-byte UTF-8 sequence whose leading byte is at [beginIndex], handing exactly one
 * code point to [yield].
 *
 * [REPLACEMENT_CODE_POINT] is yielded when fewer than 4 bytes remain before [endIndex], when any
 * of the three trailing bytes is not a continuation byte, or when the decoded value lies outside
 * `0x10000..0x10ffff` (overlong, an encoded surrogate, or beyond the Unicode maximum).
 *
 * @return the number of bytes consumed: the length of the valid prefix (1 to 3) for a truncated
 *     or malformed sequence, otherwise 4.
 */
internal inline fun ByteArray.process4Utf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
): Int {
  if (endIndex <= beginIndex + 3) {
    // Underflow: at most 3 bytes remain. Emit one replacement for the truncated sequence, then
    // consume the leading byte plus however many continuation bytes directly follow it.
    yield(REPLACEMENT_CODE_POINT)
    return when {
      endIndex <= beginIndex + 1 || !isUtf8Continuation(this[beginIndex + 1]) -> 1
      endIndex <= beginIndex + 2 || !isUtf8Continuation(this[beginIndex + 2]) -> 2
      else -> 3
    }
  }

  val lead = this[beginIndex]
  val second = this[beginIndex + 1]
  if (!isUtf8Continuation(second)) {
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }
  val third = this[beginIndex + 2]
  if (!isUtf8Continuation(third)) {
    yield(REPLACEMENT_CODE_POINT)
    return 2
  }
  val fourth = this[beginIndex + 3]
  if (!isUtf8Continuation(fourth)) {
    yield(REPLACEMENT_CODE_POINT)
    return 3
  }

  // The headers of the sign-extended bytes are cancelled by MASK_4BYTES.
  val codePoint = MASK_4BYTES xor
    fourth.toInt() xor
    (third.toInt() shl 6) xor
    (second.toInt() shl 12) xor
    (lead.toInt() shl 18)

  // Only 0x10000..0x10ffff is valid here. Anything above exceeds the Unicode maximum; anything
  // below is overlong, which also covers the surrogate range 0xd800..0xdfff.
  if (codePoint in 0x10000..0x10ffff) {
    yield(codePoint)
  } else {
    yield(REPLACEMENT_CODE_POINT)
  }
  return 4
}
558