1 /*
<lambda>null2 * Copyright (C) 2017 Square, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /**
18 * Okio assumes most applications use UTF-8 exclusively, and offers optimized implementations of
19 * common operations on UTF-8 strings.
20 *
21 * <table border="1" cellspacing="0" cellpadding="3" summary="">
22 * <tr>
23 * <th></th>
24 * <th>[ByteString]</th>
25 * <th>[Buffer], [BufferedSink], [BufferedSource]</th>
26 * </tr>
27 * <tr>
28 * <td>Encode a string</td>
29 * <td>[ByteString.encodeUtf8]</td>
30 * <td>[BufferedSink.writeUtf8]</td>
31 * </tr>
32 * <tr>
33 * <td>Encode a code point</td>
34 * <td></td>
35 * <td>[BufferedSink.writeUtf8CodePoint]</td>
36 * </tr>
37 * <tr>
38 * <td>Decode a string</td>
39 * <td>[ByteString.utf8]</td>
40 * <td>[BufferedSource.readUtf8], [BufferedSource.readUtf8]</td>
41 * </tr>
42 * <tr>
43 * <td>Decode a code point</td>
44 * <td></td>
45 * <td>[BufferedSource.readUtf8CodePoint]</td>
46 * </tr>
47 * <tr>
48 * <td>Decode until the next `\r\n` or `\n`</td>
49 * <td></td>
50 * <td>[BufferedSource.readUtf8LineStrict],
51 * [BufferedSource.readUtf8LineStrict]</td>
52 * </tr>
53 * <tr>
54 * <td>Decode until the next `\r\n`, `\n`, or `EOF`</td>
55 * <td></td>
56 * <td>[BufferedSource.readUtf8Line]</td>
57 * </tr>
58 * <tr>
59 * <td>Measure the bytes in a UTF-8 string</td>
60 * <td colspan="2">[Utf8.size], [Utf8.size]</td>
61 * </tr>
62 * </table>
63 */
64 @file:JvmName("Utf8")
65
66 package okio
67
68 import kotlin.jvm.JvmName
69 import kotlin.jvm.JvmOverloads
70
/**
 * Computes how many bytes the characters of this string in `[beginIndex, endIndex)` occupy when
 * encoded as UTF-8 the way [BufferedSink.writeUtf8] encodes them.
 *
 * A surrogate that is not part of a well-formed high/low pair inside the range is counted as a
 * single byte, because it is encoded as `?`.
 *
 * @param beginIndex index of the first character to measure, inclusive.
 * @param endIndex index just past the last character to measure.
 * @throws IllegalArgumentException if `beginIndex` is negative, `endIndex` is less than
 *     `beginIndex`, or `endIndex` exceeds the length of the string.
 */
@JvmOverloads
@JvmName("size")
fun String.utf8Size(beginIndex: Int = 0, endIndex: Int = length): Long {
    require(beginIndex >= 0) { "beginIndex < 0: $beginIndex" }
    require(endIndex >= beginIndex) { "endIndex < beginIndex: $endIndex < $beginIndex" }
    require(endIndex <= length) { "endIndex > string.length: $endIndex > $length" }

    var byteCount = 0L
    var position = beginIndex
    while (position < endIndex) {
        val unit = this[position].toInt()
        when {
            // Up to 7 significant bits: one byte.
            unit < 0x80 -> {
                byteCount += 1
                position += 1
            }
            // Up to 11 significant bits: two bytes.
            unit < 0x800 -> {
                byteCount += 2
                position += 1
            }
            // Any other non-surrogate UTF-16 unit: three bytes.
            unit < 0xd800 || unit > 0xdfff -> {
                byteCount += 3
                position += 1
            }
            else -> {
                // `unit` is a surrogate. A unit beyond the end of the range is treated as 0, which
                // can never be a low surrogate.
                val next = if (position + 1 < endIndex) this[position + 1].toInt() else 0
                val wellFormedPair = unit <= 0xdbff && next in 0xdc00..0xdfff
                if (wellFormedPair) {
                    // High + low surrogate: one 21-bit code point in four bytes.
                    byteCount += 4
                    position += 2
                } else {
                    // Malformed surrogate, which is encoded as '?'.
                    byteCount += 1
                    position += 1
                }
            }
        }
    }

    return byteCount
}
115
/** The Unicode replacement character, U+FFFD. */
internal const val REPLACEMENT_CHARACTER: Char = '\ufffd'

/** [REPLACEMENT_CHARACTER] expressed as an integer code point. */
internal const val REPLACEMENT_CODE_POINT: Int = REPLACEMENT_CHARACTER.toInt()

/** The ASCII byte for `?`. */
internal const val REPLACEMENT_BYTE: Byte = '?'.toInt().toByte()
119
/**
 * Returns true if [codePoint] lies in one of the two ranges `0x00..0x1F` or `0x7F..0x9F`.
 */
@Suppress("NOTHING_TO_INLINE") // Syntactic sugar.
internal inline fun isIsoControl(codePoint: Int): Boolean = when (codePoint) {
    in 0x00..0x1F, in 0x7F..0x9F -> true
    else -> false
}
123
/**
 * Returns true if masking [byte] with `0xc0` leaves `0x80`, i.e. the byte has the form
 * `0b10xxxxxx`.
 *
 * NOTE(review): `and` is an infix helper on `Byte` declared outside this view; presumably a
 * bitwise AND that widens to `Int`.
 */
@Suppress("NOTHING_TO_INLINE") // Syntactic sugar.
internal inline fun isUtf8Continuation(byte: Byte): Boolean = (byte and 0xc0) == 0x80
129
// TODO combine with Buffer.writeUtf8?
// TODO combine with Buffer.writeUtf8CodePoint?
/**
 * Transcodes the UTF-16 characters of this string in `[beginIndex, endIndex)` to UTF-8 and hands
 * every encoded byte to [yield], in output order.
 *
 * A surrogate that is not the high half of a well-formed pair inside the range is replaced by a
 * single [REPLACEMENT_BYTE]. The indices are not validated here.
 */
internal inline fun String.processUtf8Bytes(
    beginIndex: Int,
    endIndex: Int,
    yield: (Byte) -> Unit
) {
    var cursor = beginIndex
    while (cursor < endIndex) {
        val unit = this[cursor].toInt()

        if (unit < 0x80) {
            // 7-bit character, one byte: 0xxxxxxx
            yield(unit.toByte())
            cursor++

            // Expect further ASCII to follow; drain it without re-entering the dispatch.
            while (cursor < endIndex && this[cursor] < '\u0080') {
                yield(this[cursor++].toInt().toByte())
            }
        } else if (unit < 0x800) {
            // 11-bit character, two bytes: 110xxxxx 10xxxxxx
            yield(((unit shr 6) or 0xc0).toByte())
            yield(((unit and 0x3f) or 0x80).toByte())
            cursor++
        } else if (unit < 0xd800 || unit > 0xdfff) {
            // 16-bit non-surrogate character, three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            yield(((unit shr 12) or 0xe0).toByte())
            yield((((unit shr 6) and 0x3f) or 0x80).toByte())
            yield(((unit and 0x3f) or 0x80).toByte())
            cursor++
        } else {
            // `unit` is a surrogate. It can only be encoded when it is a high surrogate that is
            // immediately followed, inside the range, by a low surrogate.
            val startsPair = unit <= 0xdbff &&
                cursor + 1 < endIndex &&
                this[cursor + 1] in '\udc00'..'\udfff'

            if (!startsPair) {
                // Invalid UTF-16: emit the replacement byte and move past this one unit.
                yield(REPLACEMENT_BYTE)
                cursor++
            } else {
                // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
                // UTF-16 low surrogate: 110111yyyyyyyyyy (10 bits)
                // Unicode code point: 0x010000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
                val low = this[cursor + 1].toInt()
                val codePoint = 0x010000 + ((unit - 0xd800) shl 10) + (low - 0xdc00)

                // 21-bit character, four bytes: 11110xxx 10xxxxxx 10xxyyyy 10yyyyyy
                yield(((codePoint shr 18) or 0xf0).toByte())
                yield((((codePoint shr 12) and 0x3f) or 0x80).toByte())
                yield((((codePoint shr 6) and 0x3f) or 0x80).toByte())
                yield(((codePoint and 0x3f) or 0x80).toByte())
                cursor += 2
            }
        }
    }
}
205
// TODO combine with Buffer.readUtf8CodePoint?
/**
 * Decodes the UTF-8 bytes of this array in `[beginIndex, endIndex)` and passes each decoded code
 * point to [yield], in order.
 *
 * Bytes that cannot start a sequence produce [REPLACEMENT_CODE_POINT] and are skipped one at a
 * time; multi-byte sequences are delegated to [process2Utf8Bytes], [process3Utf8Bytes] and
 * [process4Utf8Bytes], whose return value is the number of bytes to advance by.
 */
internal inline fun ByteArray.processUtf8CodePoints(
    beginIndex: Int,
    endIndex: Int,
    yield: (Int) -> Unit
) {
    var offset = beginIndex
    while (offset < endIndex) {
        val lead = this[offset]

        if (lead >= 0) {
            // 0b0xxxxxxx
            yield(lead.toInt())
            offset++

            // Expect further ASCII to follow; drain it without re-entering the dispatch.
            while (offset < endIndex && this[offset] >= 0) {
                yield(this[offset++].toInt())
            }
            continue
        }

        offset += when {
            // 0b110xxxxx
            lead shr 5 == -2 -> process2Utf8Bytes(offset, endIndex) { yield(it) }
            // 0b1110xxxx
            lead shr 4 == -2 -> process3Utf8Bytes(offset, endIndex) { yield(it) }
            // 0b11110xxx
            lead shr 3 == -2 -> process4Utf8Bytes(offset, endIndex) { yield(it) }
            else -> {
                // 0b10xxxxxx - Unexpected continuation
                // 0b11111xxx - Unknown encoding
                yield(REPLACEMENT_CODE_POINT)
                1
            }
        }
    }
}
247
/**
 * Value added to the shifted upper bits of a supplementary code point (`codePoint ushr 10`) to
 * form the high UTF-16 surrogate. It is `0xd800` less the shifted `0x010000` base, so the base is
 * removed and the surrogate header applied in one addition.
 */
internal const val HIGH_SURROGATE_HEADER: Int = 0xd800 - (0x010000 ushr 10)

/**
 * Value added to the masked low 10 bits of a supplementary code point to form the low UTF-16
 * surrogate.
 *
 * NOTE(review): "LOG" reads like a typo for "LOW"; the name is kept because other code refers to
 * it.
 */
internal const val LOG_SURROGATE_HEADER: Int = 0xdc00
252
// TODO combine with Buffer.readUtf8?
/**
 * Decodes the UTF-8 bytes of this array in `[beginIndex, endIndex)` and passes the resulting
 * UTF-16 chars to [yield], in order.
 *
 * A valid 4-byte sequence is delivered as two chars (high surrogate, then low surrogate).
 * Malformed input is delivered as [REPLACEMENT_CHARACTER]. Multi-byte sequences are delegated to
 * [process2Utf8Bytes], [process3Utf8Bytes] and [process4Utf8Bytes], whose return value is the
 * number of bytes to advance by.
 */
internal inline fun ByteArray.processUtf16Chars(
    beginIndex: Int,
    endIndex: Int,
    yield: (Char) -> Unit
) {
    var offset = beginIndex
    while (offset < endIndex) {
        val lead = this[offset]

        if (lead >= 0) {
            // 0b0xxxxxxx
            yield(lead.toInt().toChar())
            offset++

            // Expect further ASCII to follow. The original author measured this inner loop at
            // almost double the performance of the outer loop.
            while (offset < endIndex && this[offset] >= 0) {
                yield(this[offset++].toInt().toChar())
            }
            continue
        }

        offset += when {
            // 0b110xxxxx
            lead shr 5 == -2 -> process2Utf8Bytes(offset, endIndex) { yield(it.toChar()) }
            // 0b1110xxxx
            lead shr 4 == -2 -> process3Utf8Bytes(offset, endIndex) { yield(it.toChar()) }
            // 0b11110xxx
            lead shr 3 == -2 -> process4Utf8Bytes(offset, endIndex) { codePoint ->
                if (codePoint == REPLACEMENT_CODE_POINT) {
                    yield(REPLACEMENT_CHARACTER)
                } else {
                    // Unicode code point: 0x010000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
                    // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
                    // UTF-16 low surrogate: 110111yyyyyyyyyy (10 bits)
                    yield(((codePoint ushr 10) + HIGH_SURROGATE_HEADER).toChar())
                    yield(((codePoint and 0x03ff) + LOG_SURROGATE_HEADER).toChar())
                }
            }
            else -> {
                // 0b10xxxxxx - Unexpected continuation
                // 0b11111xxx - Unknown encoding
                yield(REPLACEMENT_CHARACTER)
                1
            }
        }
    }
}
307
308 // ===== UTF-8 Encoding and Decoding ===== //
309 /*
The following 3 methods take advantage of using XOR on numbers stored in 2's
complement to quickly and efficiently combine the important data of UTF-8 encoded
bytes. This is best explained using an example, so let's take the following
encoded character '∇' = \u2207.
314
315 Using the Unicode code point for this character, 0x2207, we will split the
316 binary representation into 3 sections as follows:
317
318 0x2207 = 0b0010 0010 0000 0111
319 xxxx yyyy yyzz zzzz
320
321 Now take each section of bits and add the appropriate header:
322
323 utf8(0x2207) = 0b1110 xxxx 0b10yy yyyy 0b10zz zzzz
324 = 0b1110 0010 0b1000 1000 0b1000 0111
325 = 0xe2 0x88 0x87
326
327 We have now just encoded this as a 3 byte UTF-8 character. More information
328 about different sizes of characters can be found here:
329 https://en.wikipedia.org/wiki/UTF-8
330
331 Encoding was pretty easy, but decoding is a bit more complicated. We need to
332 first determine the number of bytes used to represent the character, strip all
333 the headers, and then combine all the bits into a single integer. Let's use the
334 character we just encoded and work backwards, taking advantage of 2's complement
335 integer representation and the XOR function.
336
337 Let's look at the decimal representation of these bytes:
338
339 0xe2, 0x88, 0x87 = -30, -120, -121
340
The first interesting thing to notice is that UTF-8 headers all start with 1 -
except for ASCII which is encoded as a single byte - which means all non-ASCII
UTF-8 bytes are negative when read as signed bytes. Converting these to integers
therefore adds a lot of leading 1's, because they are stored as 2's complement:
345
346 0xe2 = -30 = 0xffff ffe2
347 0x88 = -120 = 0xffff ff88
348 0x87 = -121 = 0xffff ff87
349
350 Now let's XOR these with their corresponding UTF-8 byte headers to see what
351 happens:
352
353 0xffff ffe2 xor 0xffff ffe0 = 0x0000 0002
354 0xffff ff88 xor 0xffff ff80 = 0x0000 0008
355 0xffff ff87 xor 0xffff ff80 = 0x0000 0007
356
357 ***This is why we must first convert the byte header mask to a byte and then
358 back to an integer, so it is properly converted to a 2's complement negative
359 number which can be applied to each byte.***
360
361 Now let's look at the binary representation to see how we can combine these to
362 create the Unicode code point:
363
364 0b0000 0010 0b0000 1000 0b0000 0111
365 0b1110 xxxx 0b10yy yyyy 0b10zz zzzz
366
367 Combining each section will require some bit shifting, but then they can just
368 be OR'd together. They can also be XOR'd together which makes use of a single,
369 COMMUTATIVE, operator through the entire calculation.
370
    << 12 = 00000010
    <<  6 =       00001000
    <<  0 =             00000111
      XOR = 00000010001000000111
375
376 code point = 0b0010 0010 0000 0111
377 = 0x2207
378
379 And there we have it! The decoded UTF-8 character '∇'! And because the XOR
380 operator is commutative, we can re-arrange all this XOR and shifting to create
381 a single mask that can be applied to 3-byte UTF-8 characters after their bytes
382 have been shifted and XOR'd together.
383 */
384
/**
 * Mask that strips the byte headers from a 2-byte UTF-8 sequence once its sign-extended bytes
 * have been shifted into place and XOR'd together.
 *
 * Derivation:
 *
 *     (0xc0.toByte() shl 6) xor
 *     (0x80.toByte().toInt())
 */
internal const val MASK_2BYTES: Int = 0x0f80
390
/**
 * Decodes the 2-byte UTF-8 sequence that starts at [beginIndex] and passes exactly one code point
 * to [yield]: the decoded value, or [REPLACEMENT_CODE_POINT] when the sequence is truncated by
 * [endIndex], its second byte is not a continuation byte, or it is an overlong encoding.
 *
 * @return the number of bytes the caller should advance by: 1 when the sequence is truncated or
 *     the second byte is not a continuation, otherwise 2.
 */
internal inline fun ByteArray.process2Utf8Bytes(
    beginIndex: Int,
    endIndex: Int,
    yield: (Int) -> Unit
): Int {
    // Only the lead byte is inside the range - underflow.
    if (beginIndex + 1 >= endIndex) {
        yield(REPLACEMENT_CODE_POINT)
        return 1
    }

    val lead = this[beginIndex]
    val second = this[beginIndex + 1]
    if (!isUtf8Continuation(second)) {
        yield(REPLACEMENT_CODE_POINT)
        return 1
    }

    // XOR the sign-extended, shifted bytes together; the mask cancels both headers.
    val decoded = MASK_2BYTES xor second.toInt() xor (lead.toInt() shl 6)

    // Values below 0x80 fit in one byte, so encoding them in two is overlong and rejected.
    yield(if (decoded < 0x80) REPLACEMENT_CODE_POINT else decoded)
    return 2
}
426
/**
 * Mask that strips the byte headers from a 3-byte UTF-8 sequence once its sign-extended bytes
 * have been shifted into place and XOR'd together.
 *
 * Derivation:
 *
 *     (0xe0.toByte() shl 12) xor
 *     (0x80.toByte() shl 6) xor
 *     (0x80.toByte().toInt())
 */
internal const val MASK_3BYTES: Int = -0x01e080
433
/**
 * Decodes the 3-byte UTF-8 sequence that starts at [beginIndex] and passes exactly one code point
 * to [yield]: the decoded value, or [REPLACEMENT_CODE_POINT] when the sequence is truncated by
 * [endIndex], contains a non-continuation byte, is an overlong encoding, or decodes to a
 * surrogate (`0xd800..0xdfff`).
 *
 * @return the number of bytes the caller should advance by: the count of leading bytes that
 *     formed a valid prefix (1 or 2) when the sequence is truncated or malformed, otherwise 3.
 */
internal inline fun ByteArray.process3Utf8Bytes(
    beginIndex: Int,
    endIndex: Int,
    yield: (Int) -> Unit
): Int {
    if (endIndex <= beginIndex + 2) {
        // At most 2 bytes remain, so the sequence cannot be complete.
        yield(REPLACEMENT_CODE_POINT)
        val hasSecond = endIndex > beginIndex + 1
        // 2: both remaining bytes belong to the truncated sequence - underflow.
        // 1: only the lead byte remains, or the 2nd byte is not a continuation - malformed.
        return if (hasSecond && isUtf8Continuation(this[beginIndex + 1])) 2 else 1
    }

    val lead = this[beginIndex]
    val second = this[beginIndex + 1]
    if (!isUtf8Continuation(second)) {
        yield(REPLACEMENT_CODE_POINT)
        return 1
    }
    val third = this[beginIndex + 2]
    if (!isUtf8Continuation(third)) {
        yield(REPLACEMENT_CODE_POINT)
        return 2
    }

    // XOR the sign-extended, shifted bytes together; the mask cancels all three headers.
    val decoded = MASK_3BYTES xor
        third.toInt() xor
        (second.toInt() shl 6) xor
        (lead.toInt() shl 12)

    // Reject overlong code points (below 0x800) and partial surrogates.
    val rejected = decoded < 0x800 || decoded in 0xd800..0xdfff
    yield(if (rejected) REPLACEMENT_CODE_POINT else decoded)
    return 3
}
485
/**
 * Mask that strips the byte headers from a 4-byte UTF-8 sequence once its sign-extended bytes
 * have been shifted into place and XOR'd together.
 *
 * Derivation:
 *
 *     (0xf0.toByte() shl 18) xor
 *     (0x80.toByte() shl 12) xor
 *     (0x80.toByte() shl 6) xor
 *     (0x80.toByte().toInt())
 */
internal const val MASK_4BYTES: Int = 0x381f80
493
/**
 * Decodes the 4-byte UTF-8 sequence that starts at [beginIndex] and passes exactly one code point
 * to [yield]: the decoded value, or [REPLACEMENT_CODE_POINT] when the sequence is truncated by
 * [endIndex], contains a non-continuation byte, or decodes to a value outside
 * `0x10000..0x10ffff`.
 *
 * @return the number of bytes the caller should advance by: the count of leading bytes that
 *     formed a valid prefix (1, 2 or 3) when the sequence is truncated or malformed, otherwise 4.
 */
internal inline fun ByteArray.process4Utf8Bytes(
    beginIndex: Int,
    endIndex: Int,
    yield: (Int) -> Unit
): Int {
    if (endIndex <= beginIndex + 3) {
        // At most 3 bytes remain, so the sequence cannot be complete.
        yield(REPLACEMENT_CODE_POINT)
        return when {
            // Only 1 byte remaining - underflow; or 2nd byte is not a continuation - malformed.
            endIndex <= beginIndex + 1 || !isUtf8Continuation(this[beginIndex + 1]) -> 1
            // Only 2 bytes remaining - underflow; or 3rd byte is not a continuation - malformed.
            endIndex <= beginIndex + 2 || !isUtf8Continuation(this[beginIndex + 2]) -> 2
            // Only 3 bytes remaining - underflow.
            else -> 3
        }
    }

    val lead = this[beginIndex]
    val second = this[beginIndex + 1]
    if (!isUtf8Continuation(second)) {
        yield(REPLACEMENT_CODE_POINT)
        return 1
    }
    val third = this[beginIndex + 2]
    if (!isUtf8Continuation(third)) {
        yield(REPLACEMENT_CODE_POINT)
        return 2
    }
    val fourth = this[beginIndex + 3]
    if (!isUtf8Continuation(fourth)) {
        yield(REPLACEMENT_CODE_POINT)
        return 3
    }

    // XOR the sign-extended, shifted bytes together; the mask cancels all four headers.
    val decoded = MASK_4BYTES xor
        fourth.toInt() xor
        (third.toInt() shl 6) xor
        (second.toInt() shl 12) xor
        (lead.toInt() shl 18)

    // Accept only 0x10000..0x10ffff. Anything above exceeds the Unicode maximum; anything below
    // is an overlong encoding, which also covers the surrogate range 0xd800..0xdfff.
    yield(if (decoded in 0x10000..0x10ffff) decoded else REPLACEMENT_CODE_POINT)
    return 4
}
558