/**
 * @fileoverview UTF-8 encoding and decoding helpers.
 */
goog.module('protobuf.binary.textencoding');

const {checkElementIndex} = goog.require('protobuf.internal.checks');

/**
 * Builds a string out of a list of character codes.
 *
 * The list is handed to String.fromCharCode in slices of at most 10000
 * elements and the partial strings are concatenated.
 * See http://jsperf.com/string-fromcharcode-test/13 for the performance
 * background.
 * @param {!Array<number>} codePoints
 * @return {string}
 */
function codePointsToString(codePoints) {
  const CHUNK_SIZE = 10000;
  const total = codePoints.length;
  let result = '';
  for (let start = 0; start < total; start += CHUNK_SIZE) {
    const stop = Math.min(start + CHUNK_SIZE, total);
    result += String.fromCharCode.apply(null, codePoints.slice(start, stop));
  }
  return result;
}

/**
 * Decodes UTF-8 encoded bytes into a JavaScript string.
 * Supports codepoints from U+0000 up to U+10FFFF
 * (http://en.wikipedia.org/wiki/UTF-8).
 *
 * A continuation byte found where a lead byte is expected is skipped, and
 * a lead byte of 0xF8 or above contributes nothing to the result.
 * Before the trailing bytes of a multi-byte sequence are read, the index of
 * the last one is passed to checkElementIndex (presumably a bounds
 * assertion -- confirm against protobuf.internal.checks).
 * @param {!DataView} bytes
 * @return {string}
 */
function decode(bytes) {
  const size = bytes.byteLength;
  const units = [];
  let pos = 0;

  while (pos < size) {
    const lead = bytes.getUint8(pos++);

    // Single byte: plain 7-bit ASCII.
    if (lead < 0x80) {
      units.push(lead);
      continue;
    }

    // Stray continuation byte: we are out of sync (this can happen after an
    // attempt to read a character longer than four bytes). Skip it.
    if (lead < 0xC0) {
      continue;
    }

    // Two-byte sequence.
    if (lead < 0xE0) {
      checkElementIndex(pos, size);
      const b2 = bytes.getUint8(pos++);
      units.push(((lead & 0x1F) << 6) | (b2 & 0x3F));
      continue;
    }

    // Three-byte sequence.
    if (lead < 0xF0) {
      checkElementIndex(pos + 1, size);
      const b2 = bytes.getUint8(pos++);
      const b3 = bytes.getUint8(pos++);
      units.push(((lead & 0xF) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F));
      continue;
    }

    // Four-byte sequence. These carry 21 bits, which do not fit in a single
    // 16-bit character, so the value is emitted as a surrogate pair.
    if (lead < 0xF8) {
      checkElementIndex(pos + 2, size);
      const b2 = bytes.getUint8(pos++);
      const b3 = bytes.getUint8(pos++);
      const b4 = bytes.getUint8(pos++);
      const scalar = ((lead & 0x07) << 18) | ((b2 & 0x3F) << 12) |
          ((b3 & 0x3F) << 6) | (b4 & 0x3F);
      // Surrogate construction: subtract 0x10000, then the upper 10 bits
      // plus 0xD800 form the high surrogate and the lower 10 bits plus
      // 0xDC00 form the low surrogate.
      const offset = scalar - 0x10000;
      const highSurrogate = ((offset >> 10) & 0x3FF) + 0xD800;
      const lowSurrogate = (offset & 0x3FF) + 0xDC00;
      units.push(highSurrogate, lowSurrogate);
    }
  }

  return codePointsToString(units);
}

/**
 * Encodes a UTF-16 JavaScript string as UTF-8.
 *
 * Any code unit in the range 0xD800-0xDFFF is treated as the first half of
 * a surrogate pair and combined with the code unit that follows it; neither
 * half is validated. The index of that following unit is passed to
 * checkElementIndex before it is read.
 * @param {string} value The string to write.
 * @return {!Uint8Array} An array containing the encoded bytes.
 */
function encode(value) {
  const length = value.length;
  const out = [];
  let index = 0;

  while (index < length) {
    const unit = value.charCodeAt(index++);

    // One byte.
    if (unit < 0x80) {
      out.push(unit);
      continue;
    }

    // Two bytes.
    if (unit < 0x800) {
      out.push((unit >> 6) | 0xC0, (unit & 0x3F) | 0x80);
      continue;
    }

    // Three bytes: everything in the BMP outside the surrogate range.
    if (unit < 0xD800 || unit >= 0xE000) {
      out.push(
          (unit >> 12) | 0xE0,
          ((unit >> 6) & 0x3F) | 0x80,
          (unit & 0x3F) | 0x80);
      continue;
    }

    // Four bytes: merge this unit with the next one into a single codepoint.
    checkElementIndex(index, length);
    const trail = value.charCodeAt(index++);
    const paired = 0x10000 + (((unit & 0x3FF) << 10) | (trail & 0x3FF));
    out.push(
        (paired >> 18) | 0xF0,
        ((paired >> 12) & 0x3F) | 0x80,
        ((paired >> 6) & 0x3F) | 0x80,
        (paired & 0x3F) | 0x80);
  }

  return new Uint8Array(out);
}

exports = {
  decode,
  encode,
};