/* Copyright JS Foundation and other contributors, http://js.foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIT_GLOBALS_H
#define LIT_GLOBALS_H

#include "jrt.h"

/**
 * The ECMAScript standard defines the terms "code unit" and "character" as a 16-bit unsigned value
 * used to represent a 16-bit unit of text; this is the same as a code unit in UTF-16
 * (see ECMA-262 5.1 Chapter 6).
 *
 * The term "code point" or "Unicode character" is used to refer to a single Unicode scalar value (may be longer
 * than 16 bits: 0x0 - 0x10FFFF). One code point can be represented with one or two 16-bit code units.
 *
 * According to the standard all strings and source text are assumed to be a sequence of code units.
 * The length of a string equals the number of code units in the string, which is not the same as the number
 * of Unicode characters in the string.
 *
 * Internally the JerryScript engine uses a UTF-8 representation of strings to reduce memory overhead.
 * A Unicode character occupies from one to four bytes in the UTF-8 representation.
 *
 *  Unicode scalar value | Bytes in UTF-8            | Bytes in UTF-16
 *                       | (internal representation) |
 * ----------------------------------------------------------------------
 *  0x0     - 0x7F       | 1 byte                    | 2 bytes
 *  0x80    - 0x7FF      | 2 bytes                   | 2 bytes
 *  0x800   - 0xFFFF     | 3 bytes                   | 2 bytes
 *  0x10000 - 0x10FFFF   | 4 bytes                   | 4 bytes
 *
 * Scalar values from 0xD800 to 0xDFFF are permanently reserved by the Unicode standard to encode high and low
 * surrogates in UTF-16 (code points 0x10000 - 0x10FFFF are encoded via a pair of surrogates in UTF-16).
 * Although the official Unicode standard says that no UTF form can encode these code points, we allow
 * them to be encoded inside strings. The reason for that is compatibility with the ECMA standard.
 *
 * For example, assume a string which consists of one Unicode character: 0x1D700 (Mathematical Italic Small Epsilon).
 * It has the following representation in UTF-16: 0xD835 0xDF00.
 *
 * The ECMA standard allows extracting a substring from this string:
 * > var str = String.fromCharCode (0xD835, 0xDF00); // Create a string containing one character: 0x1D700
 * > str.length; // 2
 * > var str1 = str.substring (0, 1);
 * > str1.length; // 1
 * > str1.charCodeAt (0); // 55349 (this equals to 0xD835)
 *
 * Internally the original string would be represented in UTF-8 as the following byte sequence: 0xF0 0x9D 0x9C 0x80.
 * After substring extraction the high surrogate 0xD835 should be encoded via UTF-8: 0xED 0xA0 0xB5.
 *
 * A pair of low and high surrogates encoded separately should never occur in the internal string representation;
 * it should be encoded as any other code point and occupy 4 bytes. So, when constructing a string from
 * two surrogates, it should be processed gracefully:
 * > var str1 = String.fromCharCode (0xD835); // 0xED 0xA0 0xB5 - internal representation
 * > var str2 = String.fromCharCode (0xDF00); // 0xED 0xBC 0x80 - internal representation
 * > var str = str1 + str2; // 0xF0 0x9D 0x9C 0x80 - internal representation,
 *                          // !!! not 0xED 0xA0 0xB5 0xED 0xBC 0x80
 */

/**
 * Description of an ecma-character, which represents a 16-bit code unit,
 * which is equal to a UTF-16 character (see Chapter 6 from ECMA-262 5.1)
 */
typedef uint16_t ecma_char_t;

/**
 * Description of a collection's/string's length (32-bit unsigned)
 */
typedef uint32_t ecma_length_t;

/**
 * Max bytes needed to represent a code unit (utf-16 char) via utf-8 encoding
 * (values up to 0xFFFF take at most 3 bytes, see the table above)
 */
#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)

/**
 * Max bytes needed to represent a code point (Unicode character) via utf-8 encoding
 * (values up to 0x10FFFF take at most 4 bytes, see the table above)
 */
#define LIT_UTF8_MAX_BYTES_IN_CODE_POINT (4)

/**
 * Max bytes needed to represent a code unit (utf-16 char) via cesu-8 encoding
 */
#define LIT_CESU8_MAX_BYTES_IN_CODE_UNIT (3)

/**
 * Max bytes needed to represent a code point (Unicode character) via cesu-8 encoding,
 * i.e. two code units of up to LIT_CESU8_MAX_BYTES_IN_CODE_UNIT bytes each
 */
#define LIT_CESU8_MAX_BYTES_IN_CODE_POINT (6)

/**
 * A byte of utf-8 string
 */
typedef uint8_t lit_utf8_byte_t;

/**
 * Size of a utf-8 string in bytes (32-bit unsigned)
 */
typedef uint32_t lit_utf8_size_t;

/**
 * Size of a magic string in bytes (8-bit unsigned, so at most 255)
 */
typedef uint8_t lit_magic_size_t;

/**
 * Unicode code point (32-bit unsigned, wide enough for the full 0x0 - 0x10FFFF range)
 */
typedef uint32_t lit_code_point_t;

/**
 * ECMA string hash (32-bit unsigned)
 */
typedef uint32_t lit_string_hash_t;

#endif /* !LIT_GLOBALS_H */