• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright JS Foundation and other contributors, http://js.foundation
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef LIT_GLOBALS_H
17 #define LIT_GLOBALS_H
18 
19 #include "jrt.h"
20 
21 /**
22  * ECMAScript standard defines the terms "code unit" and "character" as a 16-bit unsigned value
23  * used to represent a 16-bit unit of text; this is the same as a code unit in UTF-16 (See ECMA-262 5.1 Chapter 6).
24  *
25  * The term "code point" or "Unicode character" is used to refer to a single Unicode scalar value (may be longer
26  * than 16 bits: 0x0 - 0x10FFFF). One code point could be represented with one or two 16-bit code units.
27  *
28  * According to the standard all strings and source text are assumed to be a sequence of code units.
29  * The length of a string equals the number of code units in the string, which is not the same as the number of
30  * Unicode characters in the string.
31  *
32  * Internally JerryScript engine uses UTF-8 representation of strings to reduce memory overhead. Unicode character
33  * occupies from one to four bytes in UTF-8 representation.
34  *
35  * Unicode scalar value   | Bytes in UTF-8             | Bytes in UTF-16
36  *                        | (internal representation)  |
37  * ----------------------------------------------------------------------
38  *  0x0     - 0x7F        |  1 byte                    |  2 bytes
39  *  0x80    - 0x7FF       |  2 bytes                   |  2 bytes
40  *  0x800   - 0xFFFF      |  3 bytes                   |  2 bytes
41  *  0x10000 - 0x10FFFF    |  4 bytes                   |  4 bytes
42  *
43  * Scalar values from 0xD800 to 0xDFFF are permanently reserved by Unicode standard to encode high and low
44  * surrogates in UTF-16 (Code points 0x10000 - 0x10FFFF are encoded via pair of surrogates in UTF-16).
45  * Although the official Unicode standard says that no UTF forms can encode these code points, we allow
46  * them to be encoded inside strings. The reason for that is compatibility with the ECMA standard.
47  *
48  * For example, assume a string which consists of one Unicode character: 0x1D700 (Mathematical Italic Small Epsilon).
49  * It has the following representation in UTF-16: 0xD835 0xDF00.
50  *
51  * ECMA standard allows extracting a substring from this string:
52  * > var str = String.fromCharCode (0xD835, 0xDF00); // Create a string containing one character: 0x1D700
53  * > str.length; // 2
54  * > var str1 = str.substring (0, 1);
55  * > str1.length; // 1
56  * > str1.charCodeAt (0); // 55349 (this equals 0xD835)
57  *
58  * Internally original string would be represented in UTF-8 as the following byte sequence: 0xF0 0x9D 0x9C 0x80.
59  * After substring extraction high surrogate 0xD835 should be encoded via UTF-8: 0xED 0xA0 0xB5.
60  *
61  * A pair of high and low surrogates encoded separately should never occur in the internal string representation;
62  * such a pair should be encoded as a single code point and occupy 4 bytes. So, when constructing a string from
63  * two surrogates, it should be processed gracefully:
64  * > var str1 = String.fromCharCode (0xD835); // 0xED 0xA0 0xB5 - internal representation
65  * > var str2 = String.fromCharCode (0xDF00); // 0xED 0xBC 0x80 - internal representation
66  * > var str = str1 + str2; // 0xF0 0x9D 0x9C 0x80 - internal representation,
67  *                          // !!! not 0xED 0xA0 0xB5 0xED 0xBC 0x80
68  */
69 
70 /**
71  * Description of an ecma-character: a single 16-bit code unit,
72  * which is the same as a UTF-16 code unit (see Chapter 6 of ECMA-262 5.1)
73  */
74 typedef uint16_t ecma_char_t;
75 
76 /**
77  * Description of a collection's/string's length
78  */
79 typedef uint32_t ecma_length_t;
80 
81 /**
82  * Max bytes needed to represent a code unit (utf-16 char, 0x0 - 0xFFFF) via utf-8 encoding
83  */
84 #define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
85 
86 /**
87  * Max bytes needed to represent a code point (Unicode character, 0x0 - 0x10FFFF) via utf-8 encoding
88  */
89 #define LIT_UTF8_MAX_BYTES_IN_CODE_POINT (4)
90 
91 /**
92  * Max bytes needed to represent a code unit (utf-16 char, 0x0 - 0xFFFF) via cesu-8 encoding
93  */
94 #define LIT_CESU8_MAX_BYTES_IN_CODE_UNIT (3)
95 
96 /**
97  * Max bytes needed to represent a code point via cesu-8 encoding (a surrogate pair, 3 bytes per surrogate)
98  */
99 #define LIT_CESU8_MAX_BYTES_IN_CODE_POINT (6)
100 
101 /**
102  * A single byte of a utf-8 encoded string
103  */
104 typedef uint8_t lit_utf8_byte_t;
105 
106 /**
107  * Size of a utf-8 string in bytes
108  */
109 typedef uint32_t lit_utf8_size_t;
110 
111 /**
112  * Size of a magic string in bytes (8-bit type, so at most 255)
113  */
114 typedef uint8_t lit_magic_size_t;
115 
116 /**
117  * Unicode code point (32-bit type, as a code point may not fit into a 16-bit code unit)
118  */
119 typedef uint32_t lit_code_point_t;
120 
121 /**
122  * ECMA string hash (32-bit value)
123  */
124 typedef uint32_t lit_string_hash_t;
125 
126 #endif /* !LIT_GLOBALS_H */
127