• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright JS Foundation and other contributors, http://js.foundation
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef LIT_STRINGS_H
17 #define LIT_STRINGS_H
18 
19 #include "jrt.h"
20 #include "lit-globals.h"
21 
22 /**
23  * Null character (used in a few cases as a UTF-8 string end marker)
24  */
25 #define LIT_BYTE_NULL (0)
26 
27 /**
28  * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
29  * Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404).
30  */
31 #define LIT_UNICODE_CODE_POINT_NULL (0x0) /* code point U+0000 */
32 #define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF) /* highest Unicode code point, U+10FFFF */
33 
34 #define LIT_UTF16_CODE_UNIT_MAX (0xFFFF) /* largest value of a 16-bit code unit */
35 #define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000) /* LIT_UTF16_CODE_UNIT_MAX + 1: first code point needing a surrogate pair */
36 #define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00) /* same value as LIT_UTF16_LOW_SURROGATE_MIN */
37 #define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800) /* same value as LIT_UTF16_HIGH_SURROGATE_MIN */
38 #define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800) /* high surrogates span 0xD800..0xDBFF */
39 #define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF)
40 #define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00) /* low surrogates span 0xDC00..0xDFFF */
41 #define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF)
42 #define LIT_UTF16_BITS_IN_SURROGATE (10) /* each surrogate carries 10 bits of the code point */
43 #define LIT_UTF16_LAST_10_BITS_MASK (0x3FF) /* selects the 10 least significant bits */
44 
45 #define LIT_UTF8_1_BYTE_MARKER (0x00) /* 1-byte sequence: top bit clear (0xxxxxxx) */
46 #define LIT_UTF8_2_BYTE_MARKER (0xC0) /* first byte of a 2-byte sequence: 110xxxxx */
47 #define LIT_UTF8_3_BYTE_MARKER (0xE0) /* first byte of a 3-byte sequence: 1110xxxx */
48 #define LIT_UTF8_4_BYTE_MARKER (0xF0) /* first byte of a 4-byte sequence: 11110xxx */
49 #define LIT_UTF8_EXTRA_BYTE_MARKER (0x80) /* extra (continuation) byte: 10xxxxxx */
50 
51 #define LIT_UTF8_1_BYTE_MASK (0x80) /* each mask covers the prefix bits fixed by the marker of the same name */
52 #define LIT_UTF8_2_BYTE_MASK (0xE0) /* NOTE(review): presumably used as (byte & MASK) == MARKER - confirm in callers */
53 #define LIT_UTF8_3_BYTE_MASK (0xF0)
54 #define LIT_UTF8_4_BYTE_MASK (0xF8)
55 #define LIT_UTF8_EXTRA_BYTE_MASK (0xC0)
56 
57 #define LIT_UTF8_LAST_7_BITS_MASK (0x7F) /* the LAST_n_BITS masks select the n least significant bits of a byte */
58 #define LIT_UTF8_LAST_6_BITS_MASK (0x3F)
59 #define LIT_UTF8_LAST_5_BITS_MASK (0x1F)
60 #define LIT_UTF8_LAST_4_BITS_MASK (0x0F)
61 #define LIT_UTF8_LAST_3_BITS_MASK (0x07)
62 #define LIT_UTF8_LAST_2_BITS_MASK (0x03)
63 #define LIT_UTF8_LAST_1_BIT_MASK  (0x01)
64 
65 #define LIT_UTF8_BITS_IN_EXTRA_BYTES (6) /* payload bits per extra (continuation) byte; matches LIT_UTF8_LAST_6_BITS_MASK */
66 
67 #define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F) /* 1-byte range ends at 0x7F */
68 #define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80) /* 2-byte range: 0x80..0x7FF */
69 #define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF)
70 #define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800) /* 3-byte range: 0x800..0xFFFF */
71 #define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX)
72 #define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000) /* 4-byte range: 0x10000..0x10FFFF */
73 #define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
74 
75 /**
76  * Difference between the byte count needed to represent a code point greater than 0xFFFF
77  * in common UTF-8 (4 bytes required) and in CESU-8 (6 bytes required)
78  */
79 #define LIT_UTF8_CESU8_SURROGATE_SIZE_DIF (2 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT - LIT_UTF8_MAX_BYTES_IN_CODE_POINT)
80 
81 /**
82  * Byte values >= LIT_UTF8_FIRST_BYTE_MAX (0xF8) are not allowed in internal strings
83  */
84 #define LIT_UTF8_FIRST_BYTE_MAX (0xF8)
85 
86 /* validation: each function takes a byte buffer plus its size and returns a bool */
87 bool lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t buf_size);
88 bool lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t buf_size);
89 
90 /* checks: bool predicates on a single code point (UTF-16 low / high surrogate, per the function names) */
91 bool lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point);
92 bool lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point);
93 
94 /* size: both return lit_utf8_size_t; NOTE(review): the zt variant takes no size argument ("zt" presumably = zero-terminated; confirm) */
95 lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p);
96 lit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t cesu8_buf_size);
97 
98 /* length: returns ecma_length_t, not lit_utf8_size_t (presumably a character count rather than bytes; confirm) */
99 ecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size);
100 ecma_length_t lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t cesu8_buf_size);
101 
102 /* hash: both return lit_string_hash_t; the _combine variant also takes an existing hash value (hash_basis) */
103 lit_string_hash_t lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size);
104 lit_string_hash_t lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, const lit_utf8_byte_t *utf8_buf_p,
105                                                 lit_utf8_size_t utf8_buf_size);
106 
107 /* code unit access (NOTE(review): const on the by-value first_byte parameter has no effect for callers) */
108 ecma_char_t lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size,
109                                           ecma_length_t code_unit_offset);
110 lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte);
111 
112 /* conversion: the functions returning lit_utf8_size_t presumably report the number of bytes written (confirm) */
113 lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t code_unit, lit_utf8_byte_t *buf_p);
114 lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t code_point, lit_utf8_byte_t *buf);
115 lit_utf8_size_t lit_code_point_to_cesu8 (lit_code_point_t code_point, lit_utf8_byte_t *buf);
116 lit_utf8_size_t lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string,
117                                                          lit_utf8_size_t cesu8_size,
118                                                          lit_utf8_byte_t *utf8_string,
119                                                          lit_utf8_size_t utf8_size);
120 lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, ecma_char_t low_surrogate);
121 
/* relational comparison of two (pointer, size) strings; NOTE(review): which ordering yields true is not visible here */
122 bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, lit_utf8_size_t string1_size,
123                                           const lit_utf8_byte_t *string2_p, lit_utf8_size_t string2_size);
124 
/* NOTE(review): the uint8_t result is presumably the number of code units stored through cu_p - confirm */
125 uint8_t lit_utf16_encode_code_point (lit_code_point_t cp, ecma_char_t *cu_p);
126 
127 /* read code point / code unit from buffer; the value is presumably returned through the trailing pointer parameter.
 * NOTE(review): the two *_code_unit_* readers name their ecma_char_t output parameter "code_point" - confirm naming. */
128 lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, lit_utf8_size_t buf_size,
129                                                lit_code_point_t *code_point);
130 
131 lit_utf8_size_t lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p,
132                                               ecma_char_t *code_point);
133 
134 lit_utf8_size_t lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p,
135                                                    ecma_char_t *code_point);
136 
/* cursor helpers: read_* / incr / decr take the address of the caller's buffer pointer (presumably to move it; confirm),
 * while peek_* take the buffer pointer by value */
137 ecma_char_t lit_cesu8_read_next (const lit_utf8_byte_t **buf_p);
138 ecma_char_t lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p);
139 ecma_char_t lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p);
140 ecma_char_t lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p);
141 void lit_utf8_incr (const lit_utf8_byte_t **buf_p);
142 void lit_utf8_decr (const lit_utf8_byte_t **buf_p);
143 
144 #endif /* !LIT_STRINGS_H */
145