1 /* Copyright JS Foundation and other contributors, http://js.foundation 2 * 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef LIT_STRINGS_H 17 #define LIT_STRINGS_H 18 19 #include "jrt.h" 20 #include "lit-globals.h" 21 22 /** 23 * Null character (used in few cases as utf-8 string end marker) 24 */ 25 #define LIT_BYTE_NULL (0) 26 27 /** 28 * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The 29 * Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404). 30 */ 31 #define LIT_UNICODE_CODE_POINT_NULL (0x0) 32 #define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF) 33 34 #define LIT_UTF16_CODE_UNIT_MAX (0xFFFF) 35 #define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000) 36 #define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00) 37 #define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800) 38 #define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800) 39 #define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF) 40 #define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00) 41 #define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF) 42 #define LIT_UTF16_BITS_IN_SURROGATE (10) 43 #define LIT_UTF16_LAST_10_BITS_MASK (0x3FF) 44 45 #define LIT_UTF8_1_BYTE_MARKER (0x00) 46 #define LIT_UTF8_2_BYTE_MARKER (0xC0) 47 #define LIT_UTF8_3_BYTE_MARKER (0xE0) 48 #define LIT_UTF8_4_BYTE_MARKER (0xF0) 49 #define LIT_UTF8_EXTRA_BYTE_MARKER (0x80) 50 51 #define LIT_UTF8_1_BYTE_MASK (0x80) 52 #define LIT_UTF8_2_BYTE_MASK (0xE0) 53 #define LIT_UTF8_3_BYTE_MASK (0xF0) 54 #define LIT_UTF8_4_BYTE_MASK (0xF8) 55 #define LIT_UTF8_EXTRA_BYTE_MASK (0xC0) 56 57 #define LIT_UTF8_LAST_7_BITS_MASK (0x7F) 58 #define LIT_UTF8_LAST_6_BITS_MASK (0x3F) 59 #define LIT_UTF8_LAST_5_BITS_MASK (0x1F) 60 #define LIT_UTF8_LAST_4_BITS_MASK (0x0F) 61 #define LIT_UTF8_LAST_3_BITS_MASK (0x07) 62 #define LIT_UTF8_LAST_2_BITS_MASK (0x03) 63 #define LIT_UTF8_LAST_1_BIT_MASK (0x01) 64 65 #define LIT_UTF8_BITS_IN_EXTRA_BYTES (6) 66 67 #define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F) 68 #define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80) 69 #define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF) 70 #define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800) 71 #define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX) 72 #define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000) 73 #define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX) 74 75 /** 76 * Differnce between byte count needed to represent code point greater than 0xFFFF 77 * in common UTF-8 (4 bytes required) and CESU-8 (6 bytes required) 78 */ 79 #define LIT_UTF8_CESU8_SURROGATE_SIZE_DIF (2 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT - LIT_UTF8_MAX_BYTES_IN_CODE_POINT) 80 81 /** 82 * Byte values >= LIT_UTF8_FIRST_BYTE_MAX are not allowed in internal strings 83 */ 84 #define LIT_UTF8_FIRST_BYTE_MAX (0xF8) 85 86 /* validation */ 87 bool lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t buf_size); 88 bool lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t buf_size); 89 90 /* checks */ 91 bool lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point); 92 bool lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point); 93 94 /* size */ 95 lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p); 96 lit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t cesu8_buf_size); 97 98 /* length */ 99 ecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size); 100 ecma_length_t lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, lit_utf8_size_t cesu8_buf_size); 101 102 /* hash */ 103 lit_string_hash_t lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size); 104 lit_string_hash_t lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, const lit_utf8_byte_t *utf8_buf_p, 105 lit_utf8_size_t utf8_buf_size); 106 107 /* code unit access */ 108 ecma_char_t lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t utf8_buf_size, 109 ecma_length_t code_unit_offset); 110 lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte); 111 112 /* conversion */ 113 lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t code_unit, lit_utf8_byte_t *buf_p); 114 lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t code_point, lit_utf8_byte_t *buf); 115 lit_utf8_size_t lit_code_point_to_cesu8 (lit_code_point_t code_point, lit_utf8_byte_t *buf); 116 lit_utf8_size_t lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, 117 lit_utf8_size_t cesu8_size, 118 lit_utf8_byte_t *utf8_string, 119 lit_utf8_size_t utf8_size); 120 lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, ecma_char_t low_surrogate); 121 122 bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, lit_utf8_size_t string1_size, 123 const lit_utf8_byte_t *string2_p, lit_utf8_size_t string2_size); 124 125 uint8_t lit_utf16_encode_code_point (lit_code_point_t cp, ecma_char_t *cu_p); 126 127 /* read code point from buffer */ 128 lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, lit_utf8_size_t buf_size, 129 lit_code_point_t *code_point); 130 131 lit_utf8_size_t lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, 132 ecma_char_t *code_point); 133 134 lit_utf8_size_t lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, 135 ecma_char_t *code_point); 136 137 ecma_char_t lit_cesu8_read_next (const lit_utf8_byte_t **buf_p); 138 ecma_char_t lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p); 139 ecma_char_t lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p); 140 ecma_char_t lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p); 141 void lit_utf8_incr (const lit_utf8_byte_t **buf_p); 142 void lit_utf8_decr (const lit_utf8_byte_t **buf_p); 143 144 #endif /* !LIT_STRINGS_H */ 145