1 /* Copyright JS Foundation and other contributors, http://js.foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "lit-strings.h"
17
18 #include "jrt-libc-includes.h"
19
20 /**
21 * Validate utf-8 string
22 *
23 * NOTE:
24 * Isolated surrogates are allowed.
25 * Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character.
26 *
27 * @return true if utf-8 string is well-formed
28 * false otherwise
29 */
30 bool
lit_is_valid_utf8_string(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t buf_size)31 lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
32 lit_utf8_size_t buf_size) /**< string size */
33 {
34 lit_utf8_size_t idx = 0;
35
36 bool is_prev_code_point_high_surrogate = false;
37 while (idx < buf_size)
38 {
39 lit_utf8_byte_t c = utf8_buf_p[idx++];
40 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
41 {
42 is_prev_code_point_high_surrogate = false;
43 continue;
44 }
45
46 lit_code_point_t code_point = 0;
47 lit_code_point_t min_code_point = 0;
48 lit_utf8_size_t extra_bytes_count;
49 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
50 {
51 extra_bytes_count = 1;
52 min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
53 code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
54 }
55 else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
56 {
57 extra_bytes_count = 2;
58 min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
59 code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
60 }
61 else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
62 {
63 extra_bytes_count = 3;
64 min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN;
65 code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
66 }
67 else
68 {
69 /* utf-8 string could not contain 5- and 6-byte sequences. */
70 return false;
71 }
72
73 if (idx + extra_bytes_count > buf_size)
74 {
75 /* utf-8 string breaks in the middle */
76 return false;
77 }
78
79 for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
80 {
81 c = utf8_buf_p[idx + offset];
82 if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
83 {
84 /* invalid continuation byte */
85 return false;
86 }
87 code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
88 code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
89 }
90
91 if (code_point < min_code_point
92 || code_point > LIT_UNICODE_CODE_POINT_MAX)
93 {
94 /* utf-8 string doesn't encode valid unicode code point */
95 return false;
96 }
97
98 if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
99 && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
100 {
101 is_prev_code_point_high_surrogate = true;
102 }
103 else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
104 && code_point <= LIT_UTF16_LOW_SURROGATE_MAX
105 && is_prev_code_point_high_surrogate)
106 {
107 /* sequence of high and low surrogate is not allowed */
108 return false;
109 }
110 else
111 {
112 is_prev_code_point_high_surrogate = false;
113 }
114
115 idx += extra_bytes_count;
116 }
117
118 return true;
119 } /* lit_is_valid_utf8_string */
120
121 /**
122 * Validate cesu-8 string
123 *
124 * @return true if cesu-8 string is well-formed
125 * false otherwise
126 */
127 bool
lit_is_valid_cesu8_string(const lit_utf8_byte_t * cesu8_buf_p,lit_utf8_size_t buf_size)128 lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
129 lit_utf8_size_t buf_size) /**< string size */
130 {
131 lit_utf8_size_t idx = 0;
132
133 while (idx < buf_size)
134 {
135 lit_utf8_byte_t c = cesu8_buf_p[idx++];
136 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
137 {
138 continue;
139 }
140
141 lit_code_point_t code_point = 0;
142 lit_code_point_t min_code_point = 0;
143 lit_utf8_size_t extra_bytes_count;
144 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
145 {
146 extra_bytes_count = 1;
147 min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
148 code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
149 }
150 else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
151 {
152 extra_bytes_count = 2;
153 min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
154 code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
155 }
156 else
157 {
158 return false;
159 }
160
161 if (idx + extra_bytes_count > buf_size)
162 {
163 /* cesu-8 string breaks in the middle */
164 return false;
165 }
166
167 for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
168 {
169 c = cesu8_buf_p[idx + offset];
170 if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
171 {
172 /* invalid continuation byte */
173 return false;
174 }
175 code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
176 code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
177 }
178
179 if (code_point < min_code_point)
180 {
181 /* cesu-8 string doesn't encode valid unicode code point */
182 return false;
183 }
184
185 idx += extra_bytes_count;
186 }
187
188 return true;
189 } /* lit_is_valid_cesu8_string */
190
191 /**
192 * Check if the code point is UTF-16 low surrogate
193 *
194 * @return true / false
195 */
196 bool
lit_is_code_point_utf16_low_surrogate(lit_code_point_t code_point)197 lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */
198 {
199 return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX;
200 } /* lit_is_code_point_utf16_low_surrogate */
201
202 /**
203 * Check if the code point is UTF-16 high surrogate
204 *
205 * @return true / false
206 */
207 bool
lit_is_code_point_utf16_high_surrogate(lit_code_point_t code_point)208 lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */
209 {
210 return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX;
211 } /* lit_is_code_point_utf16_high_surrogate */
212
213 /**
214 * Represents code point (>0xFFFF) as surrogate pair and returns its lower part
215 *
216 * @return lower code_unit of the surrogate pair
217 */
218 static ecma_char_t
convert_code_point_to_low_surrogate(lit_code_point_t code_point)219 convert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
220 {
221 JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
222
223 ecma_char_t code_unit_bits;
224 code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK);
225
226 return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits);
227 } /* convert_code_point_to_low_surrogate */
228
229 /**
230 * Represents code point (>0xFFFF) as surrogate pair and returns its higher part
231 *
232 * @return higher code_unit of the surrogate pair
233 */
234 static ecma_char_t
convert_code_point_to_high_surrogate(lit_code_point_t code_point)235 convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
236 {
237 JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
238 JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
239
240 ecma_char_t code_unit_bits;
241 code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE);
242
243 return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits);
244 } /* convert_code_point_to_high_surrogate */
245
246 /**
247 * UTF16 Encoding method for a code point
248 *
249 * See also:
250 * ECMA-262 v6, 10.1.1
251 *
252 * @return uint8_t, the number of returning code points
253 */
254 uint8_t
lit_utf16_encode_code_point(lit_code_point_t cp,ecma_char_t * cu_p)255 lit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */
256 ecma_char_t *cu_p) /**< result of the encoding */
257 {
258 if (cp <= LIT_UTF16_CODE_UNIT_MAX)
259 {
260 cu_p[0] = (ecma_char_t) cp;
261 return 1;
262 }
263
264 cu_p[0] = convert_code_point_to_high_surrogate (cp);
265 cu_p[1] = convert_code_point_to_low_surrogate (cp);
266 return 2;
267 } /* lit_utf16_encode_code_point */
268
269 /**
270 * Calculate size of a zero-terminated utf-8 string
271 *
272 * NOTE:
273 * - string cannot be NULL
274 * - string should not contain zero characters in the middle
275 *
276 * @return size of a string
277 */
278 lit_utf8_size_t
lit_zt_utf8_string_size(const lit_utf8_byte_t * utf8_str_p)279 lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */
280 {
281 JERRY_ASSERT (utf8_str_p != NULL);
282 return (lit_utf8_size_t) strlen ((const char *) utf8_str_p);
283 } /* lit_zt_utf8_string_size */
284
285 /**
286 * Calculate length of a cesu-8 encoded string
287 *
288 * @return UTF-16 code units count
289 */
290 ecma_length_t
lit_utf8_string_length(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size)291 lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
292 lit_utf8_size_t utf8_buf_size) /**< string size */
293 {
294 ecma_length_t length = 0;
295 lit_utf8_size_t size = 0;
296
297 while (size < utf8_buf_size)
298 {
299 size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size));
300 length++;
301 }
302
303 JERRY_ASSERT (size == utf8_buf_size);
304
305 return length;
306 } /* lit_utf8_string_length */
307
308 /**
309 * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string
310 *
311 * @return size of an utf-8 encoded string
312 */
313 lit_utf8_size_t
lit_get_utf8_size_of_cesu8_string(const lit_utf8_byte_t * cesu8_buf_p,lit_utf8_size_t cesu8_buf_size)314 lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
315 lit_utf8_size_t cesu8_buf_size) /**< string size */
316 {
317 lit_utf8_size_t offset = 0;
318 lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
319 ecma_char_t prev_ch = 0;
320
321 while (offset < cesu8_buf_size)
322 {
323 ecma_char_t ch;
324 offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
325
326 if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
327 {
328 utf8_buf_size -= 2;
329 }
330
331 prev_ch = ch;
332 }
333
334 JERRY_ASSERT (offset == cesu8_buf_size);
335
336 return utf8_buf_size;
337 } /* lit_get_utf8_size_of_cesu8_string */
338
339 /**
340 * Calculate length of an utf-8 encoded string from cesu-8 encoded string
341 *
342 * @return length of an utf-8 encoded string
343 */
344 ecma_length_t
lit_get_utf8_length_of_cesu8_string(const lit_utf8_byte_t * cesu8_buf_p,lit_utf8_size_t cesu8_buf_size)345 lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
346 lit_utf8_size_t cesu8_buf_size) /**< string size */
347 {
348 lit_utf8_size_t offset = 0;
349 ecma_length_t utf8_length = 0;
350 ecma_char_t prev_ch = 0;
351
352 while (offset < cesu8_buf_size)
353 {
354 ecma_char_t ch;
355 offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
356
357 if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch))
358 {
359 utf8_length++;
360 }
361
362 prev_ch = ch;
363 }
364
365 JERRY_ASSERT (offset == cesu8_buf_size);
366
367 return utf8_length;
368 } /* lit_get_utf8_length_of_cesu8_string */
369
370 /**
371 * Decodes a unicode code point from non-empty utf-8-encoded buffer
372 *
373 * @return number of bytes occupied by code point in the string
374 */
375 lit_utf8_size_t
lit_read_code_point_from_utf8(const lit_utf8_byte_t * buf_p,lit_utf8_size_t buf_size,lit_code_point_t * code_point)376 lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
377 lit_utf8_size_t buf_size, /**< size of the buffer in bytes */
378 lit_code_point_t *code_point) /**< [out] code point */
379 {
380 JERRY_ASSERT (buf_p && buf_size);
381
382 lit_utf8_byte_t c = buf_p[0];
383 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
384 {
385 *code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
386 return 1;
387 }
388
389 lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
390 ecma_length_t bytes_count = 0;
391 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
392 {
393 bytes_count = 2;
394 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
395 }
396 else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
397 {
398 bytes_count = 3;
399 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
400 }
401 else
402 {
403 JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
404 bytes_count = 4;
405 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
406 }
407
408 JERRY_ASSERT (buf_size >= bytes_count);
409
410 for (uint32_t i = 1; i < bytes_count; ++i)
411 {
412 ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
413 ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
414 }
415
416 *code_point = ret;
417 return bytes_count;
418 } /* lit_read_code_point_from_utf8 */
419
420 /**
421 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
422 *
423 * @return number of bytes occupied by code point in the string
424 */
425 lit_utf8_size_t
lit_read_code_unit_from_utf8(const lit_utf8_byte_t * buf_p,ecma_char_t * code_point)426 lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
427 ecma_char_t *code_point) /**< [out] code point */
428 {
429 JERRY_ASSERT (buf_p);
430
431 lit_utf8_byte_t c = buf_p[0];
432 if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
433 {
434 *code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
435 return 1;
436 }
437
438 lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
439 ecma_length_t bytes_count;
440 if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
441 {
442 bytes_count = 2;
443 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
444 }
445 else
446 {
447 JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
448 bytes_count = 3;
449 ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
450 }
451
452 for (uint32_t i = 1; i < bytes_count; ++i)
453 {
454 ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
455 ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
456 }
457
458 JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX);
459 *code_point = (ecma_char_t) ret;
460 return bytes_count;
461 } /* lit_read_code_unit_from_utf8 */
462
463 /**
464 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
465 *
466 * @return number of bytes occupied by code point in the string
467 */
468 lit_utf8_size_t
lit_read_prev_code_unit_from_utf8(const lit_utf8_byte_t * buf_p,ecma_char_t * code_point)469 lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
470 ecma_char_t *code_point) /**< [out] code point */
471 {
472 JERRY_ASSERT (buf_p);
473
474 lit_utf8_decr (&buf_p);
475 return lit_read_code_unit_from_utf8 (buf_p, code_point);
476 } /* lit_read_prev_code_unit_from_utf8 */
477
478 /**
479 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
480 *
481 * @return next code unit
482 */
483 ecma_char_t
lit_cesu8_read_next(const lit_utf8_byte_t ** buf_p)484 lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
485 {
486 JERRY_ASSERT (*buf_p);
487 ecma_char_t ch;
488
489 *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch);
490
491 return ch;
492 } /* lit_cesu8_read_next */
493
494 /**
495 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
496 *
497 * @return previous code unit
498 */
499 ecma_char_t
lit_cesu8_read_prev(const lit_utf8_byte_t ** buf_p)500 lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
501 {
502 JERRY_ASSERT (*buf_p);
503 ecma_char_t ch;
504
505 lit_utf8_decr (buf_p);
506 lit_read_code_unit_from_utf8 (*buf_p, &ch);
507
508 return ch;
509 } /* lit_cesu8_read_prev */
510
511 /**
512 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
513 *
514 * @return next code unit
515 */
516 ecma_char_t JERRY_ATTR_NOINLINE
lit_cesu8_peek_next(const lit_utf8_byte_t * buf_p)517 lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
518 {
519 JERRY_ASSERT (buf_p != NULL);
520 ecma_char_t ch;
521
522 lit_read_code_unit_from_utf8 (buf_p, &ch);
523
524 return ch;
525 } /* lit_cesu8_peek_next */
526
527 /**
528 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
529 *
530 * @return previous code unit
531 */
532 ecma_char_t JERRY_ATTR_NOINLINE
lit_cesu8_peek_prev(const lit_utf8_byte_t * buf_p)533 lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
534 {
535 JERRY_ASSERT (buf_p != NULL);
536 ecma_char_t ch;
537
538 lit_read_prev_code_unit_from_utf8 (buf_p, &ch);
539
540 return ch;
541 } /* lit_cesu8_peek_prev */
542
543 /**
544 * Increase cesu-8 encoded string pointer by one code unit.
545 */
546 inline void JERRY_ATTR_ALWAYS_INLINE
lit_utf8_incr(const lit_utf8_byte_t ** buf_p)547 lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
548 {
549 JERRY_ASSERT (*buf_p);
550
551 *buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p);
552 } /* lit_utf8_incr */
553
554 /**
555 * Decrease cesu-8 encoded string pointer by one code unit.
556 */
557 void
lit_utf8_decr(const lit_utf8_byte_t ** buf_p)558 lit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
559 {
560 JERRY_ASSERT (*buf_p);
561 const lit_utf8_byte_t *current_p = *buf_p;
562
563 do
564 {
565 current_p--;
566 }
567 while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
568
569 *buf_p = current_p;
570 } /* lit_utf8_decr */
571
572 /**
573 * Calc hash using the specified hash_basis.
574 *
575 * NOTE:
576 * This is implementation of FNV-1a hash function, which is released into public domain.
577 * Constants used, are carefully picked primes by the authors.
578 * More info: http://www.isthe.com/chongo/tech/comp/fnv/
579 *
580 * @return ecma-string's hash
581 */
582 inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
lit_utf8_string_hash_combine(lit_string_hash_t hash_basis,const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size)583 lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */
584 const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
585 lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
586 {
587 JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
588
589 uint32_t hash = hash_basis;
590
591 for (uint32_t i = 0; i < utf8_buf_size; i++)
592 {
593 /* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */
594 hash = (hash ^ utf8_buf_p[i]) * 16777619;
595 }
596
597 return (lit_string_hash_t) hash;
598 } /* lit_utf8_string_hash_combine */
599
600 /**
601 * Calculate hash from the buffer.
602 *
603 * @return ecma-string's hash
604 */
605 inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
lit_utf8_string_calc_hash(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size)606 lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
607 lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
608 {
609 JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
610
611 /* 32 bit offset_basis for FNV = 2166136261 */
612 return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size);
613 } /* lit_utf8_string_calc_hash */
614
615 /**
616 * Return code unit at the specified position in string
617 *
618 * NOTE:
619 * code_unit_offset should be less then string's length
620 *
621 * @return code unit value
622 */
623 ecma_char_t
lit_utf8_string_code_unit_at(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size,ecma_length_t code_unit_offset)624 lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
625 lit_utf8_size_t utf8_buf_size, /**< string size in bytes */
626 ecma_length_t code_unit_offset) /**< ofset of a code_unit */
627 {
628 lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p;
629 ecma_char_t code_unit;
630
631 do
632 {
633 JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size);
634 current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit);
635 }
636 while (code_unit_offset--);
637
638 return code_unit;
639 } /* lit_utf8_string_code_unit_at */
640
641 /**
642 * Get CESU-8 encoded size of character
643 *
644 * @return number of bytes occupied in CESU-8
645 */
646 inline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE
lit_get_unicode_char_size_by_utf8_first_byte(const lit_utf8_byte_t first_byte)647 lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */
648 {
649 if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
650 {
651 return 1;
652 }
653 else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
654 {
655 return 2;
656 }
657 else
658 {
659 JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
660 return 3;
661 }
662 } /* lit_get_unicode_char_size_by_utf8_first_byte */
663
664 /**
665 * Convert code unit to cesu-8 representation
666 *
667 * @return byte count required to represent the code unit
668 */
669 lit_utf8_size_t
lit_code_unit_to_utf8(ecma_char_t code_unit,lit_utf8_byte_t * buf_p)670 lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
671 lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size
672 * should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */
673 {
674 if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
675 {
676 buf_p[0] = (lit_utf8_byte_t) code_unit;
677 return 1;
678 }
679 else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
680 {
681 uint32_t code_unit_bits = code_unit;
682 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
683 code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
684
685 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK);
686 JERRY_ASSERT (first_byte_bits == code_unit_bits);
687
688 buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
689 buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
690 return 2;
691 }
692 else
693 {
694 uint32_t code_unit_bits = code_unit;
695 lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
696 code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
697
698 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
699 code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
700
701 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK);
702 JERRY_ASSERT (first_byte_bits == code_unit_bits);
703
704 buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
705 buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
706 buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
707 return 3;
708 }
709 } /* lit_code_unit_to_utf8 */
710
711 /**
712 * Convert code point to cesu-8 representation
713 *
714 * @return byte count required to represent the code point
715 */
716 lit_utf8_size_t
lit_code_point_to_cesu8(lit_code_point_t code_point,lit_utf8_byte_t * buf)717 lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
718 lit_utf8_byte_t *buf) /**< buffer where to store the result,
719 * its size should be at least 6 bytes */
720 {
721 if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
722 {
723 return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
724 }
725 else
726 {
727 lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf);
728 offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset);
729 return offset;
730 }
731 } /* lit_code_point_to_cesu8 */
732
733 /**
734 * Convert code point to utf-8 representation
735 *
736 * @return byte count required to represent the code point
737 */
738 lit_utf8_size_t
lit_code_point_to_utf8(lit_code_point_t code_point,lit_utf8_byte_t * buf)739 lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
740 lit_utf8_byte_t *buf) /**< buffer where to store the result,
741 * its size should be at least 4 bytes */
742 {
743 if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
744 {
745 buf[0] = (lit_utf8_byte_t) code_point;
746 return 1;
747 }
748 else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
749 {
750 uint32_t code_point_bits = code_point;
751 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
752 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
753
754 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK);
755 JERRY_ASSERT (first_byte_bits == code_point_bits);
756
757 buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
758 buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
759 return 2;
760 }
761 else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
762 {
763 uint32_t code_point_bits = code_point;
764 lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
765 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
766
767 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
768 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
769
770 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK);
771 JERRY_ASSERT (first_byte_bits == code_point_bits);
772
773 buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
774 buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
775 buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
776 return 3;
777 }
778 else
779 {
780 JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX);
781
782 uint32_t code_point_bits = code_point;
783 lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
784 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
785
786 lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
787 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
788
789 lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
790 code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
791
792 lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK);
793 JERRY_ASSERT (first_byte_bits == code_point_bits);
794
795 buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits;
796 buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
797 buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
798 buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits;
799 return 4;
800 }
801 } /* lit_code_point_to_utf8 */
802
803 /**
804 * Convert cesu-8 string to an utf-8 string and put it into the buffer.
805 * It is the caller's responsibility to make sure that the string fits in the buffer.
806 *
807 * @return number of bytes copied to the buffer.
808 */
809 lit_utf8_size_t
lit_convert_cesu8_string_to_utf8_string(const lit_utf8_byte_t * cesu8_string,lit_utf8_size_t cesu8_size,lit_utf8_byte_t * utf8_string,lit_utf8_size_t utf8_size)810 lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */
811 lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */
812 lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer
813 * (can be NULL if buffer_size == 0) */
814 lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */
815 {
816 const lit_utf8_byte_t *cesu8_pos = cesu8_string;
817 const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size;
818
819 lit_utf8_byte_t *utf8_pos = utf8_string;
820 lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size;
821
822 lit_utf8_size_t size = 0;
823
824 ecma_char_t prev_ch = 0;
825 lit_utf8_size_t prev_ch_size = 0;
826
827 while (cesu8_pos < cesu8_end_pos)
828 {
829 ecma_char_t ch;
830 lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
831
832 if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
833 {
834 JERRY_ASSERT (code_unit_size == prev_ch_size);
835 utf8_pos -= prev_ch_size;
836 lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch);
837 lit_code_point_to_utf8 (code_point, utf8_pos);
838 size++;
839 }
840 else
841 {
842 memcpy (utf8_pos, cesu8_pos, code_unit_size);
843 size += code_unit_size;
844 }
845
846 utf8_pos = utf8_string + size;
847 cesu8_pos += code_unit_size;
848 prev_ch = ch;
849 prev_ch_size = code_unit_size;
850 }
851
852 JERRY_ASSERT (cesu8_pos == cesu8_end_pos);
853 JERRY_ASSERT (utf8_pos <= utf8_end_pos);
854
855 return size;
856 } /* lit_convert_cesu8_string_to_utf8_string */
857
858 /**
859 * Convert surrogate pair to code point
860 *
861 * @return code point
862 */
863 lit_code_point_t
lit_convert_surrogate_pair_to_code_point(ecma_char_t high_surrogate,ecma_char_t low_surrogate)864 lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */
865 ecma_char_t low_surrogate) /**< low surrogate code point */
866 {
867 JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate));
868 JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate));
869
870 lit_code_point_t code_point;
871 code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
872 code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
873
874 code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
875
876 code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
877 return code_point;
878 } /* lit_convert_surrogate_pair_to_code_point */
879
880 /**
881 * Relational compare of cesu-8 strings
882 *
883 * First string is less than second string if:
884 * - strings are not equal;
885 * - first string is prefix of second or is lexicographically less than second.
886 *
887 * @return true - if first string is less than second string,
888 * false - otherwise
889 */
lit_compare_utf8_strings_relational(const lit_utf8_byte_t * string1_p,lit_utf8_size_t string1_size,const lit_utf8_byte_t * string2_p,lit_utf8_size_t string2_size)890 bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
891 lit_utf8_size_t string1_size, /**< string size */
892 const lit_utf8_byte_t *string2_p, /**< utf-8 string */
893 lit_utf8_size_t string2_size) /**< string size */
894 {
895 lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p;
896 lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p;
897 const lit_utf8_byte_t *string1_end_p = string1_p + string1_size;
898 const lit_utf8_byte_t *string2_end_p = string2_p + string2_size;
899
900 while (string1_pos < string1_end_p && string2_pos < string2_end_p)
901 {
902 ecma_char_t ch1, ch2;
903 string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1);
904 string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2);
905
906 if (ch1 < ch2)
907 {
908 return true;
909 }
910 else if (ch1 > ch2)
911 {
912 return false;
913 }
914 }
915
916 return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
917 } /* lit_compare_utf8_strings_relational */
918