• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright JS Foundation and other contributors, http://js.foundation
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "lit-strings.h"
17 
18 #include "jrt-libc-includes.h"
19 
20 /**
21  * Validate utf-8 string
22  *
23  * NOTE:
24  *   Isolated surrogates are allowed.
25  *   Correct pair of surrogates is not allowed, it should be represented as 4-byte utf-8 character.
26  *
27  * @return true if utf-8 string is well-formed
28  *         false otherwise
29  */
30 bool
lit_is_valid_utf8_string(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t buf_size)31 lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
32                           lit_utf8_size_t buf_size) /**< string size */
33 {
34   lit_utf8_size_t idx = 0;
35 
36   bool is_prev_code_point_high_surrogate = false;
37   while (idx < buf_size)
38   {
39     lit_utf8_byte_t c = utf8_buf_p[idx++];
40     if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
41     {
42       is_prev_code_point_high_surrogate = false;
43       continue;
44     }
45 
46     lit_code_point_t code_point = 0;
47     lit_code_point_t min_code_point = 0;
48     lit_utf8_size_t extra_bytes_count;
49     if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
50     {
51       extra_bytes_count = 1;
52       min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
53       code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
54     }
55     else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
56     {
57       extra_bytes_count = 2;
58       min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
59       code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
60     }
61     else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
62     {
63       extra_bytes_count = 3;
64       min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN;
65       code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
66     }
67     else
68     {
69       /* utf-8 string could not contain 5- and 6-byte sequences. */
70       return false;
71     }
72 
73     if (idx + extra_bytes_count > buf_size)
74     {
75       /* utf-8 string breaks in the middle */
76       return false;
77     }
78 
79     for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
80     {
81       c = utf8_buf_p[idx + offset];
82       if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
83       {
84         /* invalid continuation byte */
85         return false;
86       }
87       code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
88       code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
89     }
90 
91     if (code_point < min_code_point
92         || code_point > LIT_UNICODE_CODE_POINT_MAX)
93     {
94       /* utf-8 string doesn't encode valid unicode code point */
95       return false;
96     }
97 
98     if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
99         && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
100     {
101       is_prev_code_point_high_surrogate = true;
102     }
103     else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
104              && code_point <= LIT_UTF16_LOW_SURROGATE_MAX
105              && is_prev_code_point_high_surrogate)
106     {
107       /* sequence of high and low surrogate is not allowed */
108       return false;
109     }
110     else
111     {
112       is_prev_code_point_high_surrogate = false;
113     }
114 
115     idx += extra_bytes_count;
116   }
117 
118   return true;
119 } /* lit_is_valid_utf8_string */
120 
121 /**
122  * Validate cesu-8 string
123  *
124  * @return true if cesu-8 string is well-formed
125  *         false otherwise
126  */
127 bool
lit_is_valid_cesu8_string(const lit_utf8_byte_t * cesu8_buf_p,lit_utf8_size_t buf_size)128 lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
129                            lit_utf8_size_t buf_size) /**< string size */
130 {
131   lit_utf8_size_t idx = 0;
132 
133   while (idx < buf_size)
134   {
135     lit_utf8_byte_t c = cesu8_buf_p[idx++];
136     if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
137     {
138       continue;
139     }
140 
141     lit_code_point_t code_point = 0;
142     lit_code_point_t min_code_point = 0;
143     lit_utf8_size_t extra_bytes_count;
144     if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
145     {
146       extra_bytes_count = 1;
147       min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
148       code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
149     }
150     else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
151     {
152       extra_bytes_count = 2;
153       min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
154       code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
155     }
156     else
157     {
158       return false;
159     }
160 
161     if (idx + extra_bytes_count > buf_size)
162     {
163       /* cesu-8 string breaks in the middle */
164       return false;
165     }
166 
167     for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
168     {
169       c = cesu8_buf_p[idx + offset];
170       if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
171       {
172         /* invalid continuation byte */
173         return false;
174       }
175       code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
176       code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
177     }
178 
179     if (code_point < min_code_point)
180     {
181       /* cesu-8 string doesn't encode valid unicode code point */
182       return false;
183     }
184 
185     idx += extra_bytes_count;
186   }
187 
188   return true;
189 } /* lit_is_valid_cesu8_string */
190 
191 /**
192  * Check if the code point is UTF-16 low surrogate
193  *
194  * @return true / false
195  */
196 bool
lit_is_code_point_utf16_low_surrogate(lit_code_point_t code_point)197 lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */
198 {
199   return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX;
200 } /* lit_is_code_point_utf16_low_surrogate */
201 
202 /**
203  * Check if the code point is UTF-16 high surrogate
204  *
205  * @return true / false
206  */
207 bool
lit_is_code_point_utf16_high_surrogate(lit_code_point_t code_point)208 lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */
209 {
210   return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX;
211 } /* lit_is_code_point_utf16_high_surrogate */
212 
213 /**
214  * Represents code point (>0xFFFF) as surrogate pair and returns its lower part
215  *
216  * @return lower code_unit of the surrogate pair
217  */
218 static ecma_char_t
convert_code_point_to_low_surrogate(lit_code_point_t code_point)219 convert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
220 {
221   JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
222 
223   ecma_char_t code_unit_bits;
224   code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK);
225 
226   return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits);
227 } /* convert_code_point_to_low_surrogate */
228 
229 /**
230  * Represents code point (>0xFFFF) as surrogate pair and returns its higher part
231  *
232  * @return higher code_unit of the surrogate pair
233  */
234 static ecma_char_t
convert_code_point_to_high_surrogate(lit_code_point_t code_point)235 convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
236 {
237   JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
238   JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
239 
240   ecma_char_t code_unit_bits;
241   code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE);
242 
243   return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits);
244 } /* convert_code_point_to_high_surrogate */
245 
246 /**
247  * UTF16 Encoding method for a code point
248  *
249  * See also:
250  *          ECMA-262 v6, 10.1.1
251  *
252  * @return uint8_t, the number of returning code points
253  */
254 uint8_t
lit_utf16_encode_code_point(lit_code_point_t cp,ecma_char_t * cu_p)255 lit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */
256                              ecma_char_t *cu_p) /**< result of the encoding */
257 {
258   if (cp <= LIT_UTF16_CODE_UNIT_MAX)
259   {
260     cu_p[0] = (ecma_char_t) cp;
261     return 1;
262   }
263 
264   cu_p[0] = convert_code_point_to_high_surrogate (cp);
265   cu_p[1] = convert_code_point_to_low_surrogate (cp);
266   return 2;
267 } /* lit_utf16_encode_code_point */
268 
269 /**
270  * Calculate size of a zero-terminated utf-8 string
271  *
272  * NOTE:
273  *   - string cannot be NULL
274  *   - string should not contain zero characters in the middle
275  *
276  * @return size of a string
277  */
278 lit_utf8_size_t
lit_zt_utf8_string_size(const lit_utf8_byte_t * utf8_str_p)279 lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */
280 {
281   JERRY_ASSERT (utf8_str_p != NULL);
282   return (lit_utf8_size_t) strlen ((const char *) utf8_str_p);
283 } /* lit_zt_utf8_string_size */
284 
285 /**
286  * Calculate length of a cesu-8 encoded string
287  *
288  * @return UTF-16 code units count
289  */
290 ecma_length_t
lit_utf8_string_length(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size)291 lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
292                         lit_utf8_size_t utf8_buf_size) /**< string size */
293 {
294   ecma_length_t length = 0;
295   lit_utf8_size_t size = 0;
296 
297   while (size < utf8_buf_size)
298   {
299     size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size));
300     length++;
301   }
302 
303   JERRY_ASSERT (size == utf8_buf_size);
304 
305   return length;
306 } /* lit_utf8_string_length */
307 
308 /**
309  * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string
310  *
311  * @return size of an utf-8 encoded string
312  */
313 lit_utf8_size_t
lit_get_utf8_size_of_cesu8_string(const lit_utf8_byte_t * cesu8_buf_p,lit_utf8_size_t cesu8_buf_size)314 lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
315                                    lit_utf8_size_t cesu8_buf_size) /**< string size */
316 {
317   lit_utf8_size_t offset = 0;
318   lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
319   ecma_char_t prev_ch = 0;
320 
321   while (offset < cesu8_buf_size)
322   {
323     ecma_char_t ch;
324     offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
325 
326     if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
327     {
328       utf8_buf_size -= 2;
329     }
330 
331     prev_ch = ch;
332   }
333 
334   JERRY_ASSERT (offset == cesu8_buf_size);
335 
336   return utf8_buf_size;
337 } /* lit_get_utf8_size_of_cesu8_string */
338 
339 /**
340  * Calculate length of an utf-8 encoded string from cesu-8 encoded string
341  *
342  * @return length of an utf-8 encoded string
343  */
344 ecma_length_t
lit_get_utf8_length_of_cesu8_string(const lit_utf8_byte_t * cesu8_buf_p,lit_utf8_size_t cesu8_buf_size)345 lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
346                                      lit_utf8_size_t cesu8_buf_size) /**< string size */
347 {
348   lit_utf8_size_t offset = 0;
349   ecma_length_t utf8_length = 0;
350   ecma_char_t prev_ch = 0;
351 
352   while (offset < cesu8_buf_size)
353   {
354     ecma_char_t ch;
355     offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);
356 
357     if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch))
358     {
359       utf8_length++;
360     }
361 
362     prev_ch = ch;
363   }
364 
365   JERRY_ASSERT (offset == cesu8_buf_size);
366 
367   return utf8_length;
368 } /* lit_get_utf8_length_of_cesu8_string */
369 
370 /**
371  * Decodes a unicode code point from non-empty utf-8-encoded buffer
372  *
373  * @return number of bytes occupied by code point in the string
374  */
375 lit_utf8_size_t
lit_read_code_point_from_utf8(const lit_utf8_byte_t * buf_p,lit_utf8_size_t buf_size,lit_code_point_t * code_point)376 lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
377                                lit_utf8_size_t buf_size, /**< size of the buffer in bytes */
378                                lit_code_point_t *code_point) /**< [out] code point */
379 {
380   JERRY_ASSERT (buf_p && buf_size);
381 
382   lit_utf8_byte_t c = buf_p[0];
383   if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
384   {
385     *code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
386     return 1;
387   }
388 
389   lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
390   ecma_length_t bytes_count = 0;
391   if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
392   {
393     bytes_count = 2;
394     ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
395   }
396   else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
397   {
398     bytes_count = 3;
399     ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
400   }
401   else
402   {
403     JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
404     bytes_count = 4;
405     ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
406   }
407 
408   JERRY_ASSERT (buf_size >= bytes_count);
409 
410   for (uint32_t i = 1; i < bytes_count; ++i)
411   {
412     ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
413     ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
414   }
415 
416   *code_point = ret;
417   return bytes_count;
418 } /* lit_read_code_point_from_utf8 */
419 
420 /**
421  * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
422  *
423  * @return number of bytes occupied by code point in the string
424  */
425 lit_utf8_size_t
lit_read_code_unit_from_utf8(const lit_utf8_byte_t * buf_p,ecma_char_t * code_point)426 lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
427                               ecma_char_t *code_point) /**< [out] code point */
428 {
429   JERRY_ASSERT (buf_p);
430 
431   lit_utf8_byte_t c = buf_p[0];
432   if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
433   {
434     *code_point = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
435     return 1;
436   }
437 
438   lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
439   ecma_length_t bytes_count;
440   if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
441   {
442     bytes_count = 2;
443     ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
444   }
445   else
446   {
447     JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
448     bytes_count = 3;
449     ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
450   }
451 
452   for (uint32_t i = 1; i < bytes_count; ++i)
453   {
454     ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
455     ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
456   }
457 
458   JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX);
459   *code_point = (ecma_char_t) ret;
460   return bytes_count;
461 } /* lit_read_code_unit_from_utf8 */
462 
463 /**
464  * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
465  *
466  * @return number of bytes occupied by code point in the string
467  */
468 lit_utf8_size_t
lit_read_prev_code_unit_from_utf8(const lit_utf8_byte_t * buf_p,ecma_char_t * code_point)469 lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
470                                    ecma_char_t *code_point) /**< [out] code point */
471 {
472   JERRY_ASSERT (buf_p);
473 
474   lit_utf8_decr (&buf_p);
475   return lit_read_code_unit_from_utf8 (buf_p, code_point);
476 } /* lit_read_prev_code_unit_from_utf8 */
477 
478 /**
479  * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
480  *
481  * @return next code unit
482  */
483 ecma_char_t
lit_cesu8_read_next(const lit_utf8_byte_t ** buf_p)484 lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
485 {
486   JERRY_ASSERT (*buf_p);
487   ecma_char_t ch;
488 
489   *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch);
490 
491   return ch;
492 } /* lit_cesu8_read_next */
493 
494 /**
495  * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
496  *
497  * @return previous code unit
498  */
499 ecma_char_t
lit_cesu8_read_prev(const lit_utf8_byte_t ** buf_p)500 lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
501 {
502   JERRY_ASSERT (*buf_p);
503   ecma_char_t ch;
504 
505   lit_utf8_decr (buf_p);
506   lit_read_code_unit_from_utf8 (*buf_p, &ch);
507 
508   return ch;
509 } /* lit_cesu8_read_prev */
510 
511 /**
512  * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
513  *
514  * @return next code unit
515  */
516 ecma_char_t JERRY_ATTR_NOINLINE
lit_cesu8_peek_next(const lit_utf8_byte_t * buf_p)517 lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
518 {
519   JERRY_ASSERT (buf_p != NULL);
520   ecma_char_t ch;
521 
522   lit_read_code_unit_from_utf8 (buf_p, &ch);
523 
524   return ch;
525 } /* lit_cesu8_peek_next */
526 
527 /**
528  * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
529  *
530  * @return previous code unit
531  */
532 ecma_char_t JERRY_ATTR_NOINLINE
lit_cesu8_peek_prev(const lit_utf8_byte_t * buf_p)533 lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
534 {
535   JERRY_ASSERT (buf_p != NULL);
536   ecma_char_t ch;
537 
538   lit_read_prev_code_unit_from_utf8 (buf_p, &ch);
539 
540   return ch;
541 } /* lit_cesu8_peek_prev */
542 
543 /**
544  * Increase cesu-8 encoded string pointer by one code unit.
545  */
546 inline void JERRY_ATTR_ALWAYS_INLINE
lit_utf8_incr(const lit_utf8_byte_t ** buf_p)547 lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
548 {
549   JERRY_ASSERT (*buf_p);
550 
551   *buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p);
552 } /* lit_utf8_incr */
553 
554 /**
555  * Decrease cesu-8 encoded string pointer by one code unit.
556  */
557 void
lit_utf8_decr(const lit_utf8_byte_t ** buf_p)558 lit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
559 {
560   JERRY_ASSERT (*buf_p);
561   const lit_utf8_byte_t *current_p = *buf_p;
562 
563   do
564   {
565     current_p--;
566   }
567   while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
568 
569   *buf_p = current_p;
570 } /* lit_utf8_decr */
571 
572 /**
573  * Calc hash using the specified hash_basis.
574  *
575  * NOTE:
576  *   This is implementation of FNV-1a hash function, which is released into public domain.
577  *   Constants used, are carefully picked primes by the authors.
578  *   More info: http://www.isthe.com/chongo/tech/comp/fnv/
579  *
580  * @return ecma-string's hash
581  */
582 inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
lit_utf8_string_hash_combine(lit_string_hash_t hash_basis,const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size)583 lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */
584                               const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
585                               lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
586 {
587   JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
588 
589   uint32_t hash = hash_basis;
590 
591   for (uint32_t i = 0; i < utf8_buf_size; i++)
592   {
593     /* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */
594     hash = (hash ^ utf8_buf_p[i]) * 16777619;
595   }
596 
597   return (lit_string_hash_t) hash;
598 } /* lit_utf8_string_hash_combine */
599 
600 /**
601  * Calculate hash from the buffer.
602  *
603  * @return ecma-string's hash
604  */
605 inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
lit_utf8_string_calc_hash(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size)606 lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
607                            lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
608 {
609   JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
610 
611   /* 32 bit offset_basis for FNV = 2166136261 */
612   return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size);
613 } /* lit_utf8_string_calc_hash */
614 
615 /**
616  * Return code unit at the specified position in string
617  *
618  * NOTE:
619  *   code_unit_offset should be less then string's length
620  *
621  * @return code unit value
622  */
623 ecma_char_t
lit_utf8_string_code_unit_at(const lit_utf8_byte_t * utf8_buf_p,lit_utf8_size_t utf8_buf_size,ecma_length_t code_unit_offset)624 lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
625                               lit_utf8_size_t utf8_buf_size, /**< string size in bytes */
626                               ecma_length_t code_unit_offset) /**< ofset of a code_unit */
627 {
628   lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p;
629   ecma_char_t code_unit;
630 
631   do
632   {
633     JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size);
634     current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit);
635   }
636   while (code_unit_offset--);
637 
638   return code_unit;
639 } /* lit_utf8_string_code_unit_at */
640 
641 /**
642  * Get CESU-8 encoded size of character
643  *
644  * @return number of bytes occupied in CESU-8
645  */
646 inline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE
lit_get_unicode_char_size_by_utf8_first_byte(const lit_utf8_byte_t first_byte)647 lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */
648 {
649   if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
650   {
651     return 1;
652   }
653   else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
654   {
655     return 2;
656   }
657   else
658   {
659     JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
660     return 3;
661   }
662 } /* lit_get_unicode_char_size_by_utf8_first_byte */
663 
664 /**
665  * Convert code unit to cesu-8 representation
666  *
667  * @return byte count required to represent the code unit
668  */
669 lit_utf8_size_t
lit_code_unit_to_utf8(ecma_char_t code_unit,lit_utf8_byte_t * buf_p)670 lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
671                        lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size
672                                                 *   should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */
673 {
674   if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
675   {
676     buf_p[0] = (lit_utf8_byte_t) code_unit;
677     return 1;
678   }
679   else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
680   {
681     uint32_t code_unit_bits = code_unit;
682     lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
683     code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
684 
685     lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK);
686     JERRY_ASSERT (first_byte_bits == code_unit_bits);
687 
688     buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
689     buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
690     return 2;
691   }
692   else
693   {
694     uint32_t code_unit_bits = code_unit;
695     lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
696     code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
697 
698     lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
699     code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
700 
701     lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK);
702     JERRY_ASSERT (first_byte_bits == code_unit_bits);
703 
704     buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
705     buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
706     buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
707     return 3;
708   }
709 } /* lit_code_unit_to_utf8 */
710 
711 /**
712  * Convert code point to cesu-8 representation
713  *
714  * @return byte count required to represent the code point
715  */
716 lit_utf8_size_t
lit_code_point_to_cesu8(lit_code_point_t code_point,lit_utf8_byte_t * buf)717 lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
718                          lit_utf8_byte_t *buf) /**< buffer where to store the result,
719                                                 *   its size should be at least 6 bytes */
720 {
721   if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
722   {
723     return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
724   }
725   else
726   {
727     lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf);
728     offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset);
729     return offset;
730   }
731 } /* lit_code_point_to_cesu8 */
732 
733 /**
734  * Convert code point to utf-8 representation
735  *
736  * @return byte count required to represent the code point
737  */
738 lit_utf8_size_t
lit_code_point_to_utf8(lit_code_point_t code_point,lit_utf8_byte_t * buf)739 lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
740                         lit_utf8_byte_t *buf) /**< buffer where to store the result,
741                                               *   its size should be at least 4 bytes */
742 {
743   if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
744   {
745     buf[0] = (lit_utf8_byte_t) code_point;
746     return 1;
747   }
748   else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
749   {
750     uint32_t code_point_bits = code_point;
751     lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
752     code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
753 
754     lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK);
755     JERRY_ASSERT (first_byte_bits == code_point_bits);
756 
757     buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
758     buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
759     return 2;
760   }
761   else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
762   {
763     uint32_t code_point_bits = code_point;
764     lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
765     code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
766 
767     lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
768     code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
769 
770     lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK);
771     JERRY_ASSERT (first_byte_bits == code_point_bits);
772 
773     buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
774     buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
775     buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
776     return 3;
777   }
778   else
779   {
780     JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX);
781 
782     uint32_t code_point_bits = code_point;
783     lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
784     code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
785 
786     lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
787     code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
788 
789     lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
790     code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
791 
792     lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK);
793     JERRY_ASSERT (first_byte_bits == code_point_bits);
794 
795     buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits;
796     buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
797     buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
798     buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits;
799     return 4;
800   }
801 } /* lit_code_point_to_utf8 */
802 
803 /**
804  * Convert cesu-8 string to an utf-8 string and put it into the buffer.
805  * It is the caller's responsibility to make sure that the string fits in the buffer.
806  *
807  * @return number of bytes copied to the buffer.
808  */
809 lit_utf8_size_t
lit_convert_cesu8_string_to_utf8_string(const lit_utf8_byte_t * cesu8_string,lit_utf8_size_t cesu8_size,lit_utf8_byte_t * utf8_string,lit_utf8_size_t utf8_size)810 lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */
811                                          lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */
812                                          lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer
813                                                                         * (can be NULL if buffer_size == 0) */
814                                          lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */
815 {
816   const lit_utf8_byte_t *cesu8_pos = cesu8_string;
817   const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size;
818 
819   lit_utf8_byte_t *utf8_pos = utf8_string;
820   lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size;
821 
822   lit_utf8_size_t size = 0;
823 
824   ecma_char_t prev_ch = 0;
825   lit_utf8_size_t prev_ch_size = 0;
826 
827   while (cesu8_pos < cesu8_end_pos)
828   {
829     ecma_char_t ch;
830     lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
831 
832     if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
833     {
834       JERRY_ASSERT (code_unit_size == prev_ch_size);
835       utf8_pos -= prev_ch_size;
836       lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch);
837       lit_code_point_to_utf8 (code_point, utf8_pos);
838       size++;
839     }
840     else
841     {
842       memcpy (utf8_pos, cesu8_pos, code_unit_size);
843       size += code_unit_size;
844     }
845 
846     utf8_pos = utf8_string + size;
847     cesu8_pos += code_unit_size;
848     prev_ch = ch;
849     prev_ch_size = code_unit_size;
850   }
851 
852   JERRY_ASSERT (cesu8_pos == cesu8_end_pos);
853   JERRY_ASSERT (utf8_pos <= utf8_end_pos);
854 
855   return size;
856 } /* lit_convert_cesu8_string_to_utf8_string */
857 
858 /**
859  * Convert surrogate pair to code point
860  *
861  * @return code point
862  */
863 lit_code_point_t
lit_convert_surrogate_pair_to_code_point(ecma_char_t high_surrogate,ecma_char_t low_surrogate)864 lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */
865                                           ecma_char_t low_surrogate) /**< low surrogate code point */
866 {
867   JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate));
868   JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate));
869 
870   lit_code_point_t code_point;
871   code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
872   code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
873 
874   code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
875 
876   code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
877   return code_point;
878 } /* lit_convert_surrogate_pair_to_code_point */
879 
880 /**
881  * Relational compare of cesu-8 strings
882  *
883  * First string is less than second string if:
884  *  - strings are not equal;
885  *  - first string is prefix of second or is lexicographically less than second.
886  *
887  * @return true - if first string is less than second string,
888  *         false - otherwise
889  */
lit_compare_utf8_strings_relational(const lit_utf8_byte_t * string1_p,lit_utf8_size_t string1_size,const lit_utf8_byte_t * string2_p,lit_utf8_size_t string2_size)890 bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
891                                           lit_utf8_size_t string1_size, /**< string size */
892                                           const lit_utf8_byte_t *string2_p, /**< utf-8 string */
893                                           lit_utf8_size_t string2_size) /**< string size */
894 {
895   lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p;
896   lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p;
897   const lit_utf8_byte_t *string1_end_p = string1_p + string1_size;
898   const lit_utf8_byte_t *string2_end_p = string2_p + string2_size;
899 
900   while (string1_pos < string1_end_p && string2_pos < string2_end_p)
901   {
902     ecma_char_t ch1, ch2;
903     string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1);
904     string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2);
905 
906     if (ch1 < ch2)
907     {
908       return true;
909     }
910     else if (ch1 > ch2)
911     {
912       return false;
913     }
914   }
915 
916   return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
917 } /* lit_compare_utf8_strings_relational */
918