• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright JS Foundation and other contributors, http://js.foundation
2  *
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "config.h"
17 #include "lit-char-helpers.h"
18 #include "lit-unicode-ranges.inc.h"
19 #include "lit-strings.h"
20 
21 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
22 #include "lit-unicode-conversions.inc.h"
23 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
24 
/**
 * Number of elements of a statically sized array.
 *
 * Note:
 *      the argument must be a real array; for a pointer the result is meaningless
 */
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof (*(array)))
26 
27 /**
28  * Binary search algorithm that searches the a
29  * character in the given char array.
30  *
31  * @return true - if the character is in the given array
32  *         false - otherwise
33  */
34 static bool
search_char_in_char_array(ecma_char_t c,const ecma_char_t * array,int size_of_array)35 search_char_in_char_array (ecma_char_t c,               /**< code unit */
36                            const ecma_char_t *array,    /**< array */
37                            int size_of_array)           /**< length of the array */
38 {
39   int bottom = 0;
40   int top = size_of_array - 1;
41 
42   while (bottom <= top)
43   {
44     int middle = (bottom + top) / 2;
45     ecma_char_t current = array[middle];
46 
47     if (current == c)
48     {
49       return true;
50     }
51 
52     if (c < current)
53     {
54       top = middle - 1;
55     }
56     else
57     {
58       bottom = middle + 1;
59     }
60   }
61 
62   return false;
63 } /* search_char_in_char_array */
64 
65 /**
66  * Binary search algorithm that searches a character in the given intervals.
67  * Intervals specifed by two arrays. The first one contains the starting points
68  * of the intervals, the second one contains the length of them.
69  *
70  * @return true - if the the character is included (inclusively) in one of the intervals in the given array
71  *         false - otherwise
72  */
73 static bool
search_char_in_interval_array(ecma_char_t c,const ecma_char_t * array_sp,const uint8_t * lengths,int size_of_array)74 search_char_in_interval_array (ecma_char_t c,               /**< code unit */
75                                const ecma_char_t *array_sp, /**< array of interval starting points */
76                                const uint8_t *lengths,      /**< array of interval lengths */
77                                int size_of_array)           /**< length of the array */
78 {
79   int bottom = 0;
80   int top = size_of_array - 1;
81 
82   while (bottom <= top)
83   {
84     int middle = (bottom + top) / 2;
85     ecma_char_t current_sp = array_sp[middle];
86 
87     if (current_sp <= c && c <= current_sp + lengths[middle])
88     {
89       return true;
90     }
91 
92     if (c > current_sp)
93     {
94       bottom = middle + 1;
95     }
96     else
97     {
98       top = middle - 1;
99     }
100   }
101 
102   return false;
103 } /* search_char_in_interval_array */
104 
105 /**
106  * Check if specified character is one of the Whitespace characters including those that fall into
107  * "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters.
108  *
109  * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
110  *         false - otherwise
111  */
112 bool
lit_char_is_white_space(lit_code_point_t c)113 lit_char_is_white_space (lit_code_point_t c) /**< code point */
114 {
115   if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
116   {
117     return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
118   }
119   else
120   {
121     if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS)
122     {
123       return true;
124     }
125 
126     return (c <= LIT_UTF16_CODE_UNIT_MAX
127             && ((c >= lit_unicode_separator_char_interval_sps[0]
128                  && c < lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0])
129                 || search_char_in_char_array ((ecma_char_t) c,
130                                               lit_unicode_separator_chars,
131                                               NUM_OF_ELEMENTS (lit_unicode_separator_chars))));
132   }
133 } /* lit_char_is_white_space */
134 
135 /**
136  * Check if specified character is one of LineTerminator characters
137  *
138  * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3,
139  *         false - otherwise
140  */
141 bool
lit_char_is_line_terminator(ecma_char_t c)142 lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
143 {
144   return (c == LIT_CHAR_LF
145           || c == LIT_CHAR_CR
146           || c == LIT_CHAR_LS
147           || c == LIT_CHAR_PS);
148 } /* lit_char_is_line_terminator */
149 
150 /**
151  * Check if specified character is a unicode letter
152  *
153  * Note:
154  *      Unicode letter is a character, included into one of the following categories:
155  *       - Uppercase letter (Lu);
156  *       - Lowercase letter (Ll);
157  *       - Titlecase letter (Lt);
158  *       - Modifier letter (Lm);
159  *       - Other letter (Lo);
160  *       - Letter number (Nl).
161  *
162  * See also:
163  *          ECMA-262 v5, 7.6
164  *
165  * @return true - if specified character falls into one of the listed categories,
166  *         false - otherwise
167  */
168 static bool
lit_char_is_unicode_letter(ecma_char_t c)169 lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */
170 {
171   return (search_char_in_interval_array (c,
172                                          lit_unicode_letter_interval_sps,
173                                          lit_unicode_letter_interval_lengths,
174                                          NUM_OF_ELEMENTS (lit_unicode_letter_interval_sps))
175           || search_char_in_char_array (c, lit_unicode_letter_chars, NUM_OF_ELEMENTS (lit_unicode_letter_chars)));
176 } /* lit_char_is_unicode_letter */
177 
178 /**
179  * Check if specified character is a non-letter character and can be used as a
180  * non-first character of an identifier.
181  * These characters coverd by the following unicode categories:
182  *  - digit (Nd)
183  *  - punctuation mark (Mn, Mc)
184  *  - connector punctuation (Pc)
185  *
186  * See also:
187  *          ECMA-262 v5, 7.6
188  *
189  * @return true - if specified character falls into one of the listed categories,
190  *         false - otherwise
191  */
192 static bool
lit_char_is_unicode_non_letter_ident_part(ecma_char_t c)193 lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */
194 {
195   return (search_char_in_interval_array (c,
196                                          lit_unicode_non_letter_ident_part_interval_sps,
197                                          lit_unicode_non_letter_ident_part_interval_lengths,
198                                          NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_interval_sps))
199           || search_char_in_char_array (c,
200                                         lit_unicode_non_letter_ident_part_chars,
201                                         NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_chars)));
202 } /* lit_char_is_unicode_non_letter_ident_part */
203 
204 /**
205  * Checks whether the character is a valid identifier start.
206  *
207  * @return true if it is.
208  */
209 bool
lit_code_point_is_identifier_start(lit_code_point_t code_point)210 lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point */
211 {
212   /* Fast path for ASCII-defined letters. */
213   if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
214   {
215     return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
216              && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
217             || code_point == LIT_CHAR_DOLLAR_SIGN
218             || code_point == LIT_CHAR_UNDERSCORE);
219   }
220 
221 #if ENABLED (JERRY_ES2015)
222   if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
223   {
224     /* TODO: detect these ranges correctly. */
225     return (code_point >= 0x10C80 && code_point <= 0x10CF2);
226   }
227 #else /* !ENABLED (JERRY_ES2015) */
228   JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
229 #endif /* ENABLED (JERRY_ES2015) */
230 
231   return lit_char_is_unicode_letter ((ecma_char_t) code_point);
232 } /* lit_code_point_is_identifier_start */
233 
234 /**
235  * Checks whether the character is a valid identifier part.
236  *
237  * @return true if it is.
238  */
239 bool
lit_code_point_is_identifier_part(lit_code_point_t code_point)240 lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point */
241 {
242   /* Fast path for ASCII-defined letters. */
243   if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
244   {
245     return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
246              && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
247             || (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9)
248             || code_point == LIT_CHAR_DOLLAR_SIGN
249             || code_point == LIT_CHAR_UNDERSCORE);
250   }
251 
252 #if ENABLED (JERRY_ES2015)
253   if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
254   {
255     /* TODO: detect these ranges correctly. */
256     return (code_point >= 0x10C80 && code_point <= 0x10CF2);
257   }
258 #else /* !ENABLED (JERRY_ES2015) */
259   JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
260 #endif /* ENABLED (JERRY_ES2015) */
261 
262   return (lit_char_is_unicode_letter ((ecma_char_t) code_point)
263           || lit_char_is_unicode_non_letter_ident_part ((ecma_char_t) code_point));
264 } /* lit_code_point_is_identifier_part */
265 
266 /**
267  * Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2)
268  *
269  * @return true / false
270  */
271 bool
lit_char_is_octal_digit(ecma_char_t c)272 lit_char_is_octal_digit (ecma_char_t c) /**< code unit */
273 {
274   return (c >= LIT_CHAR_ASCII_OCTAL_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_OCTAL_DIGITS_END);
275 } /* lit_char_is_octal_digit */
276 
277 /**
278  * Check if specified character is one of DecimalDigit characters (ECMA-262 v5, 7.8.3)
279  *
280  * @return true / false
281  */
282 bool
lit_char_is_decimal_digit(ecma_char_t c)283 lit_char_is_decimal_digit (ecma_char_t c) /**< code unit */
284 {
285   return (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END);
286 } /* lit_char_is_decimal_digit */
287 
288 /**
289  * Check if specified character is one of HexDigit characters (ECMA-262 v5, 7.8.3)
290  *
291  * @return true / false
292  */
293 bool
lit_char_is_hex_digit(ecma_char_t c)294 lit_char_is_hex_digit (ecma_char_t c) /**< code unit */
295 {
296   return ((c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
297           || (LEXER_TO_ASCII_LOWERCASE (c) >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
298               && LEXER_TO_ASCII_LOWERCASE (c) <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END));
299 } /* lit_char_is_hex_digit */
300 
301 #if ENABLED (JERRY_ES2015)
302 /**
303  * Check if specified character is one of BinaryDigits characters (ECMA-262 v6, 11.8.3)
304  *
305  * @return true / false
306  */
307 bool
lit_char_is_binary_digit(ecma_char_t c)308 lit_char_is_binary_digit (ecma_char_t c) /** code unit */
309 {
310   return (c == LIT_CHAR_0 || c == LIT_CHAR_1);
311 } /* lit_char_is_binary_digit */
312 #endif /* ENABLED (JERRY_ES2015) */
313 
314 /**
315  * Convert a HexDigit character to its numeric value, as defined in ECMA-262 v5, 7.8.3
316  *
317  * @return digit value, corresponding to the hex char
318  */
319 uint32_t
lit_char_hex_to_int(ecma_char_t c)320 lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
321                                      *    one of HexDigit characters */
322 {
323   JERRY_ASSERT (lit_char_is_hex_digit (c));
324 
325   if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
326   {
327     return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN);
328   }
329   else if (c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
330   {
331     return (uint32_t) (c - LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + 10);
332   }
333   else
334   {
335     return (uint32_t) (c - LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + 10);
336   }
337 } /* lit_char_hex_to_int */
338 
339 /**
340  * Converts a character to UTF8 bytes.
341  *
342  * @return length of the UTF8 representation.
343  */
344 size_t
lit_code_point_to_cesu8_bytes(uint8_t * dst_p,lit_code_point_t code_point)345 lit_code_point_to_cesu8_bytes (uint8_t *dst_p, /**< destination buffer */
346                                lit_code_point_t code_point) /**< code point */
347 {
348   if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
349   {
350     /* 00000000 0xxxxxxx -> 0xxxxxxx */
351     dst_p[0] = (uint8_t) code_point;
352     return 1;
353   }
354 
355   if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
356   {
357     /* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */
358     dst_p[0] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_5_BITS_MASK));
359     dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
360     return 2;
361   }
362 
363   if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
364   {
365     /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */
366     dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((code_point >> 12) & LIT_UTF8_LAST_4_BITS_MASK));
367     dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_6_BITS_MASK));
368     dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
369     return 3;
370   }
371 
372   JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
373 
374   code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN;
375 
376   dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
377   dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x20 | ((code_point >> 16) & LIT_UTF8_LAST_4_BITS_MASK));
378   dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 10) & LIT_UTF8_LAST_6_BITS_MASK));
379 
380   dst_p[3] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
381   dst_p[4] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x30 | ((code_point >> 6) & LIT_UTF8_LAST_4_BITS_MASK));
382   dst_p[5] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
383 
384   return 3 * 2;
385 } /* lit_code_point_to_cesu8_bytes */
386 
387 /**
388  * Returns the length of the UTF8 representation of a character.
389  *
390  * @return length of the UTF8 representation.
391  */
392 size_t
lit_code_point_get_cesu8_length(lit_code_point_t code_point)393 lit_code_point_get_cesu8_length (lit_code_point_t code_point) /**< code point */
394 {
395   if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
396   {
397     /* 00000000 0xxxxxxx */
398     return 1;
399   }
400 
401   if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
402   {
403     /* 00000yyy yyxxxxxx */
404     return 2;
405   }
406 
407   if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
408   {
409     /* zzzzyyyy yyxxxxxx */
410     return 3;
411   }
412 
413   /* high + low surrogate */
414   return 2 * 3;
415 } /* lit_code_point_get_cesu8_length */
416 
417 /**
418  * Convert a four byte long utf8 character to two three byte long cesu8 characters
419  */
420 void
lit_four_byte_utf8_char_to_cesu8(uint8_t * dst_p,const uint8_t * source_p)421 lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */
422                                   const uint8_t *source_p) /**< source buffer */
423 {
424   lit_code_point_t code_point = ((((uint32_t) source_p[0]) & LIT_UTF8_LAST_3_BITS_MASK) << 18);
425   code_point |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
426   code_point |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
427   code_point |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK);
428 
429   lit_code_point_to_cesu8_bytes (dst_p, code_point);
430 } /* lit_four_byte_utf8_char_to_cesu8 */
431 
432 /**
433  * Lookup hex digits in a buffer
434  *
435  * @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number
436  *         value of hex number, otherwise
437  */
438 uint32_t
lit_char_hex_lookup(const lit_utf8_byte_t * buf_p,const lit_utf8_byte_t * const buf_end_p,uint32_t lookup)439 lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */
440                      const lit_utf8_byte_t *const buf_end_p, /**< buffer end */
441                      uint32_t lookup) /**< size of lookup */
442 {
443   JERRY_ASSERT (lookup <= 4);
444 
445   if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p))
446   {
447     return UINT32_MAX;
448   }
449 
450   uint32_t value = 0;
451 
452   while (lookup--)
453   {
454     lit_utf8_byte_t ch = *buf_p++;
455     if (!lit_char_is_hex_digit (ch))
456     {
457       return UINT32_MAX;
458     }
459 
460     value <<= 4;
461     value += lit_char_hex_to_int (ch);
462   }
463 
464   JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX);
465   return value;
466 } /* lit_char_hex_lookup */
467 
468 /**
469  * Parse a decimal number with the value clamped to UINT32_MAX.
470  *
471  * @returns uint32_t number
472  */
473 uint32_t
lit_parse_decimal(const lit_utf8_byte_t ** buffer_p,const lit_utf8_byte_t * buffer_end_p)474 lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */
475                    const lit_utf8_byte_t *buffer_end_p) /**< buffer end */
476 {
477   const lit_utf8_byte_t *current_p = *buffer_p;
478   JERRY_ASSERT (lit_char_is_decimal_digit (*current_p));
479 
480   uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0);
481 
482   while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p))
483   {
484     const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0);
485     uint32_t new_value = value * 10 + digit;
486 
487     if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value))
488     {
489       value = UINT32_MAX;
490       continue;
491     }
492 
493     value = new_value;
494   }
495 
496   *buffer_p = current_p;
497   return value;
498 } /* lit_parse_decimal */
499 
500 /**
501  * Check if specified character is a word character (part of IsWordChar abstract operation)
502  *
503  * See also: ECMA-262 v5, 15.10.2.6 (IsWordChar)
504  *
505  * @return true - if the character is a word character
506  *         false - otherwise
507  */
508 bool
lit_char_is_word_char(lit_code_point_t c)509 lit_char_is_word_char (lit_code_point_t c) /**< code point */
510 {
511   return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
512           || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
513           || (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
514           || c == LIT_CHAR_UNDERSCORE);
515 } /* lit_char_is_word_char */
516 
517 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
518 
519 /**
520  * Check if the specified character is in one of those tables which contain bidirectional conversions.
521  *
522  * @return the mapped character sequence of an ecma character, if it's in the table.
523  *         0 - otherwise.
524  */
525 static ecma_length_t
search_in_bidirectional_conversion_tables(ecma_char_t character,ecma_char_t * output_buffer_p,bool is_lowercase)526 search_in_bidirectional_conversion_tables (ecma_char_t character,        /**< code unit */
527                                            ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
528                                            bool is_lowercase)            /**< is lowercase conversion */
529 {
530   /* 1, Check if the specified character is part of the lit_character_case_ranges table. */
531   int number_of_case_ranges = NUM_OF_ELEMENTS (lit_character_case_ranges);
532   int conv_counter = 0;
533 
534   for (int i = 0; i < number_of_case_ranges; i++)
535   {
536     if (i % 2 == 0 && i > 0)
537     {
538       conv_counter++;
539     }
540 
541     int range_length = lit_character_case_range_lengths[conv_counter];
542     ecma_char_t start_point = lit_character_case_ranges[i];
543 
544     if (start_point > character || character >= start_point + range_length)
545     {
546       continue;
547     }
548 
549     int char_dist = character - start_point;
550 
551     if (i % 2 == 0)
552     {
553       output_buffer_p[0] = is_lowercase ? (ecma_char_t) (lit_character_case_ranges[i + 1] + char_dist) : character;
554     }
555     else
556     {
557       output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (lit_character_case_ranges[i - 1] + char_dist);
558     }
559 
560     return 1;
561   }
562 
563   /* 2, Check if the specified character is part of the character_pair_ranges table. */
564   int bottom = 0;
565   int top = NUM_OF_ELEMENTS (lit_character_pair_ranges) - 1;
566 
567   while (bottom <= top)
568   {
569     int middle = (bottom + top) / 2;
570     ecma_char_t current_sp = lit_character_pair_ranges[middle];
571 
572     if (current_sp <= character && character < current_sp + lit_character_pair_range_lengths[middle])
573     {
574       int char_dist = character - current_sp;
575 
576       if ((character - current_sp) % 2 == 0)
577       {
578         output_buffer_p[0] = is_lowercase ? (ecma_char_t) (current_sp + char_dist + 1) : character;
579       }
580       else
581       {
582         output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (current_sp + char_dist - 1);
583       }
584 
585       return 1;
586     }
587 
588     if (character > current_sp)
589     {
590       bottom = middle + 1;
591     }
592     else
593     {
594       top = middle - 1;
595     }
596   }
597 
598   /* 3, Check if the specified character is part of the character_pairs table. */
599   int number_of_character_pairs = NUM_OF_ELEMENTS (lit_character_pairs);
600 
601   for (int i = 0; i < number_of_character_pairs; i++)
602   {
603     if (character != lit_character_pairs[i])
604     {
605       continue;
606     }
607 
608     if (i % 2 == 0)
609     {
610       output_buffer_p[0] = is_lowercase ? lit_character_pairs[i + 1] : character;
611     }
612     else
613     {
614       output_buffer_p[0] = is_lowercase ? character : lit_character_pairs[i - 1];
615     }
616 
617     return 1;
618   }
619 
620   return 0;
621 } /* search_in_bidirectional_conversion_tables */
622 
623 /**
624  * Check if the specified character is in the given conversion table.
625  *
626  * @return the mapped character sequence of an ecma character, if it's in the table.
627  *         0 - otherwise.
628  */
629 static ecma_length_t
search_in_conversion_table(ecma_char_t character,ecma_char_t * output_buffer_p,const ecma_char_t * array,const uint8_t * counters)630 search_in_conversion_table (ecma_char_t character,        /**< code unit */
631                             ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
632                             const ecma_char_t *array,     /**< array */
633                             const uint8_t *counters)      /**< case_values counter */
634 {
635   int end_point = 0;
636 
637   for (int i = 0; i < 3; i++)
638   {
639     int start_point = end_point;
640     int size_of_case_value = i + 1;
641     end_point += counters[i] * (size_of_case_value + 1);
642 
643     int bottom = start_point;
644     int top = end_point - size_of_case_value;
645 
646     while (bottom <= top)
647     {
648       int middle = (bottom + top) / 2;
649 
650       middle -= ((middle - bottom) % (size_of_case_value + 1));
651 
652       ecma_char_t current = array[middle];
653 
654       if (current == character)
655       {
656         ecma_length_t char_sequence = 1;
657 
658         switch (size_of_case_value)
659         {
660           case 3:
661           {
662             output_buffer_p[2] = array[middle + 3];
663             char_sequence++;
664             /* FALLTHRU */
665           }
666           case 2:
667           {
668             output_buffer_p[1] = array[middle + 2];
669             char_sequence++;
670             /* FALLTHRU */
671           }
672           default:
673           {
674             output_buffer_p[0] = array[middle + 1];
675             return char_sequence;
676           }
677         }
678       }
679 
680       if (character < current)
681       {
682         top = middle - (size_of_case_value + 1);
683       }
684       else
685       {
686         bottom = middle + (size_of_case_value + 1);
687       }
688     }
689   }
690 
691   return 0;
692 } /* search_in_conversion_table */
693 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
694 
695 /**
696  * Returns the lowercase character sequence of an ecma character.
697  *
698  * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
699  *
700  * @return the length of the lowercase character sequence
701  *         which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
702  */
703 ecma_length_t
lit_char_to_lower_case(ecma_char_t character,ecma_char_t * output_buffer_p,ecma_length_t buffer_size)704 lit_char_to_lower_case (ecma_char_t character, /**< input character value */
705                         ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
706                         ecma_length_t buffer_size) /**< buffer size */
707 {
708   JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
709 
710   if (character >= LIT_CHAR_UPPERCASE_A && character <= LIT_CHAR_UPPERCASE_Z)
711   {
712     output_buffer_p[0] = (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
713     return 1;
714   }
715 
716 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
717 
718   ecma_length_t lowercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, true);
719 
720   if (lowercase_sequence != 0)
721   {
722     return lowercase_sequence;
723   }
724 
725   int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_lower_case_ranges);
726 
727   for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++)
728   {
729     int range_length = lit_lower_case_range_lengths[j] - 1;
730     ecma_char_t start_point = lit_lower_case_ranges[i];
731 
732     if (start_point <= character && character <= start_point + range_length)
733     {
734       output_buffer_p[0] = (ecma_char_t) (lit_lower_case_ranges[i + 1] + (character - start_point));
735       return 1;
736     }
737   }
738 
739   lowercase_sequence = search_in_conversion_table (character,
740                                                    output_buffer_p,
741                                                    lit_lower_case_conversions,
742                                                    lit_lower_case_conversion_counters);
743 
744   if (lowercase_sequence != 0)
745   {
746     return lowercase_sequence;
747   }
748 
749 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
750 
751   output_buffer_p[0] = character;
752   return 1;
753 } /* lit_char_to_lower_case */
754 
755 /**
756  * Returns the uppercase character sequence of an ecma character.
757  *
758  * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
759  *
760  * @return the length of the uppercase character sequence
761  *         which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
762  */
763 ecma_length_t
lit_char_to_upper_case(ecma_char_t character,ecma_char_t * output_buffer_p,ecma_length_t buffer_size)764 lit_char_to_upper_case (ecma_char_t character, /**< input character value */
765                         ecma_char_t *output_buffer_p, /**< buffer for the result characters */
766                         ecma_length_t buffer_size) /**< buffer size */
767 {
768   JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
769 
770   if (character >= LIT_CHAR_LOWERCASE_A && character <= LIT_CHAR_LOWERCASE_Z)
771   {
772     output_buffer_p[0] = (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
773     return 1;
774   }
775 
776 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
777 
778   ecma_length_t uppercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, false);
779 
780   if (uppercase_sequence != 0)
781   {
782     return uppercase_sequence;
783   }
784 
785   int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_upper_case_special_ranges);
786 
787   for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++)
788   {
789     int range_length = lit_upper_case_special_range_lengths[j];
790     ecma_char_t start_point = lit_upper_case_special_ranges[i];
791 
792     if (start_point <= character && character <= start_point + range_length)
793     {
794       output_buffer_p[0] = (ecma_char_t) (lit_upper_case_special_ranges[i + 1] + (character - start_point));
795       output_buffer_p[1] = (ecma_char_t) (lit_upper_case_special_ranges[i + 2]);
796       return 2;
797     }
798   }
799 
800   uppercase_sequence = search_in_conversion_table (character,
801                                                    output_buffer_p,
802                                                    lit_upper_case_conversions,
803                                                    lit_upper_case_conversion_counters);
804 
805   if (uppercase_sequence != 0)
806   {
807     return uppercase_sequence;
808   }
809 
810 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
811 
812   output_buffer_p[0] = character;
813   return 1;
814 } /* lit_char_to_upper_case */
815