1 /* Copyright JS Foundation and other contributors, http://js.foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "config.h"
17 #include "lit-char-helpers.h"
18 #include "lit-unicode-ranges.inc.h"
19 #include "lit-strings.h"
20
21 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
22 #include "lit-unicode-conversions.inc.h"
23 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
24
/**
 * Number of elements in an array.
 *
 * Note: the argument must be a true array (not a pointer), since the
 *       count is derived from the size of the whole object.
 */
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof (*(array)))
26
27 /**
28 * Binary search algorithm that searches the a
29 * character in the given char array.
30 *
31 * @return true - if the character is in the given array
32 * false - otherwise
33 */
34 static bool
search_char_in_char_array(ecma_char_t c,const ecma_char_t * array,int size_of_array)35 search_char_in_char_array (ecma_char_t c, /**< code unit */
36 const ecma_char_t *array, /**< array */
37 int size_of_array) /**< length of the array */
38 {
39 int bottom = 0;
40 int top = size_of_array - 1;
41
42 while (bottom <= top)
43 {
44 int middle = (bottom + top) / 2;
45 ecma_char_t current = array[middle];
46
47 if (current == c)
48 {
49 return true;
50 }
51
52 if (c < current)
53 {
54 top = middle - 1;
55 }
56 else
57 {
58 bottom = middle + 1;
59 }
60 }
61
62 return false;
63 } /* search_char_in_char_array */
64
65 /**
66 * Binary search algorithm that searches a character in the given intervals.
67 * Intervals specifed by two arrays. The first one contains the starting points
68 * of the intervals, the second one contains the length of them.
69 *
70 * @return true - if the the character is included (inclusively) in one of the intervals in the given array
71 * false - otherwise
72 */
73 static bool
search_char_in_interval_array(ecma_char_t c,const ecma_char_t * array_sp,const uint8_t * lengths,int size_of_array)74 search_char_in_interval_array (ecma_char_t c, /**< code unit */
75 const ecma_char_t *array_sp, /**< array of interval starting points */
76 const uint8_t *lengths, /**< array of interval lengths */
77 int size_of_array) /**< length of the array */
78 {
79 int bottom = 0;
80 int top = size_of_array - 1;
81
82 while (bottom <= top)
83 {
84 int middle = (bottom + top) / 2;
85 ecma_char_t current_sp = array_sp[middle];
86
87 if (current_sp <= c && c <= current_sp + lengths[middle])
88 {
89 return true;
90 }
91
92 if (c > current_sp)
93 {
94 bottom = middle + 1;
95 }
96 else
97 {
98 top = middle - 1;
99 }
100 }
101
102 return false;
103 } /* search_char_in_interval_array */
104
105 /**
106 * Check if specified character is one of the Whitespace characters including those that fall into
107 * "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters.
108 *
109 * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
110 * false - otherwise
111 */
112 bool
lit_char_is_white_space(lit_code_point_t c)113 lit_char_is_white_space (lit_code_point_t c) /**< code point */
114 {
115 if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
116 {
117 return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
118 }
119 else
120 {
121 if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS)
122 {
123 return true;
124 }
125
126 return (c <= LIT_UTF16_CODE_UNIT_MAX
127 && ((c >= lit_unicode_separator_char_interval_sps[0]
128 && c < lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0])
129 || search_char_in_char_array ((ecma_char_t) c,
130 lit_unicode_separator_chars,
131 NUM_OF_ELEMENTS (lit_unicode_separator_chars))));
132 }
133 } /* lit_char_is_white_space */
134
135 /**
136 * Check if specified character is one of LineTerminator characters
137 *
138 * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3,
139 * false - otherwise
140 */
141 bool
lit_char_is_line_terminator(ecma_char_t c)142 lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
143 {
144 return (c == LIT_CHAR_LF
145 || c == LIT_CHAR_CR
146 || c == LIT_CHAR_LS
147 || c == LIT_CHAR_PS);
148 } /* lit_char_is_line_terminator */
149
150 /**
151 * Check if specified character is a unicode letter
152 *
153 * Note:
154 * Unicode letter is a character, included into one of the following categories:
155 * - Uppercase letter (Lu);
156 * - Lowercase letter (Ll);
157 * - Titlecase letter (Lt);
158 * - Modifier letter (Lm);
159 * - Other letter (Lo);
160 * - Letter number (Nl).
161 *
162 * See also:
163 * ECMA-262 v5, 7.6
164 *
165 * @return true - if specified character falls into one of the listed categories,
166 * false - otherwise
167 */
168 static bool
lit_char_is_unicode_letter(ecma_char_t c)169 lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */
170 {
171 return (search_char_in_interval_array (c,
172 lit_unicode_letter_interval_sps,
173 lit_unicode_letter_interval_lengths,
174 NUM_OF_ELEMENTS (lit_unicode_letter_interval_sps))
175 || search_char_in_char_array (c, lit_unicode_letter_chars, NUM_OF_ELEMENTS (lit_unicode_letter_chars)));
176 } /* lit_char_is_unicode_letter */
177
178 /**
179 * Check if specified character is a non-letter character and can be used as a
180 * non-first character of an identifier.
181 * These characters coverd by the following unicode categories:
182 * - digit (Nd)
183 * - punctuation mark (Mn, Mc)
184 * - connector punctuation (Pc)
185 *
186 * See also:
187 * ECMA-262 v5, 7.6
188 *
189 * @return true - if specified character falls into one of the listed categories,
190 * false - otherwise
191 */
192 static bool
lit_char_is_unicode_non_letter_ident_part(ecma_char_t c)193 lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */
194 {
195 return (search_char_in_interval_array (c,
196 lit_unicode_non_letter_ident_part_interval_sps,
197 lit_unicode_non_letter_ident_part_interval_lengths,
198 NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_interval_sps))
199 || search_char_in_char_array (c,
200 lit_unicode_non_letter_ident_part_chars,
201 NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_chars)));
202 } /* lit_char_is_unicode_non_letter_ident_part */
203
204 /**
205 * Checks whether the character is a valid identifier start.
206 *
207 * @return true if it is.
208 */
209 bool
lit_code_point_is_identifier_start(lit_code_point_t code_point)210 lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point */
211 {
212 /* Fast path for ASCII-defined letters. */
213 if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
214 {
215 return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
216 && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
217 || code_point == LIT_CHAR_DOLLAR_SIGN
218 || code_point == LIT_CHAR_UNDERSCORE);
219 }
220
221 #if ENABLED (JERRY_ES2015)
222 if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
223 {
224 /* TODO: detect these ranges correctly. */
225 return (code_point >= 0x10C80 && code_point <= 0x10CF2);
226 }
227 #else /* !ENABLED (JERRY_ES2015) */
228 JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
229 #endif /* ENABLED (JERRY_ES2015) */
230
231 return lit_char_is_unicode_letter ((ecma_char_t) code_point);
232 } /* lit_code_point_is_identifier_start */
233
234 /**
235 * Checks whether the character is a valid identifier part.
236 *
237 * @return true if it is.
238 */
239 bool
lit_code_point_is_identifier_part(lit_code_point_t code_point)240 lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point */
241 {
242 /* Fast path for ASCII-defined letters. */
243 if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
244 {
245 return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
246 && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
247 || (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9)
248 || code_point == LIT_CHAR_DOLLAR_SIGN
249 || code_point == LIT_CHAR_UNDERSCORE);
250 }
251
252 #if ENABLED (JERRY_ES2015)
253 if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
254 {
255 /* TODO: detect these ranges correctly. */
256 return (code_point >= 0x10C80 && code_point <= 0x10CF2);
257 }
258 #else /* !ENABLED (JERRY_ES2015) */
259 JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
260 #endif /* ENABLED (JERRY_ES2015) */
261
262 return (lit_char_is_unicode_letter ((ecma_char_t) code_point)
263 || lit_char_is_unicode_non_letter_ident_part ((ecma_char_t) code_point));
264 } /* lit_code_point_is_identifier_part */
265
266 /**
267 * Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2)
268 *
269 * @return true / false
270 */
271 bool
lit_char_is_octal_digit(ecma_char_t c)272 lit_char_is_octal_digit (ecma_char_t c) /**< code unit */
273 {
274 return (c >= LIT_CHAR_ASCII_OCTAL_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_OCTAL_DIGITS_END);
275 } /* lit_char_is_octal_digit */
276
277 /**
278 * Check if specified character is one of DecimalDigit characters (ECMA-262 v5, 7.8.3)
279 *
280 * @return true / false
281 */
282 bool
lit_char_is_decimal_digit(ecma_char_t c)283 lit_char_is_decimal_digit (ecma_char_t c) /**< code unit */
284 {
285 return (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END);
286 } /* lit_char_is_decimal_digit */
287
288 /**
289 * Check if specified character is one of HexDigit characters (ECMA-262 v5, 7.8.3)
290 *
291 * @return true / false
292 */
293 bool
lit_char_is_hex_digit(ecma_char_t c)294 lit_char_is_hex_digit (ecma_char_t c) /**< code unit */
295 {
296 return ((c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
297 || (LEXER_TO_ASCII_LOWERCASE (c) >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
298 && LEXER_TO_ASCII_LOWERCASE (c) <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END));
299 } /* lit_char_is_hex_digit */
300
301 #if ENABLED (JERRY_ES2015)
302 /**
303 * Check if specified character is one of BinaryDigits characters (ECMA-262 v6, 11.8.3)
304 *
305 * @return true / false
306 */
307 bool
lit_char_is_binary_digit(ecma_char_t c)308 lit_char_is_binary_digit (ecma_char_t c) /** code unit */
309 {
310 return (c == LIT_CHAR_0 || c == LIT_CHAR_1);
311 } /* lit_char_is_binary_digit */
312 #endif /* ENABLED (JERRY_ES2015) */
313
314 /**
315 * Convert a HexDigit character to its numeric value, as defined in ECMA-262 v5, 7.8.3
316 *
317 * @return digit value, corresponding to the hex char
318 */
319 uint32_t
lit_char_hex_to_int(ecma_char_t c)320 lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
321 * one of HexDigit characters */
322 {
323 JERRY_ASSERT (lit_char_is_hex_digit (c));
324
325 if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
326 {
327 return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN);
328 }
329 else if (c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
330 {
331 return (uint32_t) (c - LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + 10);
332 }
333 else
334 {
335 return (uint32_t) (c - LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + 10);
336 }
337 } /* lit_char_hex_to_int */
338
339 /**
340 * Converts a character to UTF8 bytes.
341 *
342 * @return length of the UTF8 representation.
343 */
344 size_t
lit_code_point_to_cesu8_bytes(uint8_t * dst_p,lit_code_point_t code_point)345 lit_code_point_to_cesu8_bytes (uint8_t *dst_p, /**< destination buffer */
346 lit_code_point_t code_point) /**< code point */
347 {
348 if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
349 {
350 /* 00000000 0xxxxxxx -> 0xxxxxxx */
351 dst_p[0] = (uint8_t) code_point;
352 return 1;
353 }
354
355 if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
356 {
357 /* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */
358 dst_p[0] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_5_BITS_MASK));
359 dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
360 return 2;
361 }
362
363 if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
364 {
365 /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */
366 dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((code_point >> 12) & LIT_UTF8_LAST_4_BITS_MASK));
367 dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_6_BITS_MASK));
368 dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
369 return 3;
370 }
371
372 JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
373
374 code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN;
375
376 dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
377 dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x20 | ((code_point >> 16) & LIT_UTF8_LAST_4_BITS_MASK));
378 dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 10) & LIT_UTF8_LAST_6_BITS_MASK));
379
380 dst_p[3] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
381 dst_p[4] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x30 | ((code_point >> 6) & LIT_UTF8_LAST_4_BITS_MASK));
382 dst_p[5] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
383
384 return 3 * 2;
385 } /* lit_code_point_to_cesu8_bytes */
386
387 /**
388 * Returns the length of the UTF8 representation of a character.
389 *
390 * @return length of the UTF8 representation.
391 */
392 size_t
lit_code_point_get_cesu8_length(lit_code_point_t code_point)393 lit_code_point_get_cesu8_length (lit_code_point_t code_point) /**< code point */
394 {
395 if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
396 {
397 /* 00000000 0xxxxxxx */
398 return 1;
399 }
400
401 if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
402 {
403 /* 00000yyy yyxxxxxx */
404 return 2;
405 }
406
407 if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
408 {
409 /* zzzzyyyy yyxxxxxx */
410 return 3;
411 }
412
413 /* high + low surrogate */
414 return 2 * 3;
415 } /* lit_code_point_get_cesu8_length */
416
417 /**
418 * Convert a four byte long utf8 character to two three byte long cesu8 characters
419 */
420 void
lit_four_byte_utf8_char_to_cesu8(uint8_t * dst_p,const uint8_t * source_p)421 lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */
422 const uint8_t *source_p) /**< source buffer */
423 {
424 lit_code_point_t code_point = ((((uint32_t) source_p[0]) & LIT_UTF8_LAST_3_BITS_MASK) << 18);
425 code_point |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
426 code_point |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
427 code_point |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK);
428
429 lit_code_point_to_cesu8_bytes (dst_p, code_point);
430 } /* lit_four_byte_utf8_char_to_cesu8 */
431
432 /**
433 * Lookup hex digits in a buffer
434 *
435 * @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number
436 * value of hex number, otherwise
437 */
438 uint32_t
lit_char_hex_lookup(const lit_utf8_byte_t * buf_p,const lit_utf8_byte_t * const buf_end_p,uint32_t lookup)439 lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */
440 const lit_utf8_byte_t *const buf_end_p, /**< buffer end */
441 uint32_t lookup) /**< size of lookup */
442 {
443 JERRY_ASSERT (lookup <= 4);
444
445 if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p))
446 {
447 return UINT32_MAX;
448 }
449
450 uint32_t value = 0;
451
452 while (lookup--)
453 {
454 lit_utf8_byte_t ch = *buf_p++;
455 if (!lit_char_is_hex_digit (ch))
456 {
457 return UINT32_MAX;
458 }
459
460 value <<= 4;
461 value += lit_char_hex_to_int (ch);
462 }
463
464 JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX);
465 return value;
466 } /* lit_char_hex_lookup */
467
468 /**
469 * Parse a decimal number with the value clamped to UINT32_MAX.
470 *
471 * @returns uint32_t number
472 */
473 uint32_t
lit_parse_decimal(const lit_utf8_byte_t ** buffer_p,const lit_utf8_byte_t * buffer_end_p)474 lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */
475 const lit_utf8_byte_t *buffer_end_p) /**< buffer end */
476 {
477 const lit_utf8_byte_t *current_p = *buffer_p;
478 JERRY_ASSERT (lit_char_is_decimal_digit (*current_p));
479
480 uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0);
481
482 while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p))
483 {
484 const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0);
485 uint32_t new_value = value * 10 + digit;
486
487 if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value))
488 {
489 value = UINT32_MAX;
490 continue;
491 }
492
493 value = new_value;
494 }
495
496 *buffer_p = current_p;
497 return value;
498 } /* lit_parse_decimal */
499
500 /**
501 * Check if specified character is a word character (part of IsWordChar abstract operation)
502 *
503 * See also: ECMA-262 v5, 15.10.2.6 (IsWordChar)
504 *
505 * @return true - if the character is a word character
506 * false - otherwise
507 */
508 bool
lit_char_is_word_char(lit_code_point_t c)509 lit_char_is_word_char (lit_code_point_t c) /**< code point */
510 {
511 return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
512 || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
513 || (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
514 || c == LIT_CHAR_UNDERSCORE);
515 } /* lit_char_is_word_char */
516
517 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
518
519 /**
520 * Check if the specified character is in one of those tables which contain bidirectional conversions.
521 *
522 * @return the mapped character sequence of an ecma character, if it's in the table.
523 * 0 - otherwise.
524 */
525 static ecma_length_t
search_in_bidirectional_conversion_tables(ecma_char_t character,ecma_char_t * output_buffer_p,bool is_lowercase)526 search_in_bidirectional_conversion_tables (ecma_char_t character, /**< code unit */
527 ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
528 bool is_lowercase) /**< is lowercase conversion */
529 {
530 /* 1, Check if the specified character is part of the lit_character_case_ranges table. */
531 int number_of_case_ranges = NUM_OF_ELEMENTS (lit_character_case_ranges);
532 int conv_counter = 0;
533
534 for (int i = 0; i < number_of_case_ranges; i++)
535 {
536 if (i % 2 == 0 && i > 0)
537 {
538 conv_counter++;
539 }
540
541 int range_length = lit_character_case_range_lengths[conv_counter];
542 ecma_char_t start_point = lit_character_case_ranges[i];
543
544 if (start_point > character || character >= start_point + range_length)
545 {
546 continue;
547 }
548
549 int char_dist = character - start_point;
550
551 if (i % 2 == 0)
552 {
553 output_buffer_p[0] = is_lowercase ? (ecma_char_t) (lit_character_case_ranges[i + 1] + char_dist) : character;
554 }
555 else
556 {
557 output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (lit_character_case_ranges[i - 1] + char_dist);
558 }
559
560 return 1;
561 }
562
563 /* 2, Check if the specified character is part of the character_pair_ranges table. */
564 int bottom = 0;
565 int top = NUM_OF_ELEMENTS (lit_character_pair_ranges) - 1;
566
567 while (bottom <= top)
568 {
569 int middle = (bottom + top) / 2;
570 ecma_char_t current_sp = lit_character_pair_ranges[middle];
571
572 if (current_sp <= character && character < current_sp + lit_character_pair_range_lengths[middle])
573 {
574 int char_dist = character - current_sp;
575
576 if ((character - current_sp) % 2 == 0)
577 {
578 output_buffer_p[0] = is_lowercase ? (ecma_char_t) (current_sp + char_dist + 1) : character;
579 }
580 else
581 {
582 output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (current_sp + char_dist - 1);
583 }
584
585 return 1;
586 }
587
588 if (character > current_sp)
589 {
590 bottom = middle + 1;
591 }
592 else
593 {
594 top = middle - 1;
595 }
596 }
597
598 /* 3, Check if the specified character is part of the character_pairs table. */
599 int number_of_character_pairs = NUM_OF_ELEMENTS (lit_character_pairs);
600
601 for (int i = 0; i < number_of_character_pairs; i++)
602 {
603 if (character != lit_character_pairs[i])
604 {
605 continue;
606 }
607
608 if (i % 2 == 0)
609 {
610 output_buffer_p[0] = is_lowercase ? lit_character_pairs[i + 1] : character;
611 }
612 else
613 {
614 output_buffer_p[0] = is_lowercase ? character : lit_character_pairs[i - 1];
615 }
616
617 return 1;
618 }
619
620 return 0;
621 } /* search_in_bidirectional_conversion_tables */
622
623 /**
624 * Check if the specified character is in the given conversion table.
625 *
626 * @return the mapped character sequence of an ecma character, if it's in the table.
627 * 0 - otherwise.
628 */
629 static ecma_length_t
search_in_conversion_table(ecma_char_t character,ecma_char_t * output_buffer_p,const ecma_char_t * array,const uint8_t * counters)630 search_in_conversion_table (ecma_char_t character, /**< code unit */
631 ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
632 const ecma_char_t *array, /**< array */
633 const uint8_t *counters) /**< case_values counter */
634 {
635 int end_point = 0;
636
637 for (int i = 0; i < 3; i++)
638 {
639 int start_point = end_point;
640 int size_of_case_value = i + 1;
641 end_point += counters[i] * (size_of_case_value + 1);
642
643 int bottom = start_point;
644 int top = end_point - size_of_case_value;
645
646 while (bottom <= top)
647 {
648 int middle = (bottom + top) / 2;
649
650 middle -= ((middle - bottom) % (size_of_case_value + 1));
651
652 ecma_char_t current = array[middle];
653
654 if (current == character)
655 {
656 ecma_length_t char_sequence = 1;
657
658 switch (size_of_case_value)
659 {
660 case 3:
661 {
662 output_buffer_p[2] = array[middle + 3];
663 char_sequence++;
664 /* FALLTHRU */
665 }
666 case 2:
667 {
668 output_buffer_p[1] = array[middle + 2];
669 char_sequence++;
670 /* FALLTHRU */
671 }
672 default:
673 {
674 output_buffer_p[0] = array[middle + 1];
675 return char_sequence;
676 }
677 }
678 }
679
680 if (character < current)
681 {
682 top = middle - (size_of_case_value + 1);
683 }
684 else
685 {
686 bottom = middle + (size_of_case_value + 1);
687 }
688 }
689 }
690
691 return 0;
692 } /* search_in_conversion_table */
693 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
694
695 /**
696 * Returns the lowercase character sequence of an ecma character.
697 *
698 * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
699 *
700 * @return the length of the lowercase character sequence
701 * which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
702 */
703 ecma_length_t
lit_char_to_lower_case(ecma_char_t character,ecma_char_t * output_buffer_p,ecma_length_t buffer_size)704 lit_char_to_lower_case (ecma_char_t character, /**< input character value */
705 ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
706 ecma_length_t buffer_size) /**< buffer size */
707 {
708 JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
709
710 if (character >= LIT_CHAR_UPPERCASE_A && character <= LIT_CHAR_UPPERCASE_Z)
711 {
712 output_buffer_p[0] = (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
713 return 1;
714 }
715
716 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
717
718 ecma_length_t lowercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, true);
719
720 if (lowercase_sequence != 0)
721 {
722 return lowercase_sequence;
723 }
724
725 int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_lower_case_ranges);
726
727 for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++)
728 {
729 int range_length = lit_lower_case_range_lengths[j] - 1;
730 ecma_char_t start_point = lit_lower_case_ranges[i];
731
732 if (start_point <= character && character <= start_point + range_length)
733 {
734 output_buffer_p[0] = (ecma_char_t) (lit_lower_case_ranges[i + 1] + (character - start_point));
735 return 1;
736 }
737 }
738
739 lowercase_sequence = search_in_conversion_table (character,
740 output_buffer_p,
741 lit_lower_case_conversions,
742 lit_lower_case_conversion_counters);
743
744 if (lowercase_sequence != 0)
745 {
746 return lowercase_sequence;
747 }
748
749 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
750
751 output_buffer_p[0] = character;
752 return 1;
753 } /* lit_char_to_lower_case */
754
755 /**
756 * Returns the uppercase character sequence of an ecma character.
757 *
758 * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
759 *
760 * @return the length of the uppercase character sequence
761 * which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
762 */
763 ecma_length_t
lit_char_to_upper_case(ecma_char_t character,ecma_char_t * output_buffer_p,ecma_length_t buffer_size)764 lit_char_to_upper_case (ecma_char_t character, /**< input character value */
765 ecma_char_t *output_buffer_p, /**< buffer for the result characters */
766 ecma_length_t buffer_size) /**< buffer size */
767 {
768 JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
769
770 if (character >= LIT_CHAR_LOWERCASE_A && character <= LIT_CHAR_LOWERCASE_Z)
771 {
772 output_buffer_p[0] = (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
773 return 1;
774 }
775
776 #if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
777
778 ecma_length_t uppercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, false);
779
780 if (uppercase_sequence != 0)
781 {
782 return uppercase_sequence;
783 }
784
785 int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_upper_case_special_ranges);
786
787 for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++)
788 {
789 int range_length = lit_upper_case_special_range_lengths[j];
790 ecma_char_t start_point = lit_upper_case_special_ranges[i];
791
792 if (start_point <= character && character <= start_point + range_length)
793 {
794 output_buffer_p[0] = (ecma_char_t) (lit_upper_case_special_ranges[i + 1] + (character - start_point));
795 output_buffer_p[1] = (ecma_char_t) (lit_upper_case_special_ranges[i + 2]);
796 return 2;
797 }
798 }
799
800 uppercase_sequence = search_in_conversion_table (character,
801 output_buffer_p,
802 lit_upper_case_conversions,
803 lit_upper_case_conversion_counters);
804
805 if (uppercase_sequence != 0)
806 {
807 return uppercase_sequence;
808 }
809
810 #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
811
812 output_buffer_p[0] = character;
813 return 1;
814 } /* lit_char_to_upper_case */
815