1 /*
2 * Unicode utilities
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <string.h>
28 #include <assert.h>
29
30 #include "cutils.h"
31 #include "libunicode.h"
32 #include "libunicode-table.h"
33
/* Run types held in the 4-bit 'type' field of each case_conv_table1
   entry. The numeric order is significant: lre_case_conv() tests
   (type & 1) and (type >= RUN_TYPE_UF) on the first four values and
   subtracts RUN_TYPE_U_EXT / RUN_TYPE_U_EXT2 from the *_EXT pairs;
   unicode_case1() uses the values as bit positions in a mask. */
enum {
    RUN_TYPE_U,
    RUN_TYPE_L,
    RUN_TYPE_UF,
    RUN_TYPE_LF,
    RUN_TYPE_UL,
    RUN_TYPE_LSU,
    RUN_TYPE_U2L_399_EXT2,
    RUN_TYPE_UF_D20,
    RUN_TYPE_UF_D1_EXT,
    RUN_TYPE_U_EXT,
    RUN_TYPE_LF_EXT,
    RUN_TYPE_U_EXT2,
    RUN_TYPE_L_EXT2,
    RUN_TYPE_U_EXT3,
};
50
/* Convert the case of the code point 'c'.

   conv_type:
   0 = to upper
   1 = to lower
   2 = case folding (= to lower with modifications)

   The resulting code points are stored in 'res' (up to 3 entries are
   written) and their count (1, 2 or 3) is returned. When no conversion
   applies, res[0] = c and 1 is returned.
*/
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
{
    if (c < 128) {
        /* ASCII fast path: any non zero conv_type lowercases */
        if (conv_type) {
            if (c >= 'A' && c <= 'Z') {
                c = c - 'A' + 'a';
            }
        } else {
            if (c >= 'a' && c <= 'z') {
                c = c - 'a' + 'A';
            }
        }
    } else {
        uint32_t v, code, data, type, len, a, is_lower;
        int idx, idx_min, idx_max;

        /* lowering and folding share most of the rules */
        is_lower = (conv_type != 0);
        /* binary search of the run [code, code + len) containing 'c' */
        idx_min = 0;
        idx_max = countof(case_conv_table1) - 1;
        while (idx_min <= idx_max) {
            idx = (unsigned)(idx_max + idx_min) / 2;
            v = case_conv_table1[idx];
            /* entry layout: 17 bit first code, 7 bit length, 4 bit
               run type, 4 bit high part of 'data' */
            code = v >> (32 - 17);
            len = (v >> (32 - 17 - 7)) & 0x7f;
            if (c < code) {
                idx_max = idx - 1;
            } else if (c >= code + len) {
                idx_min = idx + 1;
            } else {
                type = (v >> (32 - 17 - 7 - 4)) & 0xf;
                /* 12 bit data: 4 bits from table1, 8 bits from table2 */
                data = ((v & 0xf) << 8) | case_conv_table2[idx];
                switch(type) {
                case RUN_TYPE_U:
                case RUN_TYPE_L:
                case RUN_TYPE_UF:
                case RUN_TYPE_LF:
                    /* 'data' is the index of another table1 entry whose
                       first code is the base of the converted run. The
                       low bit of 'type' selects upper (0) or lower (1);
                       the UF/LF variants also apply to case folding */
                    if (conv_type == (type & 1) ||
                        (type >= RUN_TYPE_UF && conv_type == 2)) {
                        c = c - code + (case_conv_table1[data] >> (32 - 17));
                    }
                    break;
                case RUN_TYPE_UL:
                    /* alternating pairs: only the offsets whose parity is
                       (1 - is_lower) are modified, by toggling the low
                       bit of the offset */
                    a = c - code;
                    if ((a & 1) != (1 - is_lower))
                        break;
                    c = (a ^ 1) + code;
                    break;
                case RUN_TYPE_LSU:
                    /* run of 3 characters: offset 1 moves by one (up when
                       lowering, down when uppering); offset 0 (lowering)
                       or offset 2 (uppering) moves by two in the same
                       direction */
                    a = c - code;
                    if (a == 1) {
                        c += 2 * is_lower - 1;
                    } else if (a == (1 - is_lower) * 2) {
                        c += (2 * is_lower - 1) * 2;
                    }
                    break;
                case RUN_TYPE_U2L_399_EXT2:
                    /* uppercase gives two code points, the second being
                       the constant 0x399; 'data' holds two 6 bit indexes
                       in case_conv_ext */
                    if (!is_lower) {
                        res[0] = c - code + case_conv_ext[data >> 6];
                        res[1] = 0x399;
                        return 2;
                    } else {
                        c = c - code + case_conv_ext[data & 0x3f];
                    }
                    break;
                case RUN_TYPE_UF_D20:
                    /* unchanged by lowering; upper = data,
                       folding = data + 0x20 */
                    if (conv_type == 1)
                        break;
                    c = data + (conv_type == 2) * 0x20;
                    break;
                case RUN_TYPE_UF_D1_EXT:
                    /* unchanged by lowering; upper = case_conv_ext[data],
                       folding = that value + 1 */
                    if (conv_type == 1)
                        break;
                    c = case_conv_ext[data] + (conv_type == 2);
                    break;
                case RUN_TYPE_U_EXT:
                case RUN_TYPE_LF_EXT:
                    /* U_EXT applies to upper only, LF_EXT to lower and
                       folding */
                    if (is_lower != (type - RUN_TYPE_U_EXT))
                        break;
                    c = case_conv_ext[data];
                    break;
                case RUN_TYPE_U_EXT2:
                case RUN_TYPE_L_EXT2:
                    /* two code point result; conv_type must be exactly 0
                       (U_EXT2) or 1 (L_EXT2), so folding never matches */
                    if (conv_type != (type - RUN_TYPE_U_EXT2))
                        break;
                    res[0] = c - code + case_conv_ext[data >> 6];
                    res[1] = case_conv_ext[data & 0x3f];
                    return 2;
                default:
                case RUN_TYPE_U_EXT3:
                    /* three code point result, upper only; 'data' holds
                       three 4 bit indexes in case_conv_ext */
                    if (conv_type != 0)
                        break;
                    res[0] = case_conv_ext[data >> 8];
                    res[1] = case_conv_ext[(data >> 4) & 0xf];
                    res[2] = case_conv_ext[data & 0xf];
                    return 3;
                }
                /* run found and handled: leave the search loop */
                break;
            }
        }
    }
    res[0] = c;
    return 1;
}
159
/* Read an unsigned 24 bit little endian value from the three bytes at
   'ptr'. The value is assembled byte by byte so that no alignment or
   aliasing assumption is made on 'ptr' (the index tables are plain
   byte arrays and their entries are 3 bytes wide). */
static uint32_t get_le24(const uint8_t *ptr)
{
    return (uint32_t)ptr[0] |
        ((uint32_t)ptr[1] << 8) |
        ((uint32_t)ptr[2] << 16);
}
168
#define UNICODE_INDEX_BLOCK_LEN 32

/* Find where the decoding of a compressed table must start for the
   code point 'c'.

   'index_table' contains 'index_table_len' entries of 3 bytes (little
   endian): the low 21 bits are a code point and the upper 3 bits a
   small offset.

   Return -1 if 'c' is at or after the last index entry (not in the
   table). Otherwise return the position in the table at which to
   start decoding and store in '*pcode' the code point associated
   with that position. */
static int get_index_pos(uint32_t *pcode, uint32_t c,
                         const uint8_t *index_table, int index_table_len)
{
    const uint32_t code_mask = (1 << 21) - 1;
    uint32_t entry;
    int lo, hi, mid;

    /* before the first index entry: start from the table beginning */
    if (c < (get_le24(index_table) & code_mask)) {
        *pcode = 0;
        return 0;
    }

    hi = index_table_len - 1;
    /* the last entry is compared as a whole 24 bit value */
    if (c >= get_le24(index_table + hi * 3))
        return -1;

    /* bisection keeping: code(lo) <= c < code(hi) */
    lo = 0;
    while (hi - lo > 1) {
        mid = (hi + lo) / 2;
        if (c < (get_le24(index_table + mid * 3) & code_mask))
            hi = mid;
        else
            lo = mid;
    }

    entry = get_le24(index_table + lo * 3);
    *pcode = entry & code_mask;
    return (lo + 1) * UNICODE_INDEX_BLOCK_LEN + (entry >> 21);
}
204
lre_is_in_table(uint32_t c,const uint8_t * table,const uint8_t * index_table,int index_table_len)205 static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
206 const uint8_t *index_table, int index_table_len)
207 {
208 uint32_t code, b, bit;
209 int pos;
210 const uint8_t *p;
211
212 pos = get_index_pos(&code, c, index_table, index_table_len);
213 if (pos < 0)
214 return FALSE; /* outside the table */
215 p = table + pos;
216 bit = 0;
217 for(;;) {
218 b = *p++;
219 if (b < 64) {
220 code += (b >> 3) + 1;
221 if (c < code)
222 return bit;
223 bit ^= 1;
224 code += (b & 7) + 1;
225 } else if (b >= 0x80) {
226 code += b - 0x80 + 1;
227 } else if (b < 0x60) {
228 code += (((b - 0x40) << 8) | p[0]) + 1;
229 p++;
230 } else {
231 code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
232 p += 2;
233 }
234 if (c < code)
235 return bit;
236 bit ^= 1;
237 }
238 }
239
lre_is_cased(uint32_t c)240 BOOL lre_is_cased(uint32_t c)
241 {
242 uint32_t v, code, len;
243 int idx, idx_min, idx_max;
244
245 idx_min = 0;
246 idx_max = countof(case_conv_table1) - 1;
247 while (idx_min <= idx_max) {
248 idx = (unsigned)(idx_max + idx_min) / 2;
249 v = case_conv_table1[idx];
250 code = v >> (32 - 17);
251 len = (v >> (32 - 17 - 7)) & 0x7f;
252 if (c < code) {
253 idx_max = idx - 1;
254 } else if (c >= code + len) {
255 idx_min = idx + 1;
256 } else {
257 return TRUE;
258 }
259 }
260 return lre_is_in_table(c, unicode_prop_Cased1_table,
261 unicode_prop_Cased1_index,
262 sizeof(unicode_prop_Cased1_index) / 3);
263 }
264
lre_is_case_ignorable(uint32_t c)265 BOOL lre_is_case_ignorable(uint32_t c)
266 {
267 return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
268 unicode_prop_Case_Ignorable_index,
269 sizeof(unicode_prop_Case_Ignorable_index) / 3);
270 }
271
272 /* character range */
273
cr_dump(CharRange * cr)274 static __maybe_unused void cr_dump(CharRange *cr)
275 {
276 int i;
277 for(i = 0; i < cr->len; i++)
278 printf("%d: 0x%04x\n", i, cr->points[i]);
279 }
280
/* Allocation function installed by cr_init() when the caller provides
   none: forwards to the C library realloc(); 'opaque' is ignored. */
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
{
    void *resized;

    (void)opaque;
    resized = realloc(ptr, size);
    return resized;
}
285
/* Initialize 'cr' as an empty range with no storage. Allocations use
   'realloc_func' with 'mem_opaque', or the C library realloc() when
   'realloc_func' is NULL. */
void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
{
    if (!realloc_func)
        realloc_func = cr_default_realloc;

    cr->points = NULL;
    cr->size = 0;
    cr->len = 0;
    cr->mem_opaque = mem_opaque;
    cr->realloc_func = realloc_func;
}
293
/* Release the point array of 'cr' by asking the allocation function
   for a size of 0. The fields of 'cr' are left untouched. */
void cr_free(CharRange *cr)
{
    void *opaque = cr->mem_opaque;
    uint32_t *pts = cr->points;

    cr->realloc_func(opaque, pts, 0);
}
298
/* Ensure that 'cr' can hold at least 'size' points. The capacity grows
   to the larger of 'size' and 1.5 times the current capacity.
   Return 0 if OK, -1 if the allocation failed ('cr' is then
   unchanged). */
int cr_realloc(CharRange *cr, int size)
{
    uint32_t *grown;
    int capacity;

    if (size <= cr->size)
        return 0; /* already large enough */

    capacity = max_int(size, cr->size * 3 / 2);
    grown = cr->realloc_func(cr->mem_opaque, cr->points,
                             capacity * sizeof(cr->points[0]));
    if (grown == NULL)
        return -1;
    cr->size = capacity;
    cr->points = grown;
    return 0;
}
315
/* Replace the content of 'cr' by a copy of the points of 'cr1'.
   Return 0 if OK, -1 if the allocation failed. */
int cr_copy(CharRange *cr, const CharRange *cr1)
{
    const int count = cr1->len;

    if (cr_realloc(cr, count) != 0)
        return -1;
    memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * count);
    cr->len = count;
    return 0;
}
324
325 /* merge consecutive intervals and remove empty intervals */
cr_compress(CharRange * cr)326 static void cr_compress(CharRange *cr)
327 {
328 int i, j, k, len;
329 uint32_t *pt;
330
331 pt = cr->points;
332 len = cr->len;
333 i = 0;
334 j = 0;
335 k = 0;
336 while ((i + 1) < len) {
337 if (pt[i] == pt[i + 1]) {
338 /* empty interval */
339 i += 2;
340 } else {
341 j = i;
342 while ((j + 3) < len && pt[j + 1] == pt[j + 2])
343 j += 2;
344 /* just copy */
345 pt[k] = pt[i];
346 pt[k + 1] = pt[j + 1];
347 k += 2;
348 i = j + 2;
349 }
350 }
351 cr->len = k;
352 }
353
354 /* union or intersection */
cr_op(CharRange * cr,const uint32_t * a_pt,int a_len,const uint32_t * b_pt,int b_len,int op)355 int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
356 const uint32_t *b_pt, int b_len, int op)
357 {
358 int a_idx, b_idx, is_in;
359 uint32_t v;
360
361 a_idx = 0;
362 b_idx = 0;
363 for(;;) {
364 /* get one more point from a or b in increasing order */
365 if (a_idx < a_len && b_idx < b_len) {
366 if (a_pt[a_idx] < b_pt[b_idx]) {
367 goto a_add;
368 } else if (a_pt[a_idx] == b_pt[b_idx]) {
369 v = a_pt[a_idx];
370 a_idx++;
371 b_idx++;
372 } else {
373 goto b_add;
374 }
375 } else if (a_idx < a_len) {
376 a_add:
377 v = a_pt[a_idx++];
378 } else if (b_idx < b_len) {
379 b_add:
380 v = b_pt[b_idx++];
381 } else {
382 break;
383 }
384 /* add the point if the in/out status changes */
385 switch(op) {
386 case CR_OP_UNION:
387 is_in = (a_idx & 1) | (b_idx & 1);
388 break;
389 case CR_OP_INTER:
390 is_in = (a_idx & 1) & (b_idx & 1);
391 break;
392 case CR_OP_XOR:
393 is_in = (a_idx & 1) ^ (b_idx & 1);
394 break;
395 default:
396 abort();
397 }
398 if (is_in != (cr->len & 1)) {
399 if (cr_add_point(cr, v))
400 return -1;
401 }
402 }
403 cr_compress(cr);
404 return 0;
405 }
406
/* In place union: 'cr' becomes the union of its previous content and
   of the 'b_len' points of 'b_pt'. The previous point array is always
   released. Return the status of cr_op(). */
int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
{
    CharRange prev;
    int status;

    /* detach the current content so that 'cr' can receive the result */
    prev = *cr;
    cr->points = NULL;
    cr->size = 0;
    cr->len = 0;

    status = cr_op(cr, prev.points, prev.len, b_pt, b_len, CR_OP_UNION);
    cr_free(&prev);
    return status;
}
418
/* Complement 'cr' in place: a point 0 is inserted at the beginning
   and a point UINT32_MAX at the end, then the result is compressed.
   Return 0 if OK, -1 if the allocation failed. */
int cr_invert(CharRange *cr)
{
    const int old_len = cr->len;
    uint32_t *pts;

    if (cr_realloc(cr, old_len + 2) != 0)
        return -1;
    pts = cr->points;
    /* shift every point by one slot to make room for the leading 0 */
    memmove(pts + 1, pts, old_len * sizeof(pts[0]));
    pts[0] = 0;
    pts[old_len + 1] = UINT32_MAX;
    cr->len = old_len + 2;
    cr_compress(cr);
    return 0;
}
432
433 #ifdef CONFIG_ALL_UNICODE
434
lre_is_id_start(uint32_t c)435 BOOL lre_is_id_start(uint32_t c)
436 {
437 return lre_is_in_table(c, unicode_prop_ID_Start_table,
438 unicode_prop_ID_Start_index,
439 sizeof(unicode_prop_ID_Start_index) / 3);
440 }
441
lre_is_id_continue(uint32_t c)442 BOOL lre_is_id_continue(uint32_t c)
443 {
444 return lre_is_id_start(c) ||
445 lre_is_in_table(c, unicode_prop_ID_Continue1_table,
446 unicode_prop_ID_Continue1_index,
447 sizeof(unicode_prop_ID_Continue1_index) / 3);
448 }
449
/* size of the result buffers given to the decomposition functions; the
   longest decomposition produced by unicode_decomp_entry() is the 18
   code points of DECOMP_TYPE_B18 */
#define UNICODE_DECOMP_LEN_MAX 18

/* Encoding of a run of the decomposition table: stored in the 6 bit
   'type' field of the unicode_decomp_table1 entries and interpreted
   by unicode_decomp_entry(). The numeric order matters because the
   decoder computes lengths as (type - first type of the group + 1). */
typedef enum {
    DECOMP_TYPE_C1, /* 16 bit char */
    DECOMP_TYPE_L1, /* 16 bit char table */
    DECOMP_TYPE_L2,
    DECOMP_TYPE_L3,
    DECOMP_TYPE_L4,
    DECOMP_TYPE_L5, /* XXX: not used */
    DECOMP_TYPE_L6, /* XXX: could remove */
    DECOMP_TYPE_L7, /* XXX: could remove */
    DECOMP_TYPE_LL1, /* 18 bit char table */
    DECOMP_TYPE_LL2,
    DECOMP_TYPE_S1, /* 8 bit char table */
    DECOMP_TYPE_S2,
    DECOMP_TYPE_S3,
    DECOMP_TYPE_S4,
    DECOMP_TYPE_S5,
    DECOMP_TYPE_I1, /* increment 16 bit char value */
    DECOMP_TYPE_I2_0,
    DECOMP_TYPE_I2_1,
    DECOMP_TYPE_I3_1,
    DECOMP_TYPE_I3_2,
    DECOMP_TYPE_I4_1,
    DECOMP_TYPE_I4_2,
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B2,
    DECOMP_TYPE_B3,
    DECOMP_TYPE_B4,
    DECOMP_TYPE_B5,
    DECOMP_TYPE_B6,
    DECOMP_TYPE_B7,
    DECOMP_TYPE_B8,
    DECOMP_TYPE_B18, /* same as B1..B8 with 18 chars */
    DECOMP_TYPE_LS2, /* one 16 bit char + one short code */
    DECOMP_TYPE_PAT3, /* shared first/last chars, per-char middle one */
    DECOMP_TYPE_S2_UL, /* S2 shared by 2 consecutive chars */
    DECOMP_TYPE_LS2_UL, /* LS2 shared by 2 consecutive chars */
} DecompTypeEnum;
489
/* Expand an 8 bit "short code" of the decomposition data:
     0x00..0x7f  the value itself
     0x80..0xcf  0x300 + (c - 0x80)
     0xd0, 0xd1  0x2044 and 0x2215
   Larger values are not valid inputs (no bound check is made). */
static uint32_t unicode_get_short_code(uint32_t c)
{
    static const uint16_t extra_codes[2] = { 0x2044, 0x2215 };
    uint32_t expanded;

    if (c >= 0x80 + 0x50)
        expanded = extra_codes[c - 0x80 - 0x50];
    else if (c >= 0x80)
        expanded = c - 0x80 + 0x300;
    else
        expanded = c;
    return expanded;
}
501
/* Simplified lowercase mapping used by the *_UL decomposition types:
   add 0x20 to code points below 0x100 or in 0x410..0x42f, add 1 to
   any other code point. */
static uint32_t unicode_get_lower_simple(uint32_t c)
{
    const int far_pair = (c < 0x100) || (c >= 0x410 && c <= 0x42f);

    return c + (far_pair ? 0x20 : 1);
}
510
/* Read an unsigned 16 bit little endian value from the two bytes at 'p'. */
static uint16_t unicode_get16(const uint8_t *p)
{
    const unsigned lo = p[0];
    const unsigned hi = p[1];

    return (uint16_t)(lo | (hi << 8));
}
515
/* Decode the decomposition of the code point 'c' from the entry 'idx'
   of the decomposition tables.

   'code' is the first code point of the run of the entry, 'len' the
   run length and 'type' its DecompTypeEnum encoding (all three
   extracted by the caller from unicode_decomp_table1[idx]).

   The decomposition is stored in 'res'. Return its length, or 0 if
   'c' has no decomposition in this run (zero marker in the data) or
   if 'type' is unknown. */
static int unicode_decomp_entry(uint32_t *res, uint32_t c,
                                int idx, uint32_t code, uint32_t len,
                                uint32_t type)
{
    uint32_t c1;
    int l, i, p;
    const uint8_t *d;

    if (type == DECOMP_TYPE_C1) {
        /* table2 directly holds the single resulting code point */
        res[0] = unicode_decomp_table2[idx];
        return 1;
    } else {
        /* otherwise table2 holds an offset in unicode_decomp_data */
        d = unicode_decomp_data + unicode_decomp_table2[idx];
        switch(type) {
        case DECOMP_TYPE_L1:
        case DECOMP_TYPE_L2:
        case DECOMP_TYPE_L3:
        case DECOMP_TYPE_L4:
        case DECOMP_TYPE_L5:
        case DECOMP_TYPE_L6:
        case DECOMP_TYPE_L7:
            /* 'l' 16 bit chars per source char; a zero value means
               "no decomposition" */
            l = type - DECOMP_TYPE_L1 + 1;
            d += (c - code) * l * 2;
            for(i = 0; i < l; i++) {
                if ((res[i] = unicode_get16(d + 2 * i)) == 0)
                    return 0;
            }
            return l;
        case DECOMP_TYPE_LL1:
        case DECOMP_TYPE_LL2:
            {
                /* 18 bit chars: the 16 low bits are stored first for the
                   whole run (len * l values), followed at byte offset
                   'p' by the 2 high bits, packed 4 per byte.
                   Note: this 'p' shadows the outer 'int p' */
                uint32_t k, p;
                l = type - DECOMP_TYPE_LL1 + 1;
                k = (c - code) * l;
                p = len * l * 2;
                for(i = 0; i < l; i++) {
                    c1 = unicode_get16(d + 2 * k) |
                        (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
                    if (!c1)
                        return 0;
                    res[i] = c1;
                    k++;
                }
            }
            return l;
        case DECOMP_TYPE_S1:
        case DECOMP_TYPE_S2:
        case DECOMP_TYPE_S3:
        case DECOMP_TYPE_S4:
        case DECOMP_TYPE_S5:
            /* 'l' short codes (1 byte each) per source char; a zero
               value means "no decomposition" */
            l = type - DECOMP_TYPE_S1 + 1;
            d += (c - code) * l;
            for(i = 0; i < l; i++) {
                if ((res[i] = unicode_get_short_code(d[i])) == 0)
                    return 0;
            }
            return l;
        case DECOMP_TYPE_I1:
            l = 1;
            p = 0;
            goto decomp_type_i;
        case DECOMP_TYPE_I2_0:
        case DECOMP_TYPE_I2_1:
        case DECOMP_TYPE_I3_1:
        case DECOMP_TYPE_I3_2:
        case DECOMP_TYPE_I4_1:
        case DECOMP_TYPE_I4_2:
            /* the type gives the length 'l' (2, 3 or 4) and the
               position 'p' of the incremented char */
            l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
            p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
        decomp_type_i:
            /* a single pattern of 'l' 16 bit chars is shared by the
               whole run; the char at position 'p' is incremented by
               the offset of 'c' in the run */
            for(i = 0; i < l; i++) {
                c1 = unicode_get16(d + 2 * i);
                if (i == p)
                    c1 += c - code;
                res[i] = c1;
            }
            return l;
        case DECOMP_TYPE_B18:
            l = 18;
            goto decomp_type_b;
        case DECOMP_TYPE_B1:
        case DECOMP_TYPE_B2:
        case DECOMP_TYPE_B3:
        case DECOMP_TYPE_B4:
        case DECOMP_TYPE_B5:
        case DECOMP_TYPE_B6:
        case DECOMP_TYPE_B7:
        case DECOMP_TYPE_B8:
            l = type - DECOMP_TYPE_B1 + 1;
        decomp_type_b:
            {
                /* a 16 bit base followed by 'l' bytes per source char:
                   each byte is an offset from the base, except 0xff
                   which stands for 0x20 */
                uint32_t c_min;
                c_min = unicode_get16(d);
                d += 2 + (c - code) * l;
                for(i = 0; i < l; i++) {
                    c1 = d[i];
                    if (c1 == 0xff)
                        c1 = 0x20;
                    else
                        c1 += c_min;
                    res[i] = c1;
                }
            }
            return l;
        case DECOMP_TYPE_LS2:
            /* 3 bytes per source char: one 16 bit char (zero means "no
               decomposition") and one short code */
            d += (c - code) * 3;
            if (!(res[0] = unicode_get16(d)))
                return 0;
            res[1] = unicode_get_short_code(d[2]);
            return 2;
        case DECOMP_TYPE_PAT3:
            /* the first and last chars are shared by the whole run;
               only the middle 16 bit char is stored per source char */
            res[0] = unicode_get16(d);
            res[2] = unicode_get16(d + 2);
            d += 4 + (c - code) * 2;
            res[1] = unicode_get16(d);
            return 3;
        case DECOMP_TYPE_S2_UL:
        case DECOMP_TYPE_LS2_UL:
            /* two consecutive source chars share one stored pair: for
               the odd offset, the first char is replaced by
               unicode_get_lower_simple() of the stored one */
            c1 = c - code;
            if (type == DECOMP_TYPE_S2_UL) {
                /* 2 short codes per pair of source chars */
                d += c1 & ~1;
                c = unicode_get_short_code(*d);
                d++;
            } else {
                /* one 16 bit char + one short code per pair */
                d += (c1 >> 1) * 3;
                c = unicode_get16(d);
                d += 2;
            }
            if (c1 & 1)
                c = unicode_get_lower_simple(c);
            res[0] = c;
            res[1] = unicode_get_short_code(*d);
            return 2;
        }
    }
    /* unknown type */
    return 0;
}
653
654
655 /* return the length of the decomposition (length <=
656 UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
unicode_decomp_char(uint32_t * res,uint32_t c,BOOL is_compat1)657 static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
658 {
659 uint32_t v, type, is_compat, code, len;
660 int idx_min, idx_max, idx;
661
662 idx_min = 0;
663 idx_max = countof(unicode_decomp_table1) - 1;
664 while (idx_min <= idx_max) {
665 idx = (idx_max + idx_min) / 2;
666 v = unicode_decomp_table1[idx];
667 code = v >> (32 - 18);
668 len = (v >> (32 - 18 - 7)) & 0x7f;
669 // printf("idx=%d code=%05x len=%d\n", idx, code, len);
670 if (c < code) {
671 idx_max = idx - 1;
672 } else if (c >= code + len) {
673 idx_min = idx + 1;
674 } else {
675 is_compat = v & 1;
676 if (is_compat1 < is_compat)
677 break;
678 type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
679 return unicode_decomp_entry(res, c, idx, code, len, type);
680 }
681 }
682 return 0;
683 }
684
685 /* return 0 if no pair found */
unicode_compose_pair(uint32_t c0,uint32_t c1)686 static int unicode_compose_pair(uint32_t c0, uint32_t c1)
687 {
688 uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
689 int idx_min, idx_max, idx, d;
690 uint32_t pair[2];
691
692 idx_min = 0;
693 idx_max = countof(unicode_comp_table) - 1;
694 while (idx_min <= idx_max) {
695 idx = (idx_max + idx_min) / 2;
696 idx1 = unicode_comp_table[idx];
697
698 /* idx1 represent an entry of the decomposition table */
699 d_idx = idx1 >> 6;
700 d_offset = idx1 & 0x3f;
701 v = unicode_decomp_table1[d_idx];
702 code = v >> (32 - 18);
703 len = (v >> (32 - 18 - 7)) & 0x7f;
704 type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
705 ch = code + d_offset;
706 unicode_decomp_entry(pair, ch, d_idx, code, len, type);
707 d = c0 - pair[0];
708 if (d == 0)
709 d = c1 - pair[1];
710 if (d < 0) {
711 idx_max = idx - 1;
712 } else if (d > 0) {
713 idx_min = idx + 1;
714 } else {
715 return ch;
716 }
717 }
718 return 0;
719 }
720
/* return the combining class of character c (between 0 and 255).
   Characters not covered by the index table get the class 0. */
static int unicode_get_cc(uint32_t c)
{
    uint32_t code, n, type, cc, c1, b;
    int pos;
    const uint8_t *p;

    pos = get_index_pos(&code, c,
                        unicode_cc_index, sizeof(unicode_cc_index) / 3);
    if (pos < 0)
        return 0;
    p = unicode_cc_table + pos;
    /* decode the runs starting at 'code' until the one containing 'c' */
    for(;;) {
        b = *p++;
        /* header byte: 2 bit run type, 6 bit run length code */
        type = b >> 6;
        n = b & 0x3f;
        if (n < 48) {
            /* the length code is the length itself */
        } else if (n < 56) {
            /* 11 bit length (one extra byte), biased by 48 */
            n = (n - 48) << 8;
            n |= *p++;
            n += 48;
        } else {
            /* two extra bytes, biased by 48 + 2^11.
               NOTE(review): the bits of (n - 56) are shifted by 8 like
               the first extra byte, so they overlap when n > 56; the
               other 3 byte decoders of this file shift the high bits
               by 16 - confirm against the table generator */
            n = (n - 56) << 8;
            n |= *p++ << 8;
            n |= *p++;
            n += 48 + (1 << 11);
        }
        /* types 0 and 1 are followed by a class byte, read below as
           p[-1] */
        if (type <= 1)
            p++;
        /* the run covers [code, c1) */
        c1 = code + n + 1;
        if (c < c1) {
            switch(type) {
            case 0:
                /* constant class over the run */
                cc = p[-1];
                break;
            case 1:
                /* class increasing by one per character */
                cc = p[-1] + c - code;
                break;
            case 2:
                cc = 0;
                break;
            default:
            case 3:
                cc = 230;
                break;
            }
            return cc;
        }
        code = c1;
    }
}
772
/* Reorder in place the 'len' code points of 'buf': each maximal
   sequence of characters with a non zero combining class is sorted by
   increasing combining class with a stable insertion sort. */
static void sort_cc(int *buf, int len)
{
    int pos, run_start, scan, slot, moving, moving_cc;

    for (pos = 0; pos < len; pos++) {
        if (unicode_get_cc(buf[pos]) == 0)
            continue;
        run_start = pos;
        for (scan = pos + 1; scan < len; scan++) {
            moving = buf[scan];
            moving_cc = unicode_get_cc(moving);
            if (moving_cc == 0)
                break; /* end of the sequence */
            /* shift right the characters of higher class */
            slot = scan;
            while (slot > run_start &&
                   unicode_get_cc(buf[slot - 1]) > moving_cc) {
                buf[slot] = buf[slot - 1];
                slot--;
            }
            buf[slot] = moving;
        }
        /* resume after the character which ended the sequence */
        pos = scan;
    }
}
808
/* Append to 'dbuf' (as 32 bit values) the full decomposition of the
   'src_len' code points of 'src'. Decompositions are expanded
   recursively; 'is_compat' also enables the compatibility ones.
   Allocation errors are only recorded in 'dbuf'. */
static void to_nfd_rec(DynBuf *dbuf,
                       const int *src, int src_len, int is_compat)
{
    uint32_t decomp[UNICODE_DECOMP_LEN_MAX];
    uint32_t ch, s_index, trail;
    int pos, n;

    for (pos = 0; pos < src_len; pos++) {
        ch = src[pos];
        if (ch < 0xac00 || ch >= 0xd7a4) {
            /* table driven decomposition */
            n = unicode_decomp_char(decomp, ch, is_compat);
            if (n != 0)
                to_nfd_rec(dbuf, (int *)decomp, n, is_compat);
            else
                dbuf_put_u32(dbuf, ch);
            continue;
        }
        /* Hangul decomposition (arithmetic): leading consonant, vowel
           and optional trailing consonant */
        s_index = ch - 0xac00;
        dbuf_put_u32(dbuf, 0x1100 + s_index / 588);
        dbuf_put_u32(dbuf, 0x1161 + (s_index % 588) / 28);
        trail = s_index % 28;
        if (trail != 0)
            dbuf_put_u32(dbuf, 0x11a7 + trail);
    }
}
836
/* Compose the pair (c0, c1) into a single code point.
   Hangul syllables are composed arithmetically; any other pair is
   searched with unicode_compose_pair().
   Return the composed code point, or 0 if not found. */
static int compose_pair(uint32_t c0, uint32_t c1)
{
    /* Hangul composition */
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
        /* leading consonant + vowel -> LV syllable */
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
               (c0 - 0xac00) % 28 == 0 &&
               c1 > 0x11a7 && c1 < 0x11a7 + 28) {
        /* LV syllable + trailing consonant -> LVT syllable. 0x11a7 is
           only the base of the trailing consonant index: index 0 means
           "no trailing consonant", so 0x11a7 itself must not compose
           (it would return c0 and silently drop the second
           character) */
        return c0 + c1 - 0x11a7;
    } else {
        return unicode_compose_pair(c0, c1);
    }
}
852
/* Normalize the 'src_len' code points of 'src' according to 'n_type'.

   The result is stored in a buffer allocated with 'realloc_func' and
   'opaque' (through a DynBuf) and returned in '*pdst'; presumably the
   caller releases it with the same allocation function - confirm
   against the callers.

   Return the length of the result in code points, or -1 in case of
   allocation error ('*pdst' is then set to NULL).

   As used below, bit 1 of 'n_type' selects the compatibility
   decompositions and bit 0 set means that no composition is done. */
int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                      UnicodeNormalizationEnum n_type,
                      void *opaque, DynBufReallocFunc *realloc_func)
{
    int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
    BOOL is_compat;
    DynBuf dbuf_s, *dbuf = &dbuf_s;

    is_compat = n_type >> 1;

    dbuf_init2(dbuf, opaque, realloc_func);
    /* reserve room for a result as long as the input */
    if (dbuf_realloc(dbuf, sizeof(int) * src_len))
        goto fail;

    /* common case: latin1 is unaffected by NFC */
    if (n_type == UNICODE_NFC) {
        for(i = 0; i < src_len; i++) {
            if (src[i] >= 0x100)
                goto not_latin1;
        }
        /* plain copy in the buffer reserved above */
        buf = (int *)dbuf->buf;
        memcpy(buf, src, src_len * sizeof(int));
        *pdst = (uint32_t *)buf;
        return src_len;
    not_latin1: ;
    }

    /* step 1: full decomposition */
    to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
    if (dbuf_error(dbuf)) {
        /* NOTE(review): nothing is released on this path; confirm
           whether the DynBuf storage can be leaked here */
    fail:
        *pdst = NULL;
        return -1;
    }
    buf = (int *)dbuf->buf;
    buf_len = dbuf->size / sizeof(int);

    /* step 2: canonical ordering of the combining characters */
    sort_cc(buf, buf_len);

    if (buf_len <= 1 || (n_type & 1) != 0) {
        /* NFD / NFKD */
        *pdst = (uint32_t *)buf;
        return buf_len;
    }

    /* step 3: composition, done in place. buf[0..out_len) is the
       output built so far, 'i' the next input character */
    i = 1;
    out_len = 1;
    while (i < buf_len) {
        /* find the starter character and test if it is blocked from
           the character at 'i' */
        last_cc = unicode_get_cc(buf[i]);
        starter_pos = out_len - 1;
        while (starter_pos >= 0) {
            cc = unicode_get_cc(buf[starter_pos]);
            if (cc == 0)
                break;
            if (cc >= last_cc)
                goto next;
            /* the nearest preceding character was checked: 256 is
               larger than any class so the remaining ones never
               block */
            last_cc = 256;
            starter_pos--;
        }
        if (starter_pos >= 0 &&
            (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
            /* composed: replace the starter and drop buf[i] */
            buf[starter_pos] = p;
            i++;
        } else {
        next:
            buf[out_len++] = buf[i++];
        }
    }
    *pdst = (uint32_t *)buf;
    return out_len;
}
925
926 /* char ranges for various unicode properties */
927
/* Search 'name' in 'name_table'.

   The table is a sequence of zero terminated entries, ended by an
   empty entry. Each entry is a list of aliases separated by ','.
   Return the index of the entry containing an alias equal to 'name'
   (exact match), or -1 if there is none. */
static int unicode_find_name(const char *name_table, const char *name)
{
    const size_t wanted_len = strlen(name);
    const char *cursor = name_table;
    const char *comma;
    size_t alias_len;
    int entry = 0;

    while (*cursor != '\0') {
        /* walk the aliases of the current entry */
        do {
            comma = strchr(cursor, ',');
            alias_len = comma ? (size_t)(comma - cursor) : strlen(cursor);
            if (alias_len == wanted_len &&
                memcmp(cursor, name, wanted_len) == 0)
                return entry;
            /* skip the alias and its ',' or '\0' terminator */
            cursor += alias_len + 1;
        } while (comma != NULL);
        entry++;
    }
    return -1;
}
954
955 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
956 if not found */
unicode_script(CharRange * cr,const char * script_name,BOOL is_ext)957 int unicode_script(CharRange *cr,
958 const char *script_name, BOOL is_ext)
959 {
960 int script_idx;
961 const uint8_t *p, *p_end;
962 uint32_t c, c1, b, n, v, v_len, i, type;
963 CharRange cr1_s, *cr1;
964 CharRange cr2_s, *cr2 = &cr2_s;
965 BOOL is_common;
966
967 script_idx = unicode_find_name(unicode_script_name_table, script_name);
968 if (script_idx < 0)
969 return -2;
970 /* Note: we remove the "Unknown" Script */
971 script_idx += UNICODE_SCRIPT_Unknown + 1;
972
973 is_common = (script_idx == UNICODE_SCRIPT_Common ||
974 script_idx == UNICODE_SCRIPT_Inherited);
975 if (is_ext) {
976 cr1 = &cr1_s;
977 cr_init(cr1, cr->mem_opaque, cr->realloc_func);
978 cr_init(cr2, cr->mem_opaque, cr->realloc_func);
979 } else {
980 cr1 = cr;
981 }
982
983 p = unicode_script_table;
984 p_end = unicode_script_table + countof(unicode_script_table);
985 c = 0;
986 while (p < p_end) {
987 b = *p++;
988 type = b >> 7;
989 n = b & 0x7f;
990 if (n < 96) {
991 } else if (n < 112) {
992 n = (n - 96) << 8;
993 n |= *p++;
994 n += 96;
995 } else {
996 n = (n - 112) << 16;
997 n |= *p++ << 8;
998 n |= *p++;
999 n += 96 + (1 << 12);
1000 }
1001 if (type == 0)
1002 v = 0;
1003 else
1004 v = *p++;
1005 c1 = c + n + 1;
1006 if (v == script_idx) {
1007 if (cr_add_interval(cr1, c, c1))
1008 goto fail;
1009 }
1010 c = c1;
1011 }
1012
1013 if (is_ext) {
1014 /* add the script extensions */
1015 p = unicode_script_ext_table;
1016 p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
1017 c = 0;
1018 while (p < p_end) {
1019 b = *p++;
1020 if (b < 128) {
1021 n = b;
1022 } else if (b < 128 + 64) {
1023 n = (b - 128) << 8;
1024 n |= *p++;
1025 n += 128;
1026 } else {
1027 n = (b - 128 - 64) << 16;
1028 n |= *p++ << 8;
1029 n |= *p++;
1030 n += 128 + (1 << 14);
1031 }
1032 c1 = c + n + 1;
1033 v_len = *p++;
1034 if (is_common) {
1035 if (v_len != 0) {
1036 if (cr_add_interval(cr2, c, c1))
1037 goto fail;
1038 }
1039 } else {
1040 for(i = 0; i < v_len; i++) {
1041 if (p[i] == script_idx) {
1042 if (cr_add_interval(cr2, c, c1))
1043 goto fail;
1044 break;
1045 }
1046 }
1047 }
1048 p += v_len;
1049 c = c1;
1050 }
1051 if (is_common) {
1052 /* remove all the characters with script extensions */
1053 if (cr_invert(cr2))
1054 goto fail;
1055 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1056 CR_OP_INTER))
1057 goto fail;
1058 } else {
1059 if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
1060 CR_OP_UNION))
1061 goto fail;
1062 }
1063 cr_free(cr1);
1064 cr_free(cr2);
1065 }
1066 return 0;
1067 fail:
1068 if (is_ext) {
1069 cr_free(cr1);
1070 cr_free(cr2);
1071 }
1072 goto fail;
1073 }
1074
1075 #define M(id) (1U << UNICODE_GC_ ## id)
1076
unicode_general_category1(CharRange * cr,uint32_t gc_mask)1077 static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
1078 {
1079 const uint8_t *p, *p_end;
1080 uint32_t c, c0, b, n, v;
1081
1082 p = unicode_gc_table;
1083 p_end = unicode_gc_table + countof(unicode_gc_table);
1084 c = 0;
1085 while (p < p_end) {
1086 b = *p++;
1087 n = b >> 5;
1088 v = b & 0x1f;
1089 if (n == 7) {
1090 n = *p++;
1091 if (n < 128) {
1092 n += 7;
1093 } else if (n < 128 + 64) {
1094 n = (n - 128) << 8;
1095 n |= *p++;
1096 n += 7 + 128;
1097 } else {
1098 n = (n - 128 - 64) << 16;
1099 n |= *p++ << 8;
1100 n |= *p++;
1101 n += 7 + 128 + (1 << 14);
1102 }
1103 }
1104 c0 = c;
1105 c += n + 1;
1106 if (v == 31) {
1107 /* run of Lu / Ll */
1108 b = gc_mask & (M(Lu) | M(Ll));
1109 if (b != 0) {
1110 if (b == (M(Lu) | M(Ll))) {
1111 goto add_range;
1112 } else {
1113 c0 += ((gc_mask & M(Ll)) != 0);
1114 for(; c0 < c; c0 += 2) {
1115 if (cr_add_interval(cr, c0, c0 + 1))
1116 return -1;
1117 }
1118 }
1119 }
1120 } else if ((gc_mask >> v) & 1) {
1121 add_range:
1122 if (cr_add_interval(cr, c0, c))
1123 return -1;
1124 }
1125 }
1126 return 0;
1127 }
1128
/* Add to 'cr' the code points having the property 'prop_idx' (index
   in unicode_prop_table). The table uses the same alternating run
   length encoding as lre_is_in_table(), starting with an "out" run.
   Return 0 if OK, -1 if an interval could not be added. */
static int unicode_prop1(CharRange *cr, int prop_idx)
{
    const uint8_t *rd = unicode_prop_table[prop_idx];
    const uint8_t *const end = rd + unicode_prop_len_table[prop_idx];
    uint32_t pos = 0, start, byte;
    uint32_t inside = 0;

    while (rd < end) {
        start = pos;
        byte = *rd++;
        if (byte < 64) {
            /* two short runs packed in one byte */
            pos += (byte >> 3) + 1;
            if (inside && cr_add_interval(cr, start, pos))
                return -1;
            inside ^= 1;
            start = pos;
            pos += (byte & 7) + 1;
        } else if (byte >= 0x80) {
            pos += byte - 0x80 + 1;
        } else if (byte < 0x60) {
            pos += (((byte - 0x40) << 8) | rd[0]) + 1;
            rd += 1;
        } else {
            pos += (((byte - 0x60) << 16) | (rd[0] << 8) | rd[1]) + 1;
            rd += 2;
        }
        if (inside && cr_add_interval(cr, start, pos))
            return -1;
        inside ^= 1;
    }
    return 0;
}
1167
1168 #define CASE_U (1 << 0)
1169 #define CASE_L (1 << 1)
1170 #define CASE_F (1 << 2)
1171
1172 /* use the case conversion table to generate range of characters.
1173 CASE_U: set char if modified by uppercasing,
1174 CASE_L: set char if modified by lowercasing,
1175 CASE_F: set char if modified by case folding,
1176 */
unicode_case1(CharRange * cr,int case_mask)1177 static int unicode_case1(CharRange *cr, int case_mask)
1178 {
1179 #define MR(x) (1 << RUN_TYPE_ ## x)
1180 const uint32_t tab_run_mask[3] = {
1181 MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
1182 MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3),
1183
1184 MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2),
1185
1186 MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT),
1187 };
1188 #undef MR
1189 uint32_t mask, v, code, type, len, i, idx;
1190
1191 if (case_mask == 0)
1192 return 0;
1193 mask = 0;
1194 for(i = 0; i < 3; i++) {
1195 if ((case_mask >> i) & 1)
1196 mask |= tab_run_mask[i];
1197 }
1198 for(idx = 0; idx < countof(case_conv_table1); idx++) {
1199 v = case_conv_table1[idx];
1200 type = (v >> (32 - 17 - 7 - 4)) & 0xf;
1201 code = v >> (32 - 17);
1202 len = (v >> (32 - 17 - 7)) & 0x7f;
1203 if ((mask >> type) & 1) {
1204 // printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
1205 switch(type) {
1206 case RUN_TYPE_UL:
1207 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
1208 goto def_case;
1209 code += ((case_mask & CASE_U) != 0);
1210 for(i = 0; i < len; i += 2) {
1211 if (cr_add_interval(cr, code + i, code + i + 1))
1212 return -1;
1213 }
1214 break;
1215 case RUN_TYPE_LSU:
1216 if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
1217 goto def_case;
1218 if (!(case_mask & CASE_U)) {
1219 if (cr_add_interval(cr, code, code + 1))
1220 return -1;
1221 }
1222 if (cr_add_interval(cr, code + 1, code + 2))
1223 return -1;
1224 if (case_mask & CASE_U) {
1225 if (cr_add_interval(cr, code + 2, code + 3))
1226 return -1;
1227 }
1228 break;
1229 default:
1230 def_case:
1231 if (cr_add_interval(cr, code, code + len))
1232 return -1;
1233 break;
1234 }
1235 }
1236 }
1237 return 0;
1238 }
1239
/* Operations of the small stack machine run by unicode_prop_ops().
   POP_UNION, POP_INTER and POP_XOR must stay consecutive and in this
   order: the operation given to cr_op() is computed as
   (op - POP_UNION + CR_OP_UNION). */
typedef enum {
    POP_GC,     /* push the set of a general category mask (1 operand) */
    POP_PROP,   /* push the set of a property index (1 operand) */
    POP_CASE,   /* push the set of a CASE_x mask (1 operand) */
    POP_UNION,  /* replace the two top sets by their union */
    POP_INTER,  /* replace the two top sets by their intersection */
    POP_XOR,    /* replace the two top sets by their symmetric difference */
    POP_INVERT, /* invert the top set */
    POP_END,    /* end of the program: the single remaining set is the result */
} PropOPEnum;

/* maximum stack depth of unicode_prop_ops() (checked with assert()) */
#define POP_STACK_LEN_MAX 4
1252
/* Evaluate a small stack program on character ranges and copy its
   result to 'cr'.

   The variable arguments are a sequence of PropOPEnum operations
   (passed as int), POP_GC, POP_PROP and POP_CASE being followed by
   one int operand, terminated by POP_END. Exactly one range must
   remain on the stack at POP_END and the stack depth is limited to
   POP_STACK_LEN_MAX (both checked with assert()); an unknown
   operation calls abort().

   The temporary ranges use the allocation function of 'cr'.
   Return 0 if OK, -1 in case of allocation error. */
static int unicode_prop_ops(CharRange *cr, ...)
{
    va_list ap;
    CharRange stack[POP_STACK_LEN_MAX];
    int stack_len, op, ret, i;
    uint32_t a;

    va_start(ap, cr);
    stack_len = 0;
    for(;;) {
        op = va_arg(ap, int);
        switch(op) {
        case POP_GC:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_general_category1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_PROP:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_prop1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_CASE:
            assert(stack_len < POP_STACK_LEN_MAX);
            a = va_arg(ap, int);
            cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
            if (unicode_case1(&stack[stack_len - 1], a))
                goto fail;
            break;
        case POP_UNION:
        case POP_INTER:
        case POP_XOR:
            {
                CharRange *cr1, *cr2, *cr3;
                assert(stack_len >= 2);
                assert(stack_len < POP_STACK_LEN_MAX);
                cr1 = &stack[stack_len - 2];
                cr2 = &stack[stack_len - 1];
                /* the result is built in a third slot, so one free
                   slot is needed above the two operands */
                cr3 = &stack[stack_len++];
                cr_init(cr3, cr->mem_opaque, cr->realloc_func);
                if (cr_op(cr3, cr1->points, cr1->len,
                          cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
                    goto fail;
                cr_free(cr1);
                cr_free(cr2);
                /* move the result in place of the first operand */
                *cr1 = *cr3;
                stack_len -= 2;
            }
            break;
        case POP_INVERT:
            assert(stack_len >= 1);
            if (cr_invert(&stack[stack_len - 1]))
                goto fail;
            break;
        case POP_END:
            goto done;
        default:
            abort();
        }
    }
 done:
    assert(stack_len == 1);
    ret = cr_copy(cr, &stack[0]);
    cr_free(&stack[0]);
    /* every va_start() must be matched by a va_end() before returning */
    va_end(ap);
    return ret;
 fail:
    /* release every range still on the stack */
    for(i = 0; i < stack_len; i++)
        cr_free(&stack[i]);
    va_end(ap);
    return -1;
}
1327
/* Category masks of the grouped general category names. The table is
   indexed with 'gc_idx - UNICODE_GC_LC' by unicode_general_category(),
   so the entries must keep this order. */
static const uint32_t unicode_gc_mask_table[] = {
    M(Lu) | M(Ll) | M(Lt), /* LC */
    M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
    M(Mn) | M(Mc) | M(Me), /* M */
    M(Nd) | M(Nl) | M(No), /* N */
    M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
    M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
    M(Zs) | M(Zl) | M(Zp), /* Z */
    M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
};
1338
1339 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1340 if not found */
unicode_general_category(CharRange * cr,const char * gc_name)1341 int unicode_general_category(CharRange *cr, const char *gc_name)
1342 {
1343 int gc_idx;
1344 uint32_t gc_mask;
1345
1346 gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
1347 if (gc_idx < 0)
1348 return -2;
1349 if (gc_idx <= UNICODE_GC_Co) {
1350 gc_mask = (uint64_t)1 << gc_idx;
1351 } else {
1352 gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
1353 }
1354 return unicode_general_category1(cr, gc_mask);
1355 }
1356
1357
1358 /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
1359 if not found */
unicode_prop(CharRange * cr,const char * prop_name)1360 int unicode_prop(CharRange *cr, const char *prop_name)
1361 {
1362 int prop_idx, ret;
1363
1364 prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
1365 if (prop_idx < 0)
1366 return -2;
1367 prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
1368
1369 ret = 0;
1370 switch(prop_idx) {
1371 case UNICODE_PROP_ASCII:
1372 if (cr_add_interval(cr, 0x00, 0x7f + 1))
1373 return -1;
1374 break;
1375 case UNICODE_PROP_Any:
1376 if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
1377 return -1;
1378 break;
1379 case UNICODE_PROP_Assigned:
1380 ret = unicode_prop_ops(cr,
1381 POP_GC, M(Cn),
1382 POP_INVERT,
1383 POP_END);
1384 break;
1385 case UNICODE_PROP_Math:
1386 ret = unicode_prop_ops(cr,
1387 POP_GC, M(Sm),
1388 POP_PROP, UNICODE_PROP_Other_Math,
1389 POP_UNION,
1390 POP_END);
1391 break;
1392 case UNICODE_PROP_Lowercase:
1393 ret = unicode_prop_ops(cr,
1394 POP_GC, M(Ll),
1395 POP_PROP, UNICODE_PROP_Other_Lowercase,
1396 POP_UNION,
1397 POP_END);
1398 break;
1399 case UNICODE_PROP_Uppercase:
1400 ret = unicode_prop_ops(cr,
1401 POP_GC, M(Lu),
1402 POP_PROP, UNICODE_PROP_Other_Uppercase,
1403 POP_UNION,
1404 POP_END);
1405 break;
1406 case UNICODE_PROP_Cased:
1407 ret = unicode_prop_ops(cr,
1408 POP_GC, M(Lu) | M(Ll) | M(Lt),
1409 POP_PROP, UNICODE_PROP_Other_Uppercase,
1410 POP_UNION,
1411 POP_PROP, UNICODE_PROP_Other_Lowercase,
1412 POP_UNION,
1413 POP_END);
1414 break;
1415 case UNICODE_PROP_Alphabetic:
1416 ret = unicode_prop_ops(cr,
1417 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1418 POP_PROP, UNICODE_PROP_Other_Uppercase,
1419 POP_UNION,
1420 POP_PROP, UNICODE_PROP_Other_Lowercase,
1421 POP_UNION,
1422 POP_PROP, UNICODE_PROP_Other_Alphabetic,
1423 POP_UNION,
1424 POP_END);
1425 break;
1426 case UNICODE_PROP_Grapheme_Base:
1427 ret = unicode_prop_ops(cr,
1428 POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
1429 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1430 POP_UNION,
1431 POP_INVERT,
1432 POP_END);
1433 break;
1434 case UNICODE_PROP_Grapheme_Extend:
1435 ret = unicode_prop_ops(cr,
1436 POP_GC, M(Me) | M(Mn),
1437 POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
1438 POP_UNION,
1439 POP_END);
1440 break;
1441 case UNICODE_PROP_XID_Start:
1442 ret = unicode_prop_ops(cr,
1443 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1444 POP_PROP, UNICODE_PROP_Other_ID_Start,
1445 POP_UNION,
1446 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1447 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1448 POP_UNION,
1449 POP_PROP, UNICODE_PROP_XID_Start1,
1450 POP_UNION,
1451 POP_INVERT,
1452 POP_INTER,
1453 POP_END);
1454 break;
1455 case UNICODE_PROP_XID_Continue:
1456 ret = unicode_prop_ops(cr,
1457 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1458 M(Mn) | M(Mc) | M(Nd) | M(Pc),
1459 POP_PROP, UNICODE_PROP_Other_ID_Start,
1460 POP_UNION,
1461 POP_PROP, UNICODE_PROP_Other_ID_Continue,
1462 POP_UNION,
1463 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1464 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1465 POP_UNION,
1466 POP_PROP, UNICODE_PROP_XID_Continue1,
1467 POP_UNION,
1468 POP_INVERT,
1469 POP_INTER,
1470 POP_END);
1471 break;
1472 case UNICODE_PROP_Changes_When_Uppercased:
1473 ret = unicode_case1(cr, CASE_U);
1474 break;
1475 case UNICODE_PROP_Changes_When_Lowercased:
1476 ret = unicode_case1(cr, CASE_L);
1477 break;
1478 case UNICODE_PROP_Changes_When_Casemapped:
1479 ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
1480 break;
1481 case UNICODE_PROP_Changes_When_Titlecased:
1482 ret = unicode_prop_ops(cr,
1483 POP_CASE, CASE_U,
1484 POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
1485 POP_XOR,
1486 POP_END);
1487 break;
1488 case UNICODE_PROP_Changes_When_Casefolded:
1489 ret = unicode_prop_ops(cr,
1490 POP_CASE, CASE_F,
1491 POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
1492 POP_XOR,
1493 POP_END);
1494 break;
1495 case UNICODE_PROP_Changes_When_NFKC_Casefolded:
1496 ret = unicode_prop_ops(cr,
1497 POP_CASE, CASE_F,
1498 POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
1499 POP_XOR,
1500 POP_END);
1501 break;
1502 #if 0
1503 case UNICODE_PROP_ID_Start:
1504 ret = unicode_prop_ops(cr,
1505 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
1506 POP_PROP, UNICODE_PROP_Other_ID_Start,
1507 POP_UNION,
1508 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1509 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1510 POP_UNION,
1511 POP_INVERT,
1512 POP_INTER,
1513 POP_END);
1514 break;
1515 case UNICODE_PROP_ID_Continue:
1516 ret = unicode_prop_ops(cr,
1517 POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
1518 M(Mn) | M(Mc) | M(Nd) | M(Pc),
1519 POP_PROP, UNICODE_PROP_Other_ID_Start,
1520 POP_UNION,
1521 POP_PROP, UNICODE_PROP_Other_ID_Continue,
1522 POP_UNION,
1523 POP_PROP, UNICODE_PROP_Pattern_Syntax,
1524 POP_PROP, UNICODE_PROP_Pattern_White_Space,
1525 POP_UNION,
1526 POP_INVERT,
1527 POP_INTER,
1528 POP_END);
1529 break;
1530 case UNICODE_PROP_Case_Ignorable:
1531 ret = unicode_prop_ops(cr,
1532 POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
1533 POP_PROP, UNICODE_PROP_Case_Ignorable1,
1534 POP_XOR,
1535 POP_END);
1536 break;
1537 #else
1538 /* we use the existing tables */
1539 case UNICODE_PROP_ID_Continue:
1540 ret = unicode_prop_ops(cr,
1541 POP_PROP, UNICODE_PROP_ID_Start,
1542 POP_PROP, UNICODE_PROP_ID_Continue1,
1543 POP_XOR,
1544 POP_END);
1545 break;
1546 #endif
1547 default:
1548 if (prop_idx >= countof(unicode_prop_table))
1549 return -2;
1550 ret = unicode_prop1(cr, prop_idx);
1551 break;
1552 }
1553 return ret;
1554 }
1555
1556 #endif /* CONFIG_ALL_UNICODE */
1557