1 /*
2 * Generation of Unicode tables
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 * Copyright (c) 2017-2018 Charlie Gordon
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 */
25 #include <stdlib.h>
26 #include <stdio.h>
27 #include <stdarg.h>
28 #include <inttypes.h>
29 #include <string.h>
30 #include <assert.h>
31 #include <ctype.h>
32 #include <time.h>
33
34 #include "cutils.h"
35
36 /* define it to be able to test unicode.c */
37 //#define USE_TEST
38 /* profile tests */
39 //#define PROFILE
40
41 //#define DUMP_CASE_CONV_TABLE
42 //#define DUMP_TABLE_SIZE
43 //#define DUMP_CC_TABLE
44 //#define DUMP_DECOMP_TABLE
45
46 /* Ideas:
47 - Generalize run length encoding + index for all tables
48 - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
49
50 Case conversion:
51 - use a single entry for consecutive U/LF runs
52 - allow EXT runs of length > 1
53
54 Decomposition:
55 - Greek lower case (+1f10/1f10) ?
56 - allow holes in B runs
57 - suppress more upper / lower case redundancy
58 */
59
60 #ifdef USE_TEST
61 #include "libunicode.c"
62 #endif
63
64 #define CHARCODE_MAX 0x10ffff
65 #define CC_LEN_MAX 3
66
/*
 * Allocate 'size' bytes and clear them to zero.
 *
 * Returns the new block, or NULL when malloc() fails. The memory is only
 * cleared when the allocation succeeded, so a failed allocation no longer
 * passes a NULL pointer to memset().
 */
void *mallocz(size_t size)
{
    void *ptr;

    ptr = malloc(size);
    if (ptr)
        memset(ptr, 0, size);
    return ptr;
}
74
/*
 * Return a pointer to the start of the n-th ';' separated field of 'p'
 * (field 0 is 'p' itself), or NULL when the string holds fewer than
 * n separators.
 */
const char *get_field(const char *p, int n)
{
    int remaining;

    for (remaining = n; remaining > 0; remaining--) {
        const char *sep = strchr(p, ';');
        if (sep == NULL)
            return NULL;
        /* continue right after the separator */
        p = sep + 1;
    }
    return p;
}
87
/*
 * Copy the n-th ';' separated field of 'p' into 'buf' as a NUL terminated
 * string, truncating it to buf_size - 1 characters.
 *
 * A missing field (get_field() returning NULL) yields an empty string
 * instead of dereferencing NULL, and a zero sized buffer is left untouched.
 * Returns 'buf'.
 */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    size_t len = 0;

    if (buf_size == 0)
        return buf;
    p = get_field(p, n);
    if (p) {
        for (; *p != ';' && *p != '\0'; p++) {
            /* keep scanning after the buffer is full: excess is dropped */
            if (len < buf_size - 1)
                buf[len++] = *p;
        }
    }
    buf[len] = '\0';
    return buf;
}
101
/*
 * Append 'c' to the growable int array *pbuf.
 *
 * *psize is the allocated capacity and *plen the number of used elements;
 * both are updated. When the array is full the capacity grows by a factor
 * of 1.5 (and at least to len + 1). On allocation failure an error is
 * printed on stderr and the program exits.
 */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len = *plen;

    if (len >= *psize) {
        int new_size = *psize * 3 / 2;
        int *new_buf;

        if (new_size < len + 1)
            new_size = len + 1;
        /* keep *pbuf untouched until realloc is known to have succeeded */
        new_buf = realloc(*pbuf, sizeof((*pbuf)[0]) * new_size);
        if (!new_buf) {
            fprintf(stderr, "realloc failed. file:%s func:%s line:%d\n",
                    __FILE__, __func__, __LINE__);
            exit(1);
        }
        *pbuf = new_buf;
        *psize = new_size;
    }
    (*pbuf)[len] = c;
    *plen = len + 1;
}
122
/*
 * Parse the n-th ';' separated field of 'str' as a whitespace separated
 * list of hexadecimal numbers.
 *
 * Returns an array built with add_char() (NULL when the field is missing
 * or holds no number) and stores its length in *plen.
 */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p;
    char *end;
    int *buf, len, size;

    p = get_field(str, n);
    if (!p) {
        *plen = 0;
        return NULL;
    }
    len = 0;
    size = 0;
    buf = NULL;
    for(;;) {
        /* <ctype.h> functions require an unsigned char value */
        while (isspace((unsigned char)*p))
            p++;
        if (!isxdigit((unsigned char)*p))
            break;
        add_char(&buf, &size, &len, strtoul(p, &end, 16));
        p = end;
    }
    *plen = len;
    return buf;
}
145
/*
 * Read the next line of 'f' into 'buf' (at most buf_size - 1 characters)
 * and remove the trailing '\n' if there is one.
 * Returns 'buf', or NULL when fgets() returns NULL.
 */
char *get_line(char *buf, int buf_size, FILE *f)
{
    size_t n;

    if (fgets(buf, buf_size, f) == NULL)
        return NULL;
    n = strlen(buf);
    if (n != 0 && buf[n - 1] == '\n')
        buf[n - 1] = '\0';
    return buf;
}
156
157 #define UNICODE_GENERAL_CATEGORY
158
159 typedef enum {
160 #define DEF(id, str) GCAT_ ## id,
161 #include "unicode_gen_def.h"
162 #undef DEF
163 GCAT_COUNT,
164 } UnicodeGCEnum1;
165
166 static const char *unicode_gc_name[] = {
167 #define DEF(id, str) #id,
168 #include "unicode_gen_def.h"
169 #undef DEF
170 };
171
172 static const char *unicode_gc_short_name[] = {
173 #define DEF(id, str) str,
174 #include "unicode_gen_def.h"
175 #undef DEF
176 };
177
178 #undef UNICODE_GENERAL_CATEGORY
179
180 #define UNICODE_SCRIPT
181
182 typedef enum {
183 #define DEF(id, str) SCRIPT_ ## id,
184 #include "unicode_gen_def.h"
185 #undef DEF
186 SCRIPT_COUNT,
187 } UnicodeScriptEnum1;
188
189 static const char *unicode_script_name[] = {
190 #define DEF(id, str) #id,
191 #include "unicode_gen_def.h"
192 #undef DEF
193 };
194
195 const char *unicode_script_short_name[] = {
196 #define DEF(id, str) str,
197 #include "unicode_gen_def.h"
198 #undef DEF
199 };
200
201 #undef UNICODE_SCRIPT
202
203 #define UNICODE_PROP_LIST
204
205 typedef enum {
206 #define DEF(id, str) PROP_ ## id,
207 #include "unicode_gen_def.h"
208 #undef DEF
209 PROP_COUNT,
210 } UnicodePropEnum1;
211
212 static const char *unicode_prop_name[] = {
213 #define DEF(id, str) #id,
214 #include "unicode_gen_def.h"
215 #undef DEF
216 };
217
218 static const char *unicode_prop_short_name[] = {
219 #define DEF(id, str) str,
220 #include "unicode_gen_def.h"
221 #undef DEF
222 };
223
224 #undef UNICODE_SPROP_LIST
225
226 typedef struct {
227 /* case conv */
228 uint8_t u_len;
229 uint8_t l_len;
230 int u_data[CC_LEN_MAX];
231 int l_data[CC_LEN_MAX];
232 int f_code;
233
234 uint8_t combining_class;
235 uint8_t is_compat:1;
236 uint8_t is_excluded:1;
237 uint8_t general_category;
238 uint8_t script;
239 uint8_t script_ext_len;
240 uint8_t *script_ext;
241 uint32_t prop_bitmap_tab[3];
242 /* decomposition */
243 int decomp_len;
244 int *decomp_data;
245 } CCInfo;
246
247 CCInfo *unicode_db;
248
/*
 * Look 'name' up in 'tab', an array of 'tab_len' strings where each string
 * is a ',' separated list of aliases. The comparison is exact.
 * Returns the index of the first entry having a matching alias, or -1.
 */
int find_name(const char **tab, int tab_len, const char *name)
{
    size_t name_len = strlen(name);
    int idx;

    for (idx = 0; idx < tab_len; idx++) {
        const char *alias = tab[idx];
        for (;;) {
            /* length of the current alias, up to the next ',' or the end */
            size_t alias_len = strcspn(alias, ",");
            if (alias_len == name_len && memcmp(alias, name, alias_len) == 0)
                return idx;
            if (alias[alias_len] == '\0')
                break;
            alias += alias_len + 1;
        }
    }
    return -1;
}
272
get_prop(uint32_t c,int prop_idx)273 static int get_prop(uint32_t c, int prop_idx)
274 {
275 return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
276 }
277
set_prop(uint32_t c,int prop_idx,int val)278 static void set_prop(uint32_t c, int prop_idx, int val)
279 {
280 uint32_t mask;
281 mask = 1U << (prop_idx & 0x1f);
282 if (val)
283 unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask;
284 else
285 unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask;
286 }
287
parse_unicode_data(const char * filename)288 void parse_unicode_data(const char *filename)
289 {
290 FILE *f;
291 char line[1024];
292 char buf1[256];
293 const char *p;
294 int code, lc, uc, last_code;
295 CCInfo *ci, *tab = unicode_db;
296
297 f = fopen(filename, "rb");
298 if (!f) {
299 perror(filename);
300 exit(1);
301 }
302
303 last_code = 0;
304 for(;;) {
305 if (!get_line(line, sizeof(line), f))
306 break;
307 p = line;
308 while (isspace(*p))
309 p++;
310 if (*p == '#')
311 continue;
312
313 p = get_field(line, 0);
314 if (!p)
315 continue;
316 code = strtoul(p, NULL, 16);
317 lc = 0;
318 uc = 0;
319
320 p = get_field(line, 12);
321 if (p && *p != ';') {
322 uc = strtoul(p, NULL, 16);
323 }
324
325 p = get_field(line, 13);
326 if (p && *p != ';') {
327 lc = strtoul(p, NULL, 16);
328 }
329 ci = &tab[code];
330 if (uc > 0 || lc > 0) {
331 assert(code <= CHARCODE_MAX);
332 if (uc > 0) {
333 assert(ci->u_len == 0);
334 ci->u_len = 1;
335 ci->u_data[0] = uc;
336 }
337 if (lc > 0) {
338 assert(ci->l_len == 0);
339 ci->l_len = 1;
340 ci->l_data[0] = lc;
341 }
342 }
343
344 {
345 int i;
346 get_field_buf(buf1, sizeof(buf1), line, 2);
347 i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
348 if (i < 0) {
349 fprintf(stderr, "General category '%s' not found\n",
350 buf1);
351 exit(1);
352 }
353 ci->general_category = i;
354 }
355
356 p = get_field(line, 3);
357 if (p && *p != ';' && *p != '\0') {
358 int cc;
359 cc = strtoul(p, NULL, 0);
360 if (cc != 0) {
361 assert(code <= CHARCODE_MAX);
362 ci->combining_class = cc;
363 // printf("%05x: %d\n", code, ci->combining_class);
364 }
365 }
366
367 p = get_field(line, 5);
368 if (p && *p != ';' && *p != '\0') {
369 int size;
370 assert(code <= CHARCODE_MAX);
371 ci->is_compat = 0;
372 if (*p == '<') {
373 while (*p != '\0' && *p != '>')
374 p++;
375 if (*p == '>')
376 p++;
377 ci->is_compat = 1;
378 }
379 size = 0;
380 for(;;) {
381 while (isspace(*p))
382 p++;
383 if (!isxdigit(*p))
384 break;
385 add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16));
386 }
387 #if 0
388 {
389 int i;
390 static int count, d_count;
391
392 printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
393 for(i = 0; i < ci->decomp_len; i++)
394 printf(" %05x", ci->decomp_data[i]);
395 printf("\n");
396 count++;
397 d_count += ci->decomp_len;
398 // printf("%d %d\n", count, d_count);
399 }
400 #endif
401 }
402
403 p = get_field(line, 9);
404 if (p && *p == 'Y') {
405 set_prop(code, PROP_Bidi_Mirrored, 1);
406 }
407
408 /* handle ranges */
409 get_field_buf(buf1, sizeof(buf1), line, 1);
410 if (strstr(buf1, " Last>")) {
411 int i;
412 // printf("range: 0x%x-%0x\n", last_code, code);
413 assert(ci->decomp_len == 0);
414 assert(ci->script_ext_len == 0);
415 for(i = last_code + 1; i < code; i++) {
416 unicode_db[i] = *ci;
417 }
418 }
419 last_code = code;
420 }
421
422 fclose(f);
423 }
424
/*
 * Read 'filename', a file of ';' separated records whose first field is a
 * hexadecimal code point, and store the multi code point lower case
 * (field 1) and upper case (field 3) mappings in 'tab'.
 *
 * Records whose field 4 holds anything but a comment are skipped (locale
 * dependent casing). A mapping equal to the code point itself is stored as
 * "no mapping". Exits when the file cannot be opened.
 */
void parse_special_casing(CCInfo *tab, const char *filename)
{
    FILE *f;
    char line[1024];
    const char *p;
    char *end;
    int code;
    CCInfo *ci;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }

    for(;;) {
        if (!get_line(line, sizeof(line), f))
            break;
        p = line;
        /* <ctype.h> functions require an unsigned char value */
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#')
            continue;

        p = get_field(line, 0);
        if (!p)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code <= CHARCODE_MAX);
        ci = &tab[code];

        p = get_field(line, 4);
        if (p) {
            /* locale dependent casing */
            while (isspace((unsigned char)*p))
                p++;
            if (*p != '#' && *p != '\0')
                continue;
        }

        p = get_field(line, 1);
        if (p && *p != ';') {
            ci->l_len = 0;
            for(;;) {
                while (isspace((unsigned char)*p))
                    p++;
                /* stop on ';' and on anything strtoul() could not consume,
                   otherwise a malformed field would never terminate */
                if (!isxdigit((unsigned char)*p))
                    break;
                assert(ci->l_len < CC_LEN_MAX);
                ci->l_data[ci->l_len++] = strtoul(p, &end, 16);
                p = end;
            }

            if (ci->l_len == 1 && ci->l_data[0] == code)
                ci->l_len = 0;
        }

        p = get_field(line, 3);
        if (p && *p != ';') {
            ci->u_len = 0;
            for(;;) {
                while (isspace((unsigned char)*p))
                    p++;
                if (!isxdigit((unsigned char)*p))
                    break;
                assert(ci->u_len < CC_LEN_MAX);
                ci->u_data[ci->u_len++] = strtoul(p, &end, 16);
                p = end;
            }

            if (ci->u_len == 1 && ci->u_data[0] == code)
                ci->u_len = 0;
        }
    }

    fclose(f);
}
500
/*
 * Read 'filename', a file of ';' separated records (code point; status;
 * mapping) and store the mapping of the records whose status starts with
 * 'C' or 'S' in the f_code member of 'tab'. Other records are ignored.
 * Exits when the file cannot be opened.
 */
void parse_case_folding(CCInfo *tab, const char *filename)
{
    FILE *f;
    char line[1024];
    const char *p;
    int code;
    CCInfo *ci;

    f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f)) {
        p = line;
        /* <ctype.h> functions require an unsigned char value */
        while (isspace((unsigned char)*p))
            p++;
        if (*p == '#')
            continue;

        p = get_field(line, 0);
        if (!p)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code <= CHARCODE_MAX);
        ci = &tab[code];

        p = get_field(line, 1);
        if (!p)
            continue;
        /* locale dependent casing */
        while (isspace((unsigned char)*p))
            p++;
        if (*p != 'C' && *p != 'S')
            continue;

        p = get_field(line, 2);
        assert(p != 0);
        /* a code point may only get one folding */
        assert(ci->f_code == 0);
        ci->f_code = strtoul(p, NULL, 16);
        assert(ci->f_code != 0 && ci->f_code != code);
    }

    fclose(f);
}
549
parse_composition_exclusions(const char * filename)550 void parse_composition_exclusions(const char *filename)
551 {
552 FILE *f;
553 char line[4096], *p;
554 uint32_t c0;
555
556 f = fopen(filename, "rb");
557 if (!f) {
558 perror(filename);
559 exit(1);
560 }
561
562 for(;;) {
563 if (!get_line(line, sizeof(line), f))
564 break;
565 p = line;
566 while (isspace(*p))
567 p++;
568 if (*p == '#' || *p == '@' || *p == '\0')
569 continue;
570 c0 = strtoul(p, (char **)&p, 16);
571 assert(c0 > 0 && c0 <= CHARCODE_MAX);
572 unicode_db[c0].is_excluded = TRUE;
573 }
574 fclose(f);
575 }
576
parse_derived_core_properties(const char * filename)577 void parse_derived_core_properties(const char *filename)
578 {
579 FILE *f;
580 char line[4096], *p, buf[256], *q;
581 uint32_t c0, c1, c;
582 int i;
583
584 f = fopen(filename, "rb");
585 if (!f) {
586 perror(filename);
587 exit(1);
588 }
589
590 for(;;) {
591 if (!get_line(line, sizeof(line), f))
592 break;
593 p = line;
594 while (isspace(*p))
595 p++;
596 if (*p == '#' || *p == '@' || *p == '\0')
597 continue;
598 c0 = strtoul(p, (char **)&p, 16);
599 if (*p == '.' && p[1] == '.') {
600 p += 2;
601 c1 = strtoul(p, (char **)&p, 16);
602 } else {
603 c1 = c0;
604 }
605 assert(c1 <= CHARCODE_MAX);
606 p += strspn(p, " \t");
607 if (*p == ';') {
608 p++;
609 p += strspn(p, " \t");
610 q = buf;
611 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
612 if ((q - buf) < sizeof(buf) - 1)
613 *q++ = *p;
614 p++;
615 }
616 *q = '\0';
617 i = find_name(unicode_prop_name,
618 countof(unicode_prop_name), buf);
619 if (i < 0) {
620 if (!strcmp(buf, "Grapheme_Link"))
621 goto next;
622 fprintf(stderr, "Property not found: %s\n", buf);
623 exit(1);
624 }
625 for(c = c0; c <= c1; c++) {
626 set_prop(c, i, 1);
627 }
628 next: ;
629 }
630 }
631 fclose(f);
632 }
633
parse_derived_norm_properties(const char * filename)634 void parse_derived_norm_properties(const char *filename)
635 {
636 FILE *f;
637 char line[4096], *p, buf[256], *q;
638 uint32_t c0, c1, c;
639
640 f = fopen(filename, "rb");
641 if (!f) {
642 perror(filename);
643 exit(1);
644 }
645
646 for(;;) {
647 if (!get_line(line, sizeof(line), f))
648 break;
649 p = line;
650 while (isspace(*p))
651 p++;
652 if (*p == '#' || *p == '@' || *p == '\0')
653 continue;
654 c0 = strtoul(p, (char **)&p, 16);
655 if (*p == '.' && p[1] == '.') {
656 p += 2;
657 c1 = strtoul(p, (char **)&p, 16);
658 } else {
659 c1 = c0;
660 }
661 assert(c1 <= CHARCODE_MAX);
662 p += strspn(p, " \t");
663 if (*p == ';') {
664 p++;
665 p += strspn(p, " \t");
666 q = buf;
667 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
668 if ((q - buf) < sizeof(buf) - 1)
669 *q++ = *p;
670 p++;
671 }
672 *q = '\0';
673 if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
674 for(c = c0; c <= c1; c++) {
675 set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
676 }
677 }
678 }
679 }
680 fclose(f);
681 }
682
parse_prop_list(const char * filename)683 void parse_prop_list(const char *filename)
684 {
685 FILE *f;
686 char line[4096], *p, buf[256], *q;
687 uint32_t c0, c1, c;
688 int i;
689
690 f = fopen(filename, "rb");
691 if (!f) {
692 perror(filename);
693 exit(1);
694 }
695
696 for(;;) {
697 if (!get_line(line, sizeof(line), f))
698 break;
699 p = line;
700 while (isspace(*p))
701 p++;
702 if (*p == '#' || *p == '@' || *p == '\0')
703 continue;
704 c0 = strtoul(p, (char **)&p, 16);
705 if (*p == '.' && p[1] == '.') {
706 p += 2;
707 c1 = strtoul(p, (char **)&p, 16);
708 } else {
709 c1 = c0;
710 }
711 assert(c1 <= CHARCODE_MAX);
712 p += strspn(p, " \t");
713 if (*p == ';') {
714 p++;
715 p += strspn(p, " \t");
716 q = buf;
717 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
718 if ((q - buf) < sizeof(buf) - 1)
719 *q++ = *p;
720 p++;
721 }
722 *q = '\0';
723 i = find_name(unicode_prop_name,
724 countof(unicode_prop_name), buf);
725 if (i < 0) {
726 fprintf(stderr, "Property not found: %s\n", buf);
727 exit(1);
728 }
729 for(c = c0; c <= c1; c++) {
730 set_prop(c, i, 1);
731 }
732 }
733 }
734 fclose(f);
735 }
736
parse_scripts(const char * filename)737 void parse_scripts(const char *filename)
738 {
739 FILE *f;
740 char line[4096], *p, buf[256], *q;
741 uint32_t c0, c1, c;
742 int i;
743
744 f = fopen(filename, "rb");
745 if (!f) {
746 perror(filename);
747 exit(1);
748 }
749
750 for(;;) {
751 if (!get_line(line, sizeof(line), f))
752 break;
753 p = line;
754 while (isspace(*p))
755 p++;
756 if (*p == '#' || *p == '@' || *p == '\0')
757 continue;
758 c0 = strtoul(p, (char **)&p, 16);
759 if (*p == '.' && p[1] == '.') {
760 p += 2;
761 c1 = strtoul(p, (char **)&p, 16);
762 } else {
763 c1 = c0;
764 }
765 assert(c1 <= CHARCODE_MAX);
766 p += strspn(p, " \t");
767 if (*p == ';') {
768 p++;
769 p += strspn(p, " \t");
770 q = buf;
771 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
772 if ((q - buf) < sizeof(buf) - 1)
773 *q++ = *p;
774 p++;
775 }
776 *q = '\0';
777 i = find_name(unicode_script_name,
778 countof(unicode_script_name), buf);
779 if (i < 0) {
780 fprintf(stderr, "Unknown script: '%s'\n", buf);
781 exit(1);
782 }
783 for(c = c0; c <= c1; c++)
784 unicode_db[c].script = i;
785 }
786 }
787 fclose(f);
788 }
789
parse_script_extensions(const char * filename)790 void parse_script_extensions(const char *filename)
791 {
792 FILE *f;
793 char line[4096], *p, buf[256], *q;
794 uint32_t c0, c1, c;
795 int i;
796 uint8_t script_ext[255];
797 int script_ext_len;
798
799 f = fopen(filename, "rb");
800 if (!f) {
801 perror(filename);
802 exit(1);
803 }
804
805 for(;;) {
806 if (!get_line(line, sizeof(line), f))
807 break;
808 p = line;
809 while (isspace(*p))
810 p++;
811 if (*p == '#' || *p == '@' || *p == '\0')
812 continue;
813 c0 = strtoul(p, (char **)&p, 16);
814 if (*p == '.' && p[1] == '.') {
815 p += 2;
816 c1 = strtoul(p, (char **)&p, 16);
817 } else {
818 c1 = c0;
819 }
820 assert(c1 <= CHARCODE_MAX);
821 p += strspn(p, " \t");
822 script_ext_len = 0;
823 if (*p == ';') {
824 p++;
825 for(;;) {
826 p += strspn(p, " \t");
827 q = buf;
828 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
829 if ((q - buf) < sizeof(buf) - 1)
830 *q++ = *p;
831 p++;
832 }
833 *q = '\0';
834 if (buf[0] == '\0')
835 break;
836 i = find_name(unicode_script_short_name,
837 countof(unicode_script_short_name), buf);
838 if (i < 0) {
839 fprintf(stderr, "Script not found: %s\n", buf);
840 exit(1);
841 }
842 assert(script_ext_len < sizeof(script_ext));
843 script_ext[script_ext_len++] = i;
844 }
845 for(c = c0; c <= c1; c++) {
846 CCInfo *ci = &unicode_db[c];
847 ci->script_ext_len = script_ext_len;
848 ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
849 for(i = 0; i < script_ext_len; i++)
850 ci->script_ext[i] = script_ext[i];
851 }
852 }
853 }
854 fclose(f);
855 }
856
/* Print on stdout the case conversion data of 'ci' (upper case, lower case
   and folding, each only when present), 'i' being its code point. */
void dump_cc_info(CCInfo *ci, int i)
{
    int k;

    printf("%05x:", i);
    if (ci->u_len != 0) {
        printf(" U:");
        k = 0;
        while (k < ci->u_len) {
            printf(" %05x", ci->u_data[k]);
            k++;
        }
    }
    if (ci->l_len != 0) {
        printf(" L:");
        k = 0;
        while (k < ci->l_len) {
            printf(" %05x", ci->l_data[k]);
            k++;
        }
    }
    if (ci->f_code != 0)
        printf(" F: %05x", ci->f_code);
    printf("\n");
}
876
/* Call dump_cc_info() for every code point of 'tab' that has an upper case
   mapping, a lower case mapping or a case folding. */
void dump_data(CCInfo *tab)
{
    int code;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        CCInfo *entry = &tab[code];

        if (entry->u_len == 0 && entry->l_len == 0 && entry->f_code == 0)
            continue;
        dump_cc_info(entry, code);
    }
}
888
is_complicated_case(const CCInfo * ci)889 BOOL is_complicated_case(const CCInfo *ci)
890 {
891 return (ci->u_len > 1 || ci->l_len > 1 ||
892 (ci->u_len > 0 && ci->l_len > 0) ||
893 (ci->f_code != 0) != ci->l_len ||
894 (ci->f_code != 0 && ci->l_data[0] != ci->f_code));
895 }
896
/* Run types of the case conversion table. With USE_TEST the enum is not
   defined here (libunicode.c is included in that configuration). */
#ifndef USE_TEST
enum {
    RUN_TYPE_U = 0,
    RUN_TYPE_L,
    RUN_TYPE_UF,
    RUN_TYPE_LF,
    RUN_TYPE_UL,
    RUN_TYPE_LSU,
    RUN_TYPE_U2L_399_EXT2,
    RUN_TYPE_UF_D20,
    RUN_TYPE_UF_D1_EXT,
    RUN_TYPE_U_EXT,
    RUN_TYPE_LF_EXT,
    RUN_TYPE_U_EXT2,
    RUN_TYPE_L_EXT2,
    RUN_TYPE_U_EXT3,
};
#endif

/* printable names of the run types, in the same order as the enum */
const char *run_type_str[] = {
    "U",                /* RUN_TYPE_U */
    "L",                /* RUN_TYPE_L */
    "UF",               /* RUN_TYPE_UF */
    "LF",               /* RUN_TYPE_LF */
    "UL",               /* RUN_TYPE_UL */
    "LSU",              /* RUN_TYPE_LSU */
    "U2L_399_EXT2",     /* RUN_TYPE_U2L_399_EXT2 */
    "UF_D20",           /* RUN_TYPE_UF_D20 */
    "UF_D1_EXT",        /* RUN_TYPE_UF_D1_EXT */
    "U_EXT",            /* RUN_TYPE_U_EXT */
    "LF_EXT",           /* RUN_TYPE_LF_EXT */
    "U_EXT2",           /* RUN_TYPE_U_EXT2 */
    "L_EXT2",           /* RUN_TYPE_L_EXT2 */
    "U_EXT3",           /* RUN_TYPE_U_EXT3 */
};
932
/* One run of the case conversion table. */
typedef struct TableEntry {
    int code;           /* first code point of the run */
    int len;            /* number of code points in the run */
    int type;           /* RUN_TYPE_x */
    int data;           /* first mapped code point for the simple run types */
    int ext_len;        /* number of valid entries in ext_data */
    int ext_data[3];
    int data_index; /* 'data' coming from the table */
} TableEntry;

/* code (17), len (7), type (4) */
944
/*
 * Determine the run of the case conversion table that starts at 'code' and
 * describe it in 'te' (code, len, type and either data or ext_data /
 * ext_len). The caller is expected to pass a code point that has some case
 * conversion data and a 'te' whose ext_len is initially zero.
 *
 * Aborts when the conversion data of 'code' matches none of the supported
 * run types.
 *
 * All look-aheads are bounded so that no entry beyond CHARCODE_MAX is read.
 */
void find_run_type(TableEntry *te, CCInfo *tab, int code)
{
    int is_lower, len;
    CCInfo *ci, *ci1, *ci2;

    ci = &tab[code];
    te->code = code;

    /* triplet: the first two code points lower case to the third one and
       the last two upper case to the first one */
    if (code + 2 <= CHARCODE_MAX) {
        ci1 = &tab[code + 1];
        ci2 = &tab[code + 2];
        if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
            ci->f_code == ci->l_data[0] &&
            ci->u_len == 0 &&

            ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
            ci1->f_code == ci1->l_data[0] &&
            ci1->u_len == 1 && ci1->u_data[0] == code &&

            ci2->l_len == 0 &&
            ci2->f_code == 0 &&
            ci2->u_len == 1 && ci2->u_data[0] == code) {
            te->len = 3;
            te->data = 0;
            te->type = RUN_TYPE_LSU;
            return;
        }
    }

    if (is_complicated_case(ci)) {
        /* run of code points whose upper case mapping is also their
           folding */
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (ci1->u_len != 1 ||
                ci1->u_data[0] != ci->u_data[0] + len ||
                ci1->l_len != 0 ||
                ci1->f_code != ci1->u_data[0])
                break;
            len++;
        }
        if (len > 1) {
            te->len = len;
            te->type = RUN_TYPE_UF;
            te->data = ci->u_data[0];
            return;
        }

        /* upper case is two code points, the second one being U+0399 */
        if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->f_code == 0 && ci->l_len == 0) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == 0x399 &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->f_code == 0 &&
                      ci1->l_len == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_U_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
            return;
        }

        /* same with an additional lower case mapping equal to the folding */
        if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
            ci->l_len == 1 && ci->f_code == ci->l_data[0]) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->u_len == 2 &&
                      ci1->u_data[1] == 0x399 &&
                      ci1->u_data[0] == ci->u_data[0] + len &&
                      ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->f_code == ci1->l_data[0]))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_U2L_399_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->l_data[0];
            te->ext_len = 2;
            return;
        }

        /* lower case mapping without folding */
        if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) {
            len = 1;
            while (code + len <= CHARCODE_MAX) {
                ci1 = &tab[code + len];
                if (!(ci1->l_len == 1 &&
                      ci1->l_data[0] == ci->l_data[0] + len &&
                      ci1->u_len == 0 && ci1->f_code == 0))
                    break;
                len++;
            }
            te->len = len;
            te->type = RUN_TYPE_L;
            te->data = ci->l_data[0];
            return;
        }

        /* remaining cases are encoded as runs of length 1 */
        if (ci->l_len == 0 &&
            ci->u_len == 1 &&
            ci->u_data[0] < 0x1000 &&
            ci->f_code == ci->u_data[0] + 0x20) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D20;
            te->data = ci->u_data[0];
        } else if (ci->l_len == 0 &&
            ci->u_len == 1 &&
            ci->f_code == ci->u_data[0] + 1) {
            te->len = 1;
            te->type = RUN_TYPE_UF_D1_EXT;
            te->ext_data[0] = ci->u_data[0];
            te->ext_len = 1;
        } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) {
            te->len = 1;
            te->type = RUN_TYPE_L_EXT2;
            te->ext_data[0] = ci->l_data[0];
            te->ext_data[1] = ci->l_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) {
            te->len = 1;
            te->type = RUN_TYPE_U_EXT2;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_len = 2;
        } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) {
            te->len = 1;
            te->type = RUN_TYPE_U_EXT3;
            te->ext_data[0] = ci->u_data[0];
            te->ext_data[1] = ci->u_data[1];
            te->ext_data[2] = ci->u_data[2];
            te->ext_len = 3;
        } else {
            printf("unsupported encoding case:\n");
            dump_cc_info(ci, code);
            abort();
        }
    } else {
        /* look for a run of identical conversions */
        len = 0;
        for(;;) {
            /* tab[code + len + 1] is read below: it must exist */
            if (code + len >= CHARCODE_MAX || len >= 126)
                break;
            ci = &tab[code + len];
            ci1 = &tab[code + len + 1];
            if (is_complicated_case(ci) || is_complicated_case(ci1)) {
                break;
            }
            if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
                break;
            if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
                break;
            len += 2;
        }
        if (len > 0) {
            te->len = len;
            te->type = RUN_TYPE_UL;
            te->data = 0;
            return;
        }

        /* run of consecutive lower case (or upper case) mappings */
        ci = &tab[code];
        is_lower = ci->l_len > 0;
        len = 1;
        while (code + len <= CHARCODE_MAX) {
            ci1 = &tab[code + len];
            if (is_complicated_case(ci1))
                break;
            if (is_lower) {
                if (ci1->l_len != 1 ||
                    ci1->l_data[0] != ci->l_data[0] + len)
                    break;
            } else {
                if (ci1->u_len != 1 ||
                    ci1->u_data[0] != ci->u_data[0] + len)
                    break;
            }
            len++;
        }
        te->len = len;
        if (is_lower) {
            te->type = RUN_TYPE_LF;
            te->data = ci->l_data[0];
        } else {
            te->type = RUN_TYPE_U;
            te->data = ci->u_data[0];
        }
    }
}
1139
1140 TableEntry conv_table[1000];
1141 int conv_table_len;
1142 int ext_data[1000];
1143 int ext_data_len;
1144
dump_case_conv_table1(void)1145 void dump_case_conv_table1(void)
1146 {
1147 int i, j;
1148 const TableEntry *te;
1149
1150 for(i = 0; i < conv_table_len; i++) {
1151 te = &conv_table[i];
1152 printf("%05x %02x %-10s %05x",
1153 te->code, te->len, run_type_str[te->type], te->data);
1154 for(j = 0; j < te->ext_len; j++) {
1155 printf(" %05x", te->ext_data[j]);
1156 }
1157 printf("\n");
1158 }
1159 printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len);
1160 }
1161
find_data_index(const TableEntry * conv_table,int len,int data)1162 int find_data_index(const TableEntry *conv_table, int len, int data)
1163 {
1164 int i;
1165 const TableEntry *te;
1166 for(i = 0; i < len; i++) {
1167 te = &conv_table[i];
1168 if (te->code == data)
1169 return i;
1170 }
1171 return -1;
1172 }
1173
find_ext_data_index(int data)1174 int find_ext_data_index(int data)
1175 {
1176 int i;
1177 for(i = 0; i < ext_data_len; i++) {
1178 if (ext_data[i] == data)
1179 return i;
1180 }
1181 assert(ext_data_len < countof(ext_data));
1182 ext_data[ext_data_len++] = data;
1183 return ext_data_len - 1;
1184 }
1185
/*
 * Fill conv_table / conv_table_len with the runs covering every code point
 * of 'tab' that has case conversion data, then compute the data_index of
 * each run:
 *  - U, L, UF and LF runs reference the table entry starting at their
 *    'data' code point; U and LF runs without such an entry are turned
 *    into U_EXT / LF_EXT, other ones are a fatal error;
 *  - UF_D20 runs store 'data' directly;
 *  - EXT3 runs pack three 4 bit ext_data indexes, EXT2 runs two 6 bit
 *    indexes, the single EXT runs store one index.
 * The ext_data indexes are allocated in that order (EXT3, EXT2, EXT).
 */
void build_conv_table(CCInfo *tab)
{
    int code, i, j, count;
    CCInfo *ci;
    TableEntry *te;

    count = 0;
    for(code = 0; code <= CHARCODE_MAX; code++) {
        ci = &tab[code];
        if (ci->u_len == 0 && ci->l_len == 0 && ci->f_code == 0)
            continue;
        assert(count < countof(conv_table));
        te = &conv_table[count];
        find_run_type(te, tab, code);
#if 0
        if (te->type == RUN_TYPE_TODO) {
            printf("TODO: ");
            dump_cc_info(ci, code);
        }
#endif
        assert(te->len <= 127);
        /* skip the code points covered by the run */
        code += te->len - 1;
        count++;
    }
    conv_table_len = count;

    /* find the data index */
    for(i = 0; i < conv_table_len; i++) {
        int data_index;
        te = &conv_table[i];

        switch(te->type) {
        case RUN_TYPE_U:
        case RUN_TYPE_L:
        case RUN_TYPE_UF:
        case RUN_TYPE_LF:
            data_index = find_data_index(conv_table, conv_table_len, te->data);
            if (data_index >= 0) {
                te->data_index = data_index;
            } else if (te->type == RUN_TYPE_U) {
                te->type = RUN_TYPE_U_EXT;
                te->ext_len = 1;
                te->ext_data[0] = te->data;
            } else if (te->type == RUN_TYPE_LF) {
                te->type = RUN_TYPE_LF_EXT;
                te->ext_len = 1;
                te->ext_data[0] = te->data;
            } else {
                printf("%05x: index not found\n", te->code);
                exit(1);
            }
            break;
        case RUN_TYPE_UF_D20:
            te->data_index = te->data;
            break;
        }
    }

    /* find the data index for ext_data */
    for(i = 0; i < conv_table_len; i++) {
        te = &conv_table[i];
        if (te->type != RUN_TYPE_U_EXT3)
            continue;
        {
            int p, v;
            v = 0;
            for(j = 0; j < 3; j++) {
                p = find_ext_data_index(te->ext_data[j]);
                assert(p < 16);
                v = (v << 4) | p;
            }
            te->data_index = v;
        }
    }

    for(i = 0; i < conv_table_len; i++) {
        te = &conv_table[i];
        if (te->type != RUN_TYPE_L_EXT2 &&
            te->type != RUN_TYPE_U_EXT2 &&
            te->type != RUN_TYPE_U2L_399_EXT2)
            continue;
        {
            int p, v;
            v = 0;
            for(j = 0; j < 2; j++) {
                p = find_ext_data_index(te->ext_data[j]);
                assert(p < 64);
                v = (v << 6) | p;
            }
            te->data_index = v;
        }
    }

    for(i = 0; i < conv_table_len; i++) {
        te = &conv_table[i];
        switch(te->type) {
        case RUN_TYPE_UF_D1_EXT:
        case RUN_TYPE_U_EXT:
        case RUN_TYPE_LF_EXT:
            te->data_index = find_ext_data_index(te->ext_data[0]);
            break;
        default:
            break;
        }
    }
#ifdef DUMP_CASE_CONV_TABLE
    dump_case_conv_table1();
#endif
}
1291
dump_case_conv_table(FILE * f)1292 void dump_case_conv_table(FILE *f)
1293 {
1294 int i;
1295 uint32_t v;
1296 const TableEntry *te;
1297
1298 fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len);
1299 for(i = 0; i < conv_table_len; i++) {
1300 if (i % 4 == 0)
1301 fprintf(f, "\n ");
1302 te = &conv_table[i];
1303 v = te->code << (32 - 17);
1304 v |= te->len << (32 - 17 - 7);
1305 v |= te->type << (32 - 17 - 7 - 4);
1306 v |= te->data_index >> 8;
1307 fprintf(f, " 0x%08x,", v);
1308 }
1309 fprintf(f, "\n};\n\n");
1310
1311 fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len);
1312 for(i = 0; i < conv_table_len; i++) {
1313 if (i % 8 == 0)
1314 fprintf(f, "\n ");
1315 te = &conv_table[i];
1316 fprintf(f, " 0x%02x,", te->data_index & 0xff);
1317 }
1318 fprintf(f, "\n};\n\n");
1319
1320 fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len);
1321 for(i = 0; i < ext_data_len; i++) {
1322 if (i % 8 == 0)
1323 fprintf(f, "\n ");
1324 fprintf(f, " 0x%04x,", ext_data[i]);
1325 }
1326 fprintf(f, "\n};\n\n");
1327 }
1328
/* Compare the first 'n' elements of two int arrays: return 0 when they are
   all equal (or n <= 0), -1 otherwise. */
int tabcmp(const int *tab1, const int *tab2, int n)
{
    int left = n;

    while (left > 0) {
        if (*tab1 != *tab2)
            return -1;
        tab1++;
        tab2++;
        left--;
    }
    return 0;
}
1338
/* Print "str=" followed by the 'len' values of 'buf' in hexadecimal and a
   newline on stdout. */
void dump_str(const char *str, const int *buf, int len)
{
    int k = 0;

    printf("%s=", str);
    while (k < len) {
        printf(" %05x", buf[k]);
        k++;
    }
    printf("\n");
}
1347
compute_internal_props(void)1348 void compute_internal_props(void)
1349 {
1350 int i;
1351 BOOL has_ul;
1352
1353 for(i = 0; i <= CHARCODE_MAX; i++) {
1354 CCInfo *ci = &unicode_db[i];
1355 has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0);
1356 if (has_ul) {
1357 assert(get_prop(i, PROP_Cased));
1358 } else {
1359 set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased));
1360 }
1361 set_prop(i, PROP_ID_Continue1,
1362 get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1));
1363 set_prop(i, PROP_XID_Start1,
1364 get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start));
1365 set_prop(i, PROP_XID_Continue1,
1366 get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue));
1367 set_prop(i, PROP_Changes_When_Titlecased1,
1368 get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
1369 set_prop(i, PROP_Changes_When_Casefolded1,
1370 get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_code != 0));
1371 /* XXX: reduce table size (438 bytes) */
1372 set_prop(i, PROP_Changes_When_NFKC_Casefolded1,
1373 get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_code != 0));
1374 #if 0
1375 /* TEST */
1376 #define M(x) (1U << GCAT_ ## x)
1377 {
1378 int b;
1379 b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >>
1380 unicode_db[i].general_category) & 1;
1381 set_prop(i, PROP_Cased1,
1382 get_prop(i, PROP_Case_Ignorable) ^ b);
1383 }
1384 #undef M
1385 #endif
1386 }
1387 }
1388
/* Write the 'len' bytes of 'tab' to 'f' as the C definition of a
   static const uint8_t array named 'cname', eight values per line. */
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
{
    int pos = 0;

    fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
    while (pos < len) {
        /* start a new line every 8 values */
        if ((pos % 8) == 0)
            fputs("\n ", f);
        fprintf(f, " 0x%02x,", tab[pos]);
        pos++;
    }
    fputs("\n};\n\n", f);
}
1400
/* an index entry is added each time the encoded property table grows
   by this many bytes (see build_prop_table()) */
#define PROP_BLOCK_LEN 32

/*
 * Encode the property 'prop_index' of all the code points as a run
 * length table and write it to 'f' as the byte array
 * unicode_prop_<name>_table. If 'add_index' is true, an index is also
 * written as unicode_prop_<name>_index.
 *
 * The table only stores the lengths of the runs of identical property
 * values. The first run must have the value 0 (asserted below) and a
 * trailing run of zeros is not stored. With v = run length - 1:
 *   - 0x00..0x3f: two consecutive runs with v < 8, 3 bits each
 *   - 0x80 + v: a single run with v < 128
 *   - 0x40 + (v >> 8), v: a single run with v < 2^13 (2 bytes)
 *   - 0x60 + (v >> 16), v >> 8, v: a single run with v < 2^21 (3 bytes)
 *
 * Each index entry is 3 bytes (least significant byte first) holding
 * code | (offset << 21), where 'code' is the first code point of the
 * next run and 'offset' the number of table bytes past the block
 * boundary. A last entry holding the final code point ends the index.
 */
void build_prop_table(FILE *f, int prop_index, BOOL add_index)
{
    int i, j, n, v, offset, code;
    DynBuf dbuf_s, *dbuf = &dbuf_s;
    DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
    DynBuf dbuf2_s, *dbuf2 = &dbuf2_s;
    const uint32_t *buf;
    int buf_len, block_end_pos, bit;
    char cname[128];

    dbuf_init(dbuf1);

    /* first pass: store (run length - 1) of each run as a 32 bit value
       in dbuf1 */
    for(i = 0; i <= CHARCODE_MAX;) {
        v = get_prop(i, prop_index);
        j = i + 1;
        while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) {
            j++;
        }
        n = j - i;
        if (j == (CHARCODE_MAX + 1) && v == 0)
            break; /* no need to encode last zero run */
        //printf("%05x: %d %d\n", i, n, v);
        dbuf_put_u32(dbuf1, n - 1);
        i += n;
    }

    /* dbuf receives the encoded table, dbuf2 the index */
    dbuf_init(dbuf);
    dbuf_init(dbuf2);
    buf = (uint32_t *)dbuf1->buf;
    buf_len = dbuf1->size / sizeof(buf[0]);

    /* the first value is assumed to be 0 */
    assert(get_prop(0, prop_index) == 0);

    block_end_pos = PROP_BLOCK_LEN;
    i = 0;
    code = 0; /* first code point of the run about to be encoded */
    bit = 0;  /* toggled for each run; 0 when the next run starts from
                 the initial (zero) value */
    while (i < buf_len) {
        /* index entries are only added in front of a run for which
           'bit' is 0 */
        if (add_index && dbuf->size >= block_end_pos && bit == 0) {
            offset = (dbuf->size - block_end_pos);
            /* XXX: offset could be larger in case of runs of small
               lengths. Could add code to change the encoding to
               prevent it at the expense of one byte loss */
            assert(offset <= 7);
            v = code | (offset << 21);
            dbuf_putc(dbuf2, v);
            dbuf_putc(dbuf2, v >> 8);
            dbuf_putc(dbuf2, v >> 16);
            block_end_pos += PROP_BLOCK_LEN;
        }

        v = buf[i];
        code += v + 1;
        bit ^= 1;
        if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) {
            /* two short runs packed in a single byte */
            code += buf[i + 1] + 1;
            bit ^= 1;
            dbuf_putc(dbuf, (v << 3) | buf[i + 1]);
            i += 2;
        } else if (v < 128) {
            dbuf_putc(dbuf, 0x80 + v);
            i++;
        } else if (v < (1 << 13)) {
            dbuf_putc(dbuf, 0x40 + (v >> 8));
            dbuf_putc(dbuf, v);
            i++;
        } else {
            assert(v < (1 << 21));
            dbuf_putc(dbuf, 0x60 + (v >> 16));
            dbuf_putc(dbuf, v >> 8);
            dbuf_putc(dbuf, v);
            i++;
        }
    }

    if (add_index) {
        /* last index entry */
        v = code;
        dbuf_putc(dbuf2, v);
        dbuf_putc(dbuf2, v >> 8);
        dbuf_putc(dbuf2, v >> 16);
    }

#ifdef DUMP_TABLE_SIZE
    printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index],
           (int)(dbuf->size + dbuf2->size));
#endif
    snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]);
    dump_byte_table(f, cname, dbuf->buf, dbuf->size);
    if (add_index) {
        snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
        dump_byte_table(f, cname, dbuf2->buf, dbuf2->size);
    }

    dbuf_free(dbuf);
    dbuf_free(dbuf1);
    dbuf_free(dbuf2);
}
1502
build_flags_tables(FILE * f)1503 void build_flags_tables(FILE *f)
1504 {
1505 build_prop_table(f, PROP_Cased1, TRUE);
1506 build_prop_table(f, PROP_Case_Ignorable, TRUE);
1507 build_prop_table(f, PROP_ID_Start, TRUE);
1508 build_prop_table(f, PROP_ID_Continue1, TRUE);
1509 }
1510
/*
 * Write to 'f' the definition of the char array 'cname' made of the
 * concatenation of 'len' strings, each one followed by an explicit
 * "\0". Entry i is tab_name[i], followed by "," and tab_short_name[i]
 * when the short name is not empty. The "\0" parts are aligned on the
 * widest entry.
 */
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
                     const char **tab_short_name)
{
    int k, width, widest;

    /* width of the widest "name[,short_name]" entry */
    widest = 0;
    for(k = 0; k < len; k++) {
        const char *short_name = tab_short_name[k];

        width = strlen(tab_name[k]);
        if (*short_name != '\0')
            width += 1 + strlen(short_name);
        if (width > widest)
            widest = width;
    }

    /* generate a sequence of strings terminated by an empty string */
    fprintf(f, "static const char %s[] =\n", cname);
    for(k = 0; k < len; k++) {
        const char *short_name = tab_short_name[k];

        fprintf(f, " \"");
        width = fprintf(f, "%s", tab_name[k]);
        if (*short_name != '\0')
            width += fprintf(f, ",%s", short_name);
        /* close the string and pad up to the "\0" column */
        fprintf(f, "\"%*s\"\\0\"\n", 1 + widest - width, "");
    }
    fprintf(f, ";\n\n");
}
1538
/*
 * Write to 'f' the UnicodeGCEnum enumeration, the general category name
 * table and the run length encoded table unicode_gc_table.
 *
 * Each run of code points sharing the same general category is encoded
 * with v = category in the low 5 bits of the first byte and
 * n = run length - 1:
 *   - n < 7: n is stored in the upper 3 bits of the first byte
 *   - otherwise the first byte is (0xf << 5) | v and it is followed by
 *     the length in 1 byte (n - 7 < 128), 2 bytes (first byte
 *     128..191) or 3 bytes (first byte >= 192).
 * NOTE(review): (0xf << 5) | v does not fit in 8 bits; this presumably
 * relies on dbuf_putc() keeping only the low byte, i.e. 0xe0 | v.
 * The value v = 31 is used for a run alternating between Lu and the
 * category following it in the enumeration (assumed to be Ll).
 */
void build_general_category_table(FILE *f)
{
    int i, v, j, n, n1;
    DynBuf dbuf_s, *dbuf = &dbuf_s;
    int cw_count, cw_len_count[4], cw_start;

    fprintf(f, "typedef enum {\n");
    for(i = 0; i < GCAT_COUNT; i++)
        fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]);
    fprintf(f, " UNICODE_GC_COUNT,\n");
    fprintf(f, "} UnicodeGCEnum;\n\n");

    dump_name_table(f, "unicode_gc_name_table",
                    unicode_gc_name, GCAT_COUNT,
                    unicode_gc_short_name);


    dbuf_init(dbuf);
    /* statistics: number of entries and histogram of their byte size */
    cw_count = 0;
    for(i = 0; i < 4; i++)
        cw_len_count[i] = 0;
    for(i = 0; i <= CHARCODE_MAX;) {
        v = unicode_db[i].general_category;
        j = i + 1;
        while (j <= CHARCODE_MAX && unicode_db[j].general_category == v)
            j++;
        n = j - i;
        /* compress Lu/Ll runs */
        if (v == GCAT_Lu) {
            /* n1 = length of the run alternating between v and v + 1 */
            n1 = 1;
            while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) {
                n1++;
            }
            /* only use it when it is longer than the plain run */
            if (n1 > n) {
                v = 31;
                n = n1;
            }
        }
        // printf("%05x %05x %d\n", i, n, v);
        cw_count++;
        n--;
        cw_start = dbuf->size;
        if (n < 7) {
            dbuf_putc(dbuf, (n << 5) | v);
        } else if (n < 7 + 128) {
            n1 = n - 7;
            assert(n1 < 128);
            dbuf_putc(dbuf, (0xf << 5) | v);
            dbuf_putc(dbuf, n1);
        } else if (n < 7 + 128 + (1 << 14)) {
            n1 = n - (7 + 128);
            assert(n1 < (1 << 14));
            dbuf_putc(dbuf, (0xf << 5) | v);
            dbuf_putc(dbuf, (n1 >> 8) + 128);
            dbuf_putc(dbuf, n1);
        } else {
            n1 = n - (7 + 128 + (1 << 14));
            assert(n1 < (1 << 22));
            dbuf_putc(dbuf, (0xf << 5) | v);
            dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
            dbuf_putc(dbuf, n1 >> 8);
            dbuf_putc(dbuf, n1);
        }
        /* the entry just written is 1 to 4 bytes long */
        cw_len_count[dbuf->size - cw_start - 1]++;
        /* n was decremented above, so this skips the whole run */
        i += n + 1;
    }
#ifdef DUMP_TABLE_SIZE
    printf("general category: %d entries [",
           cw_count);
    for(i = 0; i < 4; i++)
        printf(" %d", cw_len_count[i]);
    printf(" ], length=%d bytes\n", (int)dbuf->size);
#endif

    dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size);

    dbuf_free(dbuf);
}
1617
/*
 * Write to 'f' the UnicodeScriptEnum enumeration, the script name table
 * (without the first script of the enumeration) and the run length
 * encoded table unicode_script_table.
 *
 * Each run of code points sharing the same script value is encoded
 * with n = run length - 1 and type = 1 if the script value is not 0
 * (bit 7 of the first byte):
 *   - n < 96: 1 byte
 *   - n < 96 + 2^12: 2 bytes, the low 7 bits of the first byte are in
 *     96..111
 *   - otherwise: 3 bytes, the low 7 bits of the first byte are >= 112
 * When type = 1, one more byte holding the script value follows. A
 * trailing run with the script value 0 is not stored.
 */
void build_script_table(FILE *f)
{
    int i, v, j, n, n1, type;
    DynBuf dbuf_s, *dbuf = &dbuf_s;
    int cw_count, cw_len_count[4], cw_start;

    fprintf(f, "typedef enum {\n");
    for(i = 0; i < SCRIPT_COUNT; i++)
        fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]);
    fprintf(f, " UNICODE_SCRIPT_COUNT,\n");
    fprintf(f, "} UnicodeScriptEnum;\n\n");

    /* the name table starts at the second script */
    i = 1;
    dump_name_table(f, "unicode_script_name_table",
                    unicode_script_name + i, SCRIPT_COUNT - i,
                    unicode_script_short_name + i);

    dbuf_init(dbuf);
    /* statistics: number of entries and histogram of their byte size */
    cw_count = 0;
    for(i = 0; i < 4; i++)
        cw_len_count[i] = 0;
    for(i = 0; i <= CHARCODE_MAX;) {
        v = unicode_db[i].script;
        j = i + 1;
        while (j <= CHARCODE_MAX && unicode_db[j].script == v)
            j++;
        n = j - i;
        /* the last run is omitted when its script value is 0 */
        if (v == 0 && j == (CHARCODE_MAX + 1))
            break;
        // printf("%05x %05x %d\n", i, n, v);
        cw_count++;
        n--;
        cw_start = dbuf->size;
        if (v == 0)
            type = 0;
        else
            type = 1;
        if (n < 96) {
            dbuf_putc(dbuf, n | (type << 7));
        } else if (n < 96 + (1 << 12)) {
            n1 = n - 96;
            assert(n1 < (1 << 12));
            dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7));
            dbuf_putc(dbuf, n1);
        } else {
            n1 = n - (96 + (1 << 12));
            assert(n1 < (1 << 20));
            dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7));
            dbuf_putc(dbuf, n1 >> 8);
            dbuf_putc(dbuf, n1);
        }
        if (type != 0)
            dbuf_putc(dbuf, v);

        /* the entry just written is 1 to 4 bytes long */
        cw_len_count[dbuf->size - cw_start - 1]++;
        /* n was decremented above, so this skips the whole run */
        i += n + 1;
    }
#if defined(DUMP_TABLE_SIZE)
    printf("script: %d entries [",
           cw_count);
    for(i = 0; i < 4; i++)
        printf(" %d", cw_len_count[i]);
    printf(" ], length=%d bytes\n", (int)dbuf->size);
#endif

    dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size);

    dbuf_free(dbuf);
}
1687
/*
 * Write to 'f' the run length encoded table unicode_script_ext_table.
 *
 * Each run of code points sharing the same script extension list is
 * encoded with n = run length - 1:
 *   - n < 128: 1 byte
 *   - n < 128 + 2^14: 2 bytes, the first byte is in 128..191
 *   - otherwise: 3 bytes, the first byte is >= 192
 * followed by one byte holding script_ext_len and the script_ext_len
 * bytes of the list. Unlike the other tables, the last run is always
 * stored.
 */
void build_script_ext_table(FILE *f)
{
    int i, j, n, n1, script_ext_len;
    DynBuf dbuf_s, *dbuf = &dbuf_s;
    int cw_count;

    dbuf_init(dbuf);
    cw_count = 0;
    for(i = 0; i <= CHARCODE_MAX;) {
        script_ext_len = unicode_db[i].script_ext_len;
        j = i + 1;
        /* extend the run while the list has the same length and the
           same content */
        while (j <= CHARCODE_MAX &&
               unicode_db[j].script_ext_len == script_ext_len &&
               !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext,
                       script_ext_len)) {
            j++;
        }
        n = j - i;
        cw_count++;
        n--;
        if (n < 128) {
            dbuf_putc(dbuf, n);
        } else if (n < 128 + (1 << 14)) {
            n1 = n - 128;
            assert(n1 < (1 << 14));
            dbuf_putc(dbuf, (n1 >> 8) + 128);
            dbuf_putc(dbuf, n1);
        } else {
            n1 = n - (128 + (1 << 14));
            assert(n1 < (1 << 22));
            dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
            dbuf_putc(dbuf, n1 >> 8);
            dbuf_putc(dbuf, n1);
        }
        dbuf_putc(dbuf, script_ext_len);
        for(j = 0; j < script_ext_len; j++)
            dbuf_putc(dbuf, unicode_db[i].script_ext[j]);
        /* n was decremented above, so this skips the whole run */
        i += n + 1;
    }
#ifdef DUMP_TABLE_SIZE
    printf("script_ext: %d entries",
           cw_count);
    printf(", length=%d bytes\n", (int)dbuf->size);
#endif

    dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size);

    dbuf_free(dbuf);
}
1737
1738 /* the following properties are synthetized so no table is necessary */
1739 #define PROP_TABLE_COUNT PROP_ASCII
1740
build_prop_list_table(FILE * f)1741 void build_prop_list_table(FILE *f)
1742 {
1743 int i;
1744
1745 for(i = 0; i < PROP_TABLE_COUNT; i++) {
1746 if (i == PROP_ID_Start ||
1747 i == PROP_Case_Ignorable ||
1748 i == PROP_ID_Continue1) {
1749 /* already generated */
1750 } else {
1751 build_prop_table(f, i, FALSE);
1752 }
1753 }
1754
1755 fprintf(f, "typedef enum {\n");
1756 for(i = 0; i < PROP_COUNT; i++)
1757 fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[i]);
1758 fprintf(f, " UNICODE_PROP_COUNT,\n");
1759 fprintf(f, "} UnicodePropertyEnum;\n\n");
1760
1761 i = PROP_ASCII_Hex_Digit;
1762 dump_name_table(f, "unicode_prop_name_table",
1763 unicode_prop_name + i, PROP_XID_Start - i + 1,
1764 unicode_prop_short_name + i);
1765
1766 fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n");
1767 for(i = 0; i < PROP_TABLE_COUNT; i++) {
1768 fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[i]);
1769 }
1770 fprintf(f, "};\n\n");
1771
1772 fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n");
1773 for(i = 0; i < PROP_TABLE_COUNT; i++) {
1774 fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[i]);
1775 }
1776 fprintf(f, "};\n\n");
1777 }
1778
1779 #ifdef USE_TEST
/* Convert 'c' with lre_case_conv() using 'conv_type'. The result is
   stored in 'res' and the value returned by lre_case_conv() is
   returned unchanged. */
int check_conv(uint32_t *res, uint32_t c, int conv_type)
{
    int res_len;

    res_len = lre_case_conv(res, c, conv_type);
    return res_len;
}
1784
check_case_conv(void)1785 void check_case_conv(void)
1786 {
1787 CCInfo *tab = unicode_db;
1788 uint32_t res[3];
1789 int l, error;
1790 CCInfo ci_s, *ci1, *ci = &ci_s;
1791 int code;
1792
1793 for(code = 0; code <= CHARCODE_MAX; code++) {
1794 ci1 = &tab[code];
1795 *ci = *ci1;
1796 if (ci->l_len == 0) {
1797 ci->l_len = 1;
1798 ci->l_data[0] = code;
1799 }
1800 if (ci->u_len == 0) {
1801 ci->u_len = 1;
1802 ci->u_data[0] = code;
1803 }
1804 if (ci->f_code == 0)
1805 ci->f_code = code;
1806
1807 error = 0;
1808 l = check_conv(res, code, 0);
1809 if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) {
1810 printf("ERROR: L\n");
1811 error++;
1812 }
1813 l = check_conv(res, code, 1);
1814 if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) {
1815 printf("ERROR: U\n");
1816 error++;
1817 }
1818 l = check_conv(res, code, 2);
1819 if (l != 1 || res[0] != ci->f_code) {
1820 printf("ERROR: F\n");
1821 error++;
1822 }
1823 if (error) {
1824 dump_cc_info(ci, code);
1825 exit(1);
1826 }
1827 }
1828 }
1829
1830 #ifdef PROFILE
get_time_ns(void)1831 static int64_t get_time_ns(void)
1832 {
1833 struct timespec ts;
1834 clock_gettime(CLOCK_MONOTONIC, &ts);
1835 return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
1836 }
1837 #endif
1838
1839
check_flags(void)1840 void check_flags(void)
1841 {
1842 int c;
1843 BOOL flag_ref, flag;
1844 for(c = 0; c <= CHARCODE_MAX; c++) {
1845 flag_ref = get_prop(c, PROP_Cased);
1846 flag = lre_is_cased(c);
1847 if (flag != flag_ref) {
1848 printf("ERROR: c=%05x cased=%d ref=%d\n",
1849 c, flag, flag_ref);
1850 exit(1);
1851 }
1852
1853 flag_ref = get_prop(c, PROP_Case_Ignorable);
1854 flag = lre_is_case_ignorable(c);
1855 if (flag != flag_ref) {
1856 printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
1857 c, flag, flag_ref);
1858 exit(1);
1859 }
1860
1861 flag_ref = get_prop(c, PROP_ID_Start);
1862 flag = lre_is_id_start(c);
1863 if (flag != flag_ref) {
1864 printf("ERROR: c=%05x id_start=%d ref=%d\n",
1865 c, flag, flag_ref);
1866 exit(1);
1867 }
1868
1869 flag_ref = get_prop(c, PROP_ID_Continue);
1870 flag = lre_is_id_continue(c);
1871 if (flag != flag_ref) {
1872 printf("ERROR: c=%05x id_cont=%d ref=%d\n",
1873 c, flag, flag_ref);
1874 exit(1);
1875 }
1876 }
1877 #ifdef PROFILE
1878 {
1879 int64_t ti, count;
1880 ti = get_time_ns();
1881 count = 0;
1882 for(c = 0x20; c <= 0xffff; c++) {
1883 flag_ref = get_prop(c, PROP_ID_Start);
1884 flag = lre_is_id_start(c);
1885 assert(flag == flag_ref);
1886 count++;
1887 }
1888 ti = get_time_ns() - ti;
1889 printf("flags time=%0.1f ns/char\n",
1890 (double)ti / count);
1891 }
1892 #endif
1893 }
1894
1895 #endif
1896
/* an index entry is added each time the encoded combining class table
   grows by this many bytes (see build_cc_table()) */
#define CC_BLOCK_LEN 32

/*
 * Write to 'f' the run length encoded combining class table
 * unicode_cc_table and its index unicode_cc_index.
 *
 * Each run is encoded with n1 = run length - 1 and a 2 bit type stored
 * in the upper bits of the first byte:
 *   type 0: all the code points have the combining class cc
 *   type 1: the combining class increases by one per code point,
 *           starting at cc
 *   type 2: same as type 0 with cc = 0 (cc is not stored)
 *   type 3: same as type 0 with cc = 230 (cc is not stored)
 * The length is stored in the low 6 bits of the first byte:
 *   - n1 < 48: 1 byte
 *   - n1 < 48 + 2^11: 2 bytes, first value in 48..55
 *   - otherwise: 3 bytes, first value >= 56
 * For the types 0 and 1, one more byte holding cc follows. A trailing
 * run with cc = 0 is not stored.
 *
 * Each index entry is 3 bytes (least significant byte first) holding
 * i | (offset << 21), where 'i' is the first code point of the next run
 * and 'offset' the number of table bytes past the block boundary. A
 * last entry holding the final code point ends the index.
 */
void build_cc_table(FILE *f)
{
    int i, cc, n, cc_table_len, type, n1;
    DynBuf dbuf_s, *dbuf = &dbuf_s;
    DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
    int cw_len_tab[3], cw_start, block_end_pos;
    uint32_t v;

    /* dbuf receives the encoded table, dbuf1 the index */
    dbuf_init(dbuf);
    dbuf_init(dbuf1);
    cc_table_len = 0;
    for(i = 0; i < countof(cw_len_tab); i++)
        cw_len_tab[i] = 0;
    block_end_pos = CC_BLOCK_LEN;
    for(i = 0; i <= CHARCODE_MAX;) {
        cc = unicode_db[i].combining_class;
        assert(cc <= 255);
        /* check increasing values */
        n = 1;
        while ((i + n) <= CHARCODE_MAX &&
               unicode_db[i + n].combining_class == (cc + n))
            n++;
        if (n >= 2) {
            type = 1;
        } else {
            /* otherwise look for a run of identical values */
            type = 0;
            n = 1;
            while ((i + n) <= CHARCODE_MAX &&
                   unicode_db[i + n].combining_class == cc)
                n++;
        }
        /* no need to encode the last run */
        if (cc == 0 && (i + n - 1) == CHARCODE_MAX)
            break;
#ifdef DUMP_CC_TABLE
        printf("%05x %6d %d %d\n", i, n, type, cc);
#endif
        /* the two most frequent constant values get their own type so
           that the cc byte can be omitted */
        if (type == 0) {
            if (cc == 0)
                type = 2;
            else if (cc == 230)
                type = 3;
        }
        n1 = n - 1;

        /* add an entry to the index if necessary */
        if (dbuf->size >= block_end_pos) {
            v = i | ((dbuf->size - block_end_pos) << 21);
            dbuf_putc(dbuf1, v);
            dbuf_putc(dbuf1, v >> 8);
            dbuf_putc(dbuf1, v >> 16);
            block_end_pos += CC_BLOCK_LEN;
        }
        cw_start = dbuf->size;
        if (n1 < 48) {
            dbuf_putc(dbuf, n1 | (type << 6));
        } else if (n1 < 48 + (1 << 11)) {
            n1 -= 48;
            dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6));
            dbuf_putc(dbuf, n1);
        } else {
            n1 -= 48 + (1 << 11);
            assert(n1 < (1 << 20));
            dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6));
            dbuf_putc(dbuf, n1 >> 8);
            dbuf_putc(dbuf, n1);
        }
        /* statistics on the size of the length field (1 to 3 bytes),
           taken before the optional cc byte is added */
        cw_len_tab[dbuf->size - cw_start - 1]++;
        if (type == 0 || type == 1)
            dbuf_putc(dbuf, cc);
        cc_table_len++;
        i += n;
    }

    /* last index entry */
    v = i;
    dbuf_putc(dbuf1, v);
    dbuf_putc(dbuf1, v >> 8);
    dbuf_putc(dbuf1, v >> 16);

    dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
    dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);

#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
    printf("CC table: size=%d (%d entries) [",
           (int)(dbuf->size + dbuf1->size),
           cc_table_len);
    for(i = 0; i < countof(cw_len_tab); i++)
        printf(" %d", cw_len_tab[i]);
    printf(" ]\n");
#endif
    dbuf_free(dbuf);
    dbuf_free(dbuf1);
}
1993
1994 /* maximum length of decomposition: 18 chars (1), then 8 */
1995 #ifndef USE_TEST
/* Encoding types of the decomposition table entries. The order of the
   values matters: get_decomp_run_size() and add_decomp_data() select
   the format with range comparisons (type <= last type of a family),
   and the types of a family are computed by adding an offset to its
   first type. */
typedef enum {
    DECOMP_TYPE_C1, /* 16 bit char */
    DECOMP_TYPE_L1, /* 16 bit char table */
    DECOMP_TYPE_L2,
    DECOMP_TYPE_L3,
    DECOMP_TYPE_L4,
    DECOMP_TYPE_L5, /* XXX: not used */
    DECOMP_TYPE_L6, /* XXX: could remove */
    DECOMP_TYPE_L7, /* XXX: could remove */
    DECOMP_TYPE_LL1, /* 18 bit char table */
    DECOMP_TYPE_LL2,
    DECOMP_TYPE_S1, /* 8 bit char table */
    DECOMP_TYPE_S2,
    DECOMP_TYPE_S3,
    DECOMP_TYPE_S4,
    DECOMP_TYPE_S5,
    DECOMP_TYPE_I1, /* increment 16 bit char value */
    DECOMP_TYPE_I2_0, /* Ix_y: x chars, the char at index y is incremented */
    DECOMP_TYPE_I2_1,
    DECOMP_TYPE_I3_1,
    DECOMP_TYPE_I3_2,
    DECOMP_TYPE_I4_1,
    DECOMP_TYPE_I4_2,
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B2,
    DECOMP_TYPE_B3,
    DECOMP_TYPE_B4,
    DECOMP_TYPE_B5,
    DECOMP_TYPE_B6,
    DECOMP_TYPE_B7,
    DECOMP_TYPE_B8,
    DECOMP_TYPE_B18,
    DECOMP_TYPE_LS2, /* 2 chars: 16 bit char + 8 bit short code */
    DECOMP_TYPE_PAT3, /* 3 chars: first and last chars shared by the run,
                         16 bit middle char per code point */
    DECOMP_TYPE_S2_UL, /* 2 short codes stored for every other code point,
                          the next one uses the lower case first char */
    DECOMP_TYPE_LS2_UL, /* same as S2_UL with a 16 bit first char */
} DecompTypeEnum;
2033 #endif
2034
/* printable names of the decomposition types, listed in the same order
   as the DECOMP_TYPE_xxx values of DecompTypeEnum */
const char *decomp_type_str[] = {
    "C1",
    "L1",
    "L2",
    "L3",
    "L4",
    "L5",
    "L6",
    "L7",
    "LL1",
    "LL2",
    "S1",
    "S2",
    "S3",
    "S4",
    "S5",
    "I1",
    "I2_0",
    "I2_1",
    "I3_1",
    "I3_2",
    "I4_1",
    "I4_2",
    "B1",
    "B2",
    "B3",
    "B4",
    "B5",
    "B6",
    "B7",
    "B8",
    "B18",
    "LS2",
    "PAT3",
    "S2_UL",
    "LS2_UL",
};
2072
/* Description of the 'increment' decomposition types, indexed by
   (decomposition length - 1). In each row, the first value is the
   first type for this length; it is followed by the list of the char
   indexes which may be incremented, terminated by -1. The k-th index
   of the list (starting at 0) corresponds to the type
   (first type + k), see find_decomp_run(). */
const int decomp_incr_tab[4][4] = {
    { DECOMP_TYPE_I1, 0, -1 },
    { DECOMP_TYPE_I2_0, 0, 1, -1 },
    { DECOMP_TYPE_I3_1, 1, 2, -1 },
    { DECOMP_TYPE_I4_1, 1, 2, -1 },
};
2079
2080 /*
2081 entry size:
2082 type bits
2083 code 18
2084 len 7
2085 compat 1
2086 type 5
2087 index 16
2088 total 47
2089 */
2090
/* Candidate encoding of a run of consecutive code points in the
   decomposition table */
typedef struct {
    int code;       /* first code point of the run */
    uint8_t len;    /* number of code points in the run (0 = no entry
                       starts at this code point) */
    uint8_t type;   /* DECOMP_TYPE_xxx */
    uint8_t c_len;  /* number of chars in each decomposition */
    uint16_t c_min; /* base char value, only set for the
                       DECOMP_TYPE_Bxx types */
    uint16_t data_index; /* offset of the entry data in the data buffer
                            or, for DECOMP_TYPE_C1, the decomposed char
                            itself (set by add_decomp_data()) */
    int cost; /* size in bytes from this entry to the end */
} DecompEntry;
2100
get_decomp_run_size(const DecompEntry * de)2101 int get_decomp_run_size(const DecompEntry *de)
2102 {
2103 int s;
2104 s = 6;
2105 if (de->type <= DECOMP_TYPE_C1) {
2106 /* nothing more */
2107 } else if (de->type <= DECOMP_TYPE_L7) {
2108 s += de->len * de->c_len * 2;
2109 } else if (de->type <= DECOMP_TYPE_LL2) {
2110 /* 18 bits per char */
2111 s += (de->len * de->c_len * 18 + 7) / 8;
2112 } else if (de->type <= DECOMP_TYPE_S5) {
2113 s += de->len * de->c_len;
2114 } else if (de->type <= DECOMP_TYPE_I4_2) {
2115 s += de->c_len * 2;
2116 } else if (de->type <= DECOMP_TYPE_B18) {
2117 s += 2 + de->len * de->c_len;
2118 } else if (de->type <= DECOMP_TYPE_LS2) {
2119 s += de->len * 3;
2120 } else if (de->type <= DECOMP_TYPE_PAT3) {
2121 s += 4 + de->len * 2;
2122 } else if (de->type <= DECOMP_TYPE_S2_UL) {
2123 s += de->len;
2124 } else if (de->type <= DECOMP_TYPE_LS2_UL) {
2125 s += (de->len / 2) * 3;
2126 } else {
2127 abort();
2128 }
2129 return s;
2130 }
2131
/* additional characters having a short code: the entry of index i is
   mapped to 0x80 + 0x50 + i by get_short_code() */
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
2133
2134 /* return -1 if not found */
get_short_code(int c)2135 int get_short_code(int c)
2136 {
2137 int i;
2138 if (c < 0x80) {
2139 return c;
2140 } else if (c >= 0x300 && c < 0x350) {
2141 return c - 0x300 + 0x80;
2142 } else {
2143 for(i = 0; i < countof(unicode_short_table); i++) {
2144 if (c == unicode_short_table[i])
2145 return i + 0x80 + 0x50;
2146 }
2147 return -1;
2148 }
2149 }
2150
is_short(int code)2151 static BOOL is_short(int code)
2152 {
2153 return get_short_code(code) >= 0;
2154 }
2155
is_short_tab(const int * tab,int len)2156 static BOOL is_short_tab(const int *tab, int len)
2157 {
2158 int i;
2159 for(i = 0; i < len; i++) {
2160 if (!is_short(tab[i]))
2161 return FALSE;
2162 }
2163 return TRUE;
2164 }
2165
is_16bit(const int * tab,int len)2166 static BOOL is_16bit(const int *tab, int len)
2167 {
2168 int i;
2169 for(i = 0; i < len; i++) {
2170 if (tab[i] > 0xffff)
2171 return FALSE;
2172 }
2173 return TRUE;
2174 }
2175
/* Simplified upper to lower case conversion: add 0x20 to the values
   below 0x100 and to the range 0x410..0x42f (Latin1 and Cyrillic), add
   1 to all the other values. No check is made that 'c' is an upper
   case letter. */
static uint32_t to_lower_simple(uint32_t c)
{
    int is_latin1_or_cyrillic;

    is_latin1_or_cyrillic = (c < 0x100) || (c >= 0x410 && c <= 0x42f);
    return c + (is_latin1_or_cyrillic ? 0x20 : 1);
}
2185
/* select best encoding with dynamic programming */
/*
 * Compute the cheapest encoding of a decomposition table entry starting
 * at the code point 'i' and store it in tab_de[i].
 *
 * Every supported encoding type is tried with every possible run
 * length; the cost of a candidate is its own size plus the already
 * computed cost of the entry following the run (tab_de[i + n].cost).
 * The costs of all the code points above 'i' must therefore be known
 * before the call. If 'i' has no decomposition, only the cost of
 * tab_de[i + 1] is copied and tab_de[i].len is left unchanged.
 */
void find_decomp_run(DecompEntry *tab_de, int i)
{
    DecompEntry de_s, *de = &de_s;
    CCInfo *ci, *ci1, *ci2;
    int l, j, n, len_max;

    ci = &unicode_db[i];
    l = ci->decomp_len;
    if (l == 0) {
        tab_de[i].cost = tab_de[i + 1].cost;
        return;
    }

    /* the offset for the compose table has only 6 bits, so we must
       limit if it can be used by the compose table */
    if (!ci->is_compat && !ci->is_excluded && l == 2)
        len_max = 64;
    else
        len_max = 127;

    /* larger than the cost of any candidate */
    tab_de[i].cost = 0x7fffffff;

    /* decomposition with a char above 0xffff: only the 18 bit char
       table (LL1/LL2) is tried */
    if (!is_16bit(ci->decomp_data, l)) {
        assert(l <= 2);

        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_LL1 + l - 1;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* Note: we accept a hole */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat)))
                break;
            n++;
        }
        return;
    }

    /* 16 bit char table (C1 for a single char and a single code point,
       L1..L7 otherwise) */
    if (l <= 7) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            if (l == 1 && n == 1) {
                de->type = DECOMP_TYPE_C1;
            } else {
                assert(l <= 8);
                de->type = DECOMP_TYPE_L1 + l - 1;
            }
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }

            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* Note: we accept a hole */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat &&
                   is_16bit(ci1->decomp_data, l))))
                break;
            n++;
        }
    }

    /* 16 bit base + 8 bit offsets (B1..B8, B18): all the chars of the
       run except the spaces must be within 254 of the smallest one.
       No hole is accepted. */
    if (l <= 8 || l == 18) {
        int c_min, c_max, c;
        c_min = c_max = -1;
        n = 1;
        for(;;) {
            /* update the char range with the last code point of the
               run */
            ci1 = &unicode_db[i + n - 1];
            for(j = 0; j < l; j++) {
                c = ci1->decomp_data[j];
                if (c == 0x20) {
                    /* we accept space for Arabic */
                } else if (c_min == -1) {
                    c_min = c_max = c;
                } else {
                    c_min = min_int(c_min, c);
                    c_max = max_int(c_max, c);
                }
            }
            if ((c_max - c_min) > 254)
                break;
            de->code = i;
            de->len = n;
            if (l == 18)
                de->type = DECOMP_TYPE_B18;
            else
                de->type = DECOMP_TYPE_B1 + l - 1;
            de->c_len = l;
            de->c_min = c_min;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            if (!(ci1->decomp_len == l &&
                  ci1->is_compat == ci->is_compat))
                break;
            n++;
        }
    }

    /* find an ascii run */
    /* (8 bit char table S1..S5: every char must have a short code) */
    if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_S1 + l - 1;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }

            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* Note: we accept a hole */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat &&
                   is_short_tab(ci1->decomp_data, l))))
                break;
            n++;
        }
    }

    /* check if a single char is increasing */
    if (l <= 4) {
        int idx1, idx;

        /* try each char index listed in decomp_incr_tab[] for this
           decomposition length */
        for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) {
            n = 1;
            for(;;) {
                de->code = i;
                de->len = n;
                de->type = decomp_incr_tab[l - 1][0] + idx1 - 1;
                de->c_len = l;
                de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
                if (de->cost < tab_de[i].cost) {
                    tab_de[i] = *de;
                }

                if (!((i + n) <= CHARCODE_MAX && n < len_max))
                    break;
                ci1 = &unicode_db[i + n];
                if (!(ci1->decomp_len == l &&
                      ci1->is_compat == ci->is_compat))
                    goto next1;
                /* the char at 'idx' must be the one of the first code
                   point plus n, the other chars must be identical */
                for(j = 0; j < l; j++) {
                    if (j == idx) {
                        if (ci1->decomp_data[j] != ci->decomp_data[j] + n)
                            goto next1;
                    } else {
                        if (ci1->decomp_data[j] != ci->decomp_data[j])
                            goto next1;
                    }
                }
                n++;
            }
        next1: ;
        }
    }

    /* PAT3: 3 chars with the same first and last chars in the whole
       run, only the middle char changes */
    if (l == 3) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_PAT3;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            if (!(ci1->decomp_len == l &&
                  ci1->is_compat == ci->is_compat &&
                  ci1->decomp_data[1] <= 0xffff &&
                  ci1->decomp_data[0] == ci->decomp_data[0] &&
                  ci1->decomp_data[l - 1] == ci->decomp_data[l - 1]))
                break;
            n++;
        }
    }

    /* LS2: 2 chars, a 16 bit char followed by a char having a short
       code */
    if (l == 2 && is_short(ci->decomp_data[1])) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_LS2;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* a hole (no decomposition) is accepted */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat &&
                   ci1->decomp_data[0] <= 0xffff &&
                   is_short(ci1->decomp_data[1]))))
                break;
            n++;
        }
    }

    /* S2_UL / LS2_UL: pairs of consecutive code points where the
       second decomposition is the first one with its first char
       converted by to_lower_simple(). The run grows two code points
       at a time. */
    if (l == 2) {
        /* local flag (hides the is_16bit() function in this block):
           TRUE once a first char without short code has been seen, in
           which case LS2_UL is used instead of S2_UL */
        BOOL is_16bit;

        n = 0;
        is_16bit = FALSE;
        for(;;) {
            if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max))
                break;
            ci1 = &unicode_db[i + n];
            if (!(ci1->decomp_len == l &&
                  ci1->is_compat == ci->is_compat &&
                  is_short(ci1->decomp_data[1])))
                break;
            if (!is_16bit && !is_short(ci1->decomp_data[0]))
                is_16bit = TRUE;
            ci2 = &unicode_db[i + n + 1];
            if (!(ci2->decomp_len == l &&
                  ci2->is_compat == ci->is_compat &&
                  ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0]) &&
                  ci2->decomp_data[1] == ci1->decomp_data[1]))
                break;
            n += 2;
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_S2_UL + is_16bit;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
        }
    }
}
2450
/* Store the 16 bit value 'c' in 'data_buf' at the offset '*pidx', least
   significant byte first, and advance '*pidx' by 2. */
void put16(uint8_t *data_buf, int *pidx, uint16_t c)
{
    int pos = *pidx;

    data_buf[pos] = c;
    data_buf[pos + 1] = c >> 8;
    *pidx = pos + 2;
}
2459
/*
 * Append the data of the decomposition run 'de' to 'data_buf' at the
 * offset '*pidx', using the format selected by de->type, and advance
 * '*pidx' to the end of the written data.
 *
 * de->data_index is set to the start offset of the data, except for
 * DECOMP_TYPE_C1 where it holds the decomposed char itself and no data
 * is written. In the table formats, a code point of the run without
 * decomposition (hole) is stored as zeros. The function aborts on an
 * unknown type. No check is made on the size of 'data_buf'.
 */
void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
{
    int i, j, idx, c;
    CCInfo *ci;

    idx = *pidx;
    de->data_index = idx;
    if (de->type <= DECOMP_TYPE_C1) {
        /* single char stored directly in the entry */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 1);
        de->data_index = ci->decomp_data[0];
    } else if (de->type <= DECOMP_TYPE_L7) {
        /* table of 16 bit chars */
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                if (ci->decomp_len == 0)
                    c = 0;
                else
                    c = ci->decomp_data[j];
                put16(data_buf, &idx, c);
            }
        }
    } else if (de->type <= DECOMP_TYPE_LL2) {
        /* table of 18 bit chars: the low 16 bits of all the chars come
           first, followed by their 2 upper bits packed 4 per byte */
        int n, p, k;
        n = (de->len * de->c_len * 18 + 7) / 8; /* total size in bytes */
        p = de->len * de->c_len * 2; /* offset of the upper bits */
        memset(data_buf + idx, 0, n);
        k = 0;
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                if (ci->decomp_len == 0)
                    c = 0;
                else
                    c = ci->decomp_data[j];
                data_buf[idx + k * 2] = c;
                data_buf[idx + k * 2 + 1] = c >> 8;
                data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2);
                k++;
            }
        }
        idx += n;
    } else if (de->type <= DECOMP_TYPE_S5) {
        /* table of 8 bit short codes */
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                if (ci->decomp_len == 0)
                    c = 0;
                else
                    c = ci->decomp_data[j];
                c = get_short_code(c);
                assert(c >= 0);
                data_buf[idx++] = c;
            }
        }
    } else if (de->type <= DECOMP_TYPE_I4_2) {
        /* increment types: only the decomposition of the first code
           point is stored */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == de->c_len);
        for(j = 0; j < de->c_len; j++)
            put16(data_buf, &idx, ci->decomp_data[j]);
    } else if (de->type <= DECOMP_TYPE_B18) {
        /* 16 bit base followed by one 8 bit offset per char; the
           offset 0xff is reserved for the space */
        c = de->c_min;
        data_buf[idx++] = c;
        data_buf[idx++] = c >> 8;
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                assert(ci->decomp_len == de->c_len);
                c = ci->decomp_data[j];
                if (c == 0x20) {
                    c = 0xff;
                } else {
                    c -= de->c_min;
                    assert((uint32_t)c <= 254);
                }
                data_buf[idx++] = c;
            }
        }
    } else if (de->type <= DECOMP_TYPE_LS2) {
        /* per code point: 16 bit first char + short code of the second
           char */
        assert(de->c_len == 2);
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            if (ci->decomp_len == 0)
                c = 0;
            else
                c = ci->decomp_data[0];
            put16(data_buf, &idx, c);

            if (ci->decomp_len == 0)
                c = 0;
            else
                c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else if (de->type <= DECOMP_TYPE_PAT3) {
        /* first and last chars of the pattern, then the 16 bit middle
           char of each code point */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 3);
        put16(data_buf, &idx, ci->decomp_data[0]);
        put16(data_buf, &idx, ci->decomp_data[2]);
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            assert(ci->decomp_len == 3);
            put16(data_buf, &idx, ci->decomp_data[1]);
        }
    } else if (de->type <= DECOMP_TYPE_S2_UL) {
        /* two short codes for every other code point of the run */
        for(i = 0; i < de->len; i += 2) {
            ci = &unicode_db[de->code + i];
            c = ci->decomp_data[0];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
            c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else if (de->type <= DECOMP_TYPE_LS2_UL) {
        /* 16 bit first char + short code of the second char for every
           other code point of the run */
        for(i = 0; i < de->len; i += 2) {
            ci = &unicode_db[de->code + i];
            c = ci->decomp_data[0];
            put16(data_buf, &idx, c);
            c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else {
        abort();
    }
    *pidx = idx;
}
2593
2594 #if 0
2595 void dump_large_char(void)
2596 {
2597 int i, j;
2598 for(i = 0; i <= CHARCODE_MAX; i++) {
2599 CCInfo *ci = &unicode_db[i];
2600 for(j = 0; j < ci->decomp_len; j++) {
2601 if (ci->decomp_data[j] > 0xffff)
2602 printf("%05x\n", ci->decomp_data[j]);
2603 }
2604 }
2605 }
2606 #endif
2607
2608 void build_compose_table(FILE *f, const DecompEntry *tab_de);
2609
build_decompose_table(FILE * f)2610 void build_decompose_table(FILE *f)
2611 {
2612 int i, array_len, code_max, data_len, count;
2613 DecompEntry *tab_de, de_s, *de = &de_s;
2614 uint8_t *data_buf;
2615
2616 code_max = CHARCODE_MAX;
2617
2618 tab_de = mallocz((code_max + 2) * sizeof(*tab_de));
2619
2620 for(i = code_max; i >= 0; i--) {
2621 find_decomp_run(tab_de, i);
2622 }
2623
2624 /* build the data buffer */
2625 data_buf = malloc(100000);
2626 data_len = 0;
2627 array_len = 0;
2628 for(i = 0; i <= code_max; i++) {
2629 de = &tab_de[i];
2630 if (de->len != 0) {
2631 add_decomp_data(data_buf, &data_len, de);
2632 i += de->len - 1;
2633 array_len++;
2634 }
2635 }
2636
2637 #ifdef DUMP_DECOMP_TABLE
2638 /* dump */
2639 {
2640 int size, size1;
2641
2642 printf("START LEN TYPE L C SIZE\n");
2643 size = 0;
2644 for(i = 0; i <= code_max; i++) {
2645 de = &tab_de[i];
2646 if (de->len != 0) {
2647 size1 = get_decomp_run_size(de);
2648 printf("%05x %3d %6s %2d %1d %4d\n", i, de->len,
2649 decomp_type_str[de->type], de->c_len,
2650 unicode_db[i].is_compat, size1);
2651 i += de->len - 1;
2652 size += size1;
2653 }
2654 }
2655
2656 printf("array_len=%d estimated size=%d bytes actual=%d bytes\n",
2657 array_len, size, array_len * 6 + data_len);
2658 }
2659 #endif
2660
2661 fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {",
2662 array_len);
2663 count = 0;
2664 for(i = 0; i <= code_max; i++) {
2665 de = &tab_de[i];
2666 if (de->len != 0) {
2667 uint32_t v;
2668 if (count++ % 4 == 0)
2669 fprintf(f, "\n ");
2670 v = (de->code << (32 - 18)) |
2671 (de->len << (32 - 18 - 7)) |
2672 (de->type << (32 - 18 - 7 - 6)) |
2673 unicode_db[de->code].is_compat;
2674 fprintf(f, " 0x%08x,", v);
2675 i += de->len - 1;
2676 }
2677 }
2678 fprintf(f, "\n};\n\n");
2679
2680 fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {",
2681 array_len);
2682 count = 0;
2683 for(i = 0; i <= code_max; i++) {
2684 de = &tab_de[i];
2685 if (de->len != 0) {
2686 if (count++ % 8 == 0)
2687 fprintf(f, "\n ");
2688 fprintf(f, " 0x%04x,", de->data_index);
2689 i += de->len - 1;
2690 }
2691 }
2692 fprintf(f, "\n};\n\n");
2693
2694 fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {",
2695 data_len);
2696 for(i = 0; i < data_len; i++) {
2697 if (i % 8 == 0)
2698 fprintf(f, "\n ");
2699 fprintf(f, " 0x%02x,", data_buf[i]);
2700 }
2701 fprintf(f, "\n};\n\n");
2702
2703 build_compose_table(f, tab_de);
2704
2705 free(data_buf);
2706
2707 free(tab_de);
2708 }
2709
/* One composition pair as collected by build_compose_table(). */
typedef struct ComposeEntry {
    uint32_t c[2]; /* the two code points of the decomposition (sort key) */
    uint32_t p;    /* the code point that decomposes into c[0], c[1] */
} ComposeEntry;

/* capacity of the ComposeEntry array allocated by build_compose_table() */
#define COMPOSE_LEN_MAX 10000
2716
ce_cmp(const void * p1,const void * p2)2717 static int ce_cmp(const void *p1, const void *p2)
2718 {
2719 const ComposeEntry *ce1 = p1;
2720 const ComposeEntry *ce2 = p2;
2721 int i;
2722
2723 for(i = 0; i < 2; i++) {
2724 if (ce1->c[i] < ce2->c[i])
2725 return -1;
2726 else if (ce1->c[i] > ce2->c[i])
2727 return 1;
2728 }
2729 return 0;
2730 }
2731
2732
get_decomp_pos(const DecompEntry * tab_de,int c)2733 static int get_decomp_pos(const DecompEntry *tab_de, int c)
2734 {
2735 int i, v, k;
2736 const DecompEntry *de;
2737
2738 k = 0;
2739 for(i = 0; i <= CHARCODE_MAX; i++) {
2740 de = &tab_de[i];
2741 if (de->len != 0) {
2742 if (c >= de->code && c < de->code + de->len) {
2743 v = c - de->code;
2744 assert(v < 64);
2745 v |= k << 6;
2746 assert(v < 65536);
2747 return v;
2748 }
2749 i += de->len - 1;
2750 k++;
2751 }
2752 }
2753 return -1;
2754 }
2755
build_compose_table(FILE * f,const DecompEntry * tab_de)2756 void build_compose_table(FILE *f, const DecompEntry *tab_de)
2757 {
2758 int i, v, tab_ce_len;
2759 ComposeEntry *ce, *tab_ce;
2760
2761 tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX);
2762 tab_ce_len = 0;
2763 for(i = 0; i <= CHARCODE_MAX; i++) {
2764 CCInfo *ci = &unicode_db[i];
2765 if (ci->decomp_len == 2 && !ci->is_compat &&
2766 !ci->is_excluded) {
2767 assert(tab_ce_len < COMPOSE_LEN_MAX);
2768 ce = &tab_ce[tab_ce_len++];
2769 ce->c[0] = ci->decomp_data[0];
2770 ce->c[1] = ci->decomp_data[1];
2771 ce->p = i;
2772 }
2773 }
2774 qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp);
2775
2776 #if 0
2777 {
2778 printf("tab_ce_len=%d\n", tab_ce_len);
2779 for(i = 0; i < tab_ce_len; i++) {
2780 ce = &tab_ce[i];
2781 printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p);
2782 }
2783 }
2784 #endif
2785
2786 fprintf(f, "static const uint16_t unicode_comp_table[%u] = {",
2787 tab_ce_len);
2788 for(i = 0; i < tab_ce_len; i++) {
2789 if (i % 8 == 0)
2790 fprintf(f, "\n ");
2791 v = get_decomp_pos(tab_de, tab_ce[i].p);
2792 if (v < 0) {
2793 printf("ERROR: entry for c=%04x not found\n",
2794 tab_ce[i].p);
2795 exit(1);
2796 }
2797 fprintf(f, " 0x%04x,", v);
2798 }
2799 fprintf(f, "\n};\n\n");
2800
2801 free(tab_ce);
2802 }
2803
2804 #ifdef USE_TEST
check_decompose_table(void)2805 void check_decompose_table(void)
2806 {
2807 int c;
2808 CCInfo *ci;
2809 int res[UNICODE_DECOMP_LEN_MAX], *ref;
2810 int len, ref_len, is_compat;
2811
2812 for(is_compat = 0; is_compat <= 1; is_compat++) {
2813 for(c = 0; c < CHARCODE_MAX; c++) {
2814 ci = &unicode_db[c];
2815 ref_len = ci->decomp_len;
2816 ref = ci->decomp_data;
2817 if (!is_compat && ci->is_compat) {
2818 ref_len = 0;
2819 }
2820 len = unicode_decomp_char((uint32_t *)res, c, is_compat);
2821 if (len != ref_len ||
2822 tabcmp(res, ref, ref_len) != 0) {
2823 printf("ERROR c=%05x compat=%d\n", c, is_compat);
2824 dump_str("res", res, len);
2825 dump_str("ref", ref, ref_len);
2826 exit(1);
2827 }
2828 }
2829 }
2830 }
2831
check_compose_table(void)2832 void check_compose_table(void)
2833 {
2834 int i, p;
2835 /* XXX: we don't test all the cases */
2836
2837 for(i = 0; i <= CHARCODE_MAX; i++) {
2838 CCInfo *ci = &unicode_db[i];
2839 if (ci->decomp_len == 2 && !ci->is_compat &&
2840 !ci->is_excluded) {
2841 p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]);
2842 if (p != i) {
2843 printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n",
2844 ci->decomp_data[0], ci->decomp_data[1], p, i);
2845 exit(1);
2846 }
2847 }
2848 }
2849
2850
2851
2852 }
2853
2854 #endif
2855
2856
2857
2858 #ifdef USE_TEST
2859
/* Compare the result (buf1, len1) with the reference (buf2, len2).
   On any difference, print test number 'num', the label 'msg', the
   input and both strings, then exit with status 1. */
void check_str(const char *msg, int num, const int *in_buf, int in_len,
               const int *buf1, int len1,
               const int *buf2, int len2)
{
    /* the contents are only compared when the lengths agree */
    int matches = (len1 == len2) && tabcmp(buf1, buf2, len1) == 0;

    if (matches)
        return;

    printf("%d: ERROR %s:\n", num, msg);
    dump_str(" in", in_buf, in_len);
    dump_str("res", buf1, len1);
    dump_str("ref", buf2, len2);
    exit(1);
}
2872
check_cc_table(void)2873 void check_cc_table(void)
2874 {
2875 int cc, cc_ref, c;
2876
2877 for(c = 0; c <= CHARCODE_MAX; c++) {
2878 cc_ref = unicode_db[c].combining_class;
2879 cc = unicode_get_cc(c);
2880 if (cc != cc_ref) {
2881 printf("ERROR: c=%04x cc=%d cc_ref=%d\n",
2882 c, cc, cc_ref);
2883 exit(1);
2884 }
2885 }
2886 #ifdef PROFILE
2887 {
2888 int64_t ti, count;
2889
2890 ti = get_time_ns();
2891 count = 0;
2892 /* only do it on meaningful chars */
2893 for(c = 0x20; c <= 0xffff; c++) {
2894 cc_ref = unicode_db[c].combining_class;
2895 cc = unicode_get_cc(c);
2896 count++;
2897 }
2898 ti = get_time_ns() - ti;
2899 printf("cc time=%0.1f ns/char\n",
2900 (double)ti / count);
2901 }
2902 #endif
2903 }
2904
normalization_test(const char * filename)2905 void normalization_test(const char *filename)
2906 {
2907 FILE *f;
2908 char line[4096], *p;
2909 int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str;
2910 int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len;
2911 int *buf, buf_len, pos;
2912
2913 f = fopen(filename, "rb");
2914 if (!f) {
2915 perror(filename);
2916 exit(1);
2917 }
2918 pos = 0;
2919 for(;;) {
2920 if (!get_line(line, sizeof(line), f))
2921 break;
2922 pos++;
2923 p = line;
2924 while (isspace(*p))
2925 p++;
2926 if (*p == '#' || *p == '@')
2927 continue;
2928 in_str = get_field_str(&in_len, p, 0);
2929 nfc_str = get_field_str(&nfc_len, p, 1);
2930 nfd_str = get_field_str(&nfd_len, p, 2);
2931 nfkc_str = get_field_str(&nfkc_len, p, 3);
2932 nfkd_str = get_field_str(&nfkd_len, p, 4);
2933
2934 // dump_str("in", in_str, in_len);
2935
2936 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL);
2937 check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len);
2938 free(buf);
2939
2940 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL);
2941 check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len);
2942 free(buf);
2943
2944 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL);
2945 check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len);
2946 free(buf);
2947
2948 buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL);
2949 check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len);
2950 free(buf);
2951
2952 free(in_str);
2953 free(nfc_str);
2954 free(nfd_str);
2955 free(nfkc_str);
2956 free(nfkd_str);
2957 }
2958 fclose(f);
2959 }
2960 #endif
2961
main(int argc,char ** argv)2962 int main(int argc, char **argv)
2963 {
2964 const char *unicode_db_path, *outfilename;
2965 char filename[1024];
2966
2967 if (argc < 2) {
2968 printf("usage: %s unicode_db_path [output_file]\n"
2969 "\n"
2970 "If no output_file is given, a self test is done using the current unicode library\n",
2971 argv[0]);
2972 exit(1);
2973 }
2974 unicode_db_path = argv[1];
2975 outfilename = NULL;
2976 if (argc >= 3)
2977 outfilename = argv[2];
2978
2979 unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
2980
2981 snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
2982
2983 parse_unicode_data(filename);
2984
2985 snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path);
2986 parse_special_casing(unicode_db, filename);
2987
2988 snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path);
2989 parse_case_folding(unicode_db, filename);
2990
2991 snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path);
2992 parse_composition_exclusions(filename);
2993
2994 snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path);
2995 parse_derived_core_properties(filename);
2996
2997 snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path);
2998 parse_derived_norm_properties(filename);
2999
3000 snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path);
3001 parse_prop_list(filename);
3002
3003 snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path);
3004 parse_scripts(filename);
3005
3006 snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt",
3007 unicode_db_path);
3008 parse_script_extensions(filename);
3009
3010 snprintf(filename, sizeof(filename), "%s/emoji-data.txt",
3011 unicode_db_path);
3012 parse_prop_list(filename);
3013
3014 // dump_data(unicode_db);
3015
3016 build_conv_table(unicode_db);
3017
3018 // dump_table();
3019
3020 if (!outfilename) {
3021 #ifdef USE_TEST
3022 check_case_conv();
3023 check_flags();
3024 check_decompose_table();
3025 check_compose_table();
3026 check_cc_table();
3027 snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path);
3028 normalization_test(filename);
3029 #else
3030 fprintf(stderr, "Tests are not compiled\n");
3031 exit(1);
3032 #endif
3033 } else
3034 {
3035 FILE *fo = fopen(outfilename, "wb");
3036
3037 if (!fo) {
3038 perror(outfilename);
3039 exit(1);
3040 }
3041 fprintf(fo,
3042 "/* Compressed unicode tables */\n"
3043 "/* Automatically generated file - do not edit */\n"
3044 "\n"
3045 "#include <stdint.h>\n"
3046 "\n");
3047 dump_case_conv_table(fo);
3048 compute_internal_props();
3049 build_flags_tables(fo);
3050 fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n");
3051 build_cc_table(fo);
3052 build_decompose_table(fo);
3053 build_general_category_table(fo);
3054 build_script_table(fo);
3055 build_script_ext_table(fo);
3056 build_prop_list_table(fo);
3057 fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
3058 fclose(fo);
3059 }
3060 return 0;
3061 }
3062