• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Generation of Unicode tables
3  *
4  * Copyright (c) 2017-2018 Fabrice Bellard
5  * Copyright (c) 2017-2018 Charlie Gordon
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 #include <stdlib.h>
26 #include <stdio.h>
27 #include <stdarg.h>
28 #include <inttypes.h>
29 #include <string.h>
30 #include <assert.h>
31 #include <ctype.h>
32 #include <time.h>
33 
34 #include "cutils.h"
35 
36 /* define it to be able to test unicode.c */
37 //#define USE_TEST
38 /* profile tests */
39 //#define PROFILE
40 
41 //#define DUMP_CASE_CONV_TABLE
42 //#define DUMP_TABLE_SIZE
43 //#define DUMP_CC_TABLE
44 //#define DUMP_DECOMP_TABLE
45 
46 /* Ideas:
47    - Generalize run length encoding + index for all tables
48    - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
49 
50    Case conversion:
51    - use a single entry for consecutive U/LF runs
52    - allow EXT runs of length > 1
53 
54    Decomposition:
55    - Greek lower case (+1f10/1f10) ?
56    - allow holes in B runs
57    - suppress more upper / lower case redundancy
58 */
59 
60 #ifdef USE_TEST
61 #include "libunicode.c"
62 #endif
63 
64 #define CHARCODE_MAX 0x10ffff
65 #define CC_LEN_MAX 3
66 
/*
 * Allocate 'size' bytes of zero-initialized memory.
 *
 * The process is terminated with an error message when the allocation
 * fails, so a non-zero request never yields NULL. A zero-sized request
 * returns whatever calloc() returns for it (possibly NULL).
 * The caller releases the block with free().
 */
void *mallocz(size_t size)
{
    void *ptr;

    /* calloc() zero-fills, so no memset() on a possibly NULL pointer */
    ptr = calloc(1, size);
    if (!ptr && size != 0) {
        fprintf(stderr, "mallocz: out of memory (%lu bytes)\n",
                (unsigned long)size);
        exit(1);
    }
    return ptr;
}
74 
/*
 * Return a pointer to the beginning of field number 'n' (counted from 0)
 * of the ';'-separated record 'p'. The result points into 'p' itself.
 * NULL is returned when the string ends before 'n' separators were found.
 */
const char *get_field(const char *p, int n)
{
    int remaining = n;

    while (remaining-- > 0) {
        /* advance to the separator that closes the current field */
        for (; *p != ';'; p++) {
            if (*p == '\0')
                return NULL;
        }
        p++; /* step over the ';' */
    }
    return p;
}
87 
/*
 * Copy field number 'n' (counted from 0) of the ';'-separated record 'p'
 * into 'buf' as a NUL-terminated string and return 'buf'.
 *
 * At most buf_size - 1 characters are stored; a longer field is silently
 * truncated. When the record has no field 'n', 'buf' receives the empty
 * string. When buf_size is 0 nothing is written at all.
 */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    size_t len = 0;

    /* no room even for the terminator (buf_size - 1 would wrap around) */
    if (buf_size == 0)
        return buf;

    p = get_field(p, n);
    if (p) {
        for (; *p != ';' && *p != '\0'; p++) {
            if (len < buf_size - 1)
                buf[len++] = *p;
        }
    }
    buf[len] = '\0';
    return buf;
}
101 
/*
 * Append 'c' to the growable int array described by (*pbuf, *psize, *plen).
 *
 *   pbuf  - address of the array pointer (may point to NULL when empty)
 *   psize - address of the allocated capacity, in elements
 *   plen  - address of the number of elements in use
 *
 * When the array is full its capacity grows by about one half (and always
 * by at least one element). The process exits with an error message if the
 * reallocation fails.
 */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len = *plen;
    int size = *psize;

    if (len >= size) {
        int *new_buf;
        /* size + size / 2 equals size * 3 / 2 for size >= 0, without the
           overflow-prone multiplication */
        int new_size = size + size / 2;

        if (new_size < len + 1)
            new_size = len + 1;
        new_buf = realloc(*pbuf, sizeof((*pbuf)[0]) * new_size);
        if (!new_buf) {
            /* the message goes to the stderr stream: fprintf, not sprintf */
            fprintf(stderr, "realloc failed. file:%s func:%s line:%d\n",
                    __FILE__, __func__, __LINE__);
            exit(1);
        }
        *pbuf = new_buf;
        *psize = new_size;
    }
    (*pbuf)[len] = c;
    *plen = len + 1;
}
122 
/*
 * Parse field number 'n' of the ';'-separated record 'str' as a sequence
 * of whitespace-separated hexadecimal numbers.
 *
 * The numbers are returned in an array grown with add_char(); its length
 * is stored in *plen. Parsing stops at the first character that is neither
 * white space nor a hexadecimal digit. When the field does not exist, or
 * holds no number, NULL is returned and *plen is 0.
 */
int *get_field_str(int *plen, const char *str, int n)
{
    int *codes = NULL;
    int count = 0, capacity = 0;
    const char *cur = get_field(str, n);

    if (cur == NULL) {
        *plen = 0;
        return NULL;
    }

    for (;;) {
        char *end;

        while (isspace(*cur))
            cur++;
        if (!isxdigit(*cur))
            break;
        add_char(&codes, &capacity, &count, strtoul(cur, &end, 16));
        cur = end;
    }
    *plen = count;
    return codes;
}
145 
/*
 * Read the next line of 'f' into 'buf' (at most buf_size - 1 characters)
 * and strip its terminating '\n', if any.
 * Returns 'buf', or NULL when fgets() reports end of file or an error.
 */
char *get_line(char *buf, int buf_size, FILE *f)
{
    if (fgets(buf, buf_size, f) == NULL)
        return NULL;
    /* fgets() stops right after a newline, so the first '\n' found (if any)
       is the last character; otherwise this rewrites the terminator */
    buf[strcspn(buf, "\n")] = '\0';
    return buf;
}
156 
157 #define UNICODE_GENERAL_CATEGORY
158 
159 typedef enum {
160 #define DEF(id, str) GCAT_ ## id,
161 #include "unicode_gen_def.h"
162 #undef DEF
163     GCAT_COUNT,
164 } UnicodeGCEnum1;
165 
166 static const char *unicode_gc_name[] = {
167 #define DEF(id, str) #id,
168 #include "unicode_gen_def.h"
169 #undef DEF
170 };
171 
172 static const char *unicode_gc_short_name[] = {
173 #define DEF(id, str) str,
174 #include "unicode_gen_def.h"
175 #undef DEF
176 };
177 
178 #undef UNICODE_GENERAL_CATEGORY
179 
180 #define UNICODE_SCRIPT
181 
182 typedef enum {
183 #define DEF(id, str) SCRIPT_ ## id,
184 #include "unicode_gen_def.h"
185 #undef DEF
186     SCRIPT_COUNT,
187 } UnicodeScriptEnum1;
188 
189 static const char *unicode_script_name[] = {
190 #define DEF(id, str) #id,
191 #include "unicode_gen_def.h"
192 #undef DEF
193 };
194 
195 const char *unicode_script_short_name[] = {
196 #define DEF(id, str) str,
197 #include "unicode_gen_def.h"
198 #undef DEF
199 };
200 
201 #undef UNICODE_SCRIPT
202 
203 #define UNICODE_PROP_LIST
204 
205 typedef enum {
206 #define DEF(id, str) PROP_ ## id,
207 #include "unicode_gen_def.h"
208 #undef DEF
209     PROP_COUNT,
210 } UnicodePropEnum1;
211 
212 static const char *unicode_prop_name[] = {
213 #define DEF(id, str) #id,
214 #include "unicode_gen_def.h"
215 #undef DEF
216 };
217 
218 static const char *unicode_prop_short_name[] = {
219 #define DEF(id, str) str,
220 #include "unicode_gen_def.h"
221 #undef DEF
222 };
223 
224 #undef UNICODE_SPROP_LIST
225 
226 typedef struct {
227     /* case conv */
228     uint8_t u_len;
229     uint8_t l_len;
230     int u_data[CC_LEN_MAX];
231     int l_data[CC_LEN_MAX];
232     int f_code;
233 
234     uint8_t combining_class;
235     uint8_t is_compat:1;
236     uint8_t is_excluded:1;
237     uint8_t general_category;
238     uint8_t script;
239     uint8_t script_ext_len;
240     uint8_t *script_ext;
241     uint32_t prop_bitmap_tab[3];
242     /* decomposition */
243     int decomp_len;
244     int *decomp_data;
245 } CCInfo;
246 
247 CCInfo *unicode_db;
248 
/*
 * Search 'name' in a table of 'tab_len' strings. Every table entry is a
 * comma-separated list of alternatives; 'name' must be equal to one whole
 * alternative. Returns the index of the first matching entry, or -1.
 */
int find_name(const char **tab, int tab_len, const char *name)
{
    const size_t name_len = strlen(name);
    int idx;

    for (idx = 0; idx < tab_len; idx++) {
        const char *alias = tab[idx];

        for (;;) {
            /* length of the alternative, up to the next ',' or the end */
            size_t alias_len = strcspn(alias, ",");

            if (alias_len == name_len && memcmp(alias, name, name_len) == 0)
                return idx;
            if (alias[alias_len] == '\0')
                break;
            alias += alias_len + 1;
        }
    }
    return -1;
}
272 
get_prop(uint32_t c,int prop_idx)273 static int get_prop(uint32_t c, int prop_idx)
274 {
275     return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
276 }
277 
set_prop(uint32_t c,int prop_idx,int val)278 static void set_prop(uint32_t c, int prop_idx, int val)
279 {
280     uint32_t mask;
281     mask = 1U << (prop_idx & 0x1f);
282     if (val)
283         unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask;
284     else
285         unicode_db[c].prop_bitmap_tab[prop_idx >> 5]  &= ~mask;
286 }
287 
parse_unicode_data(const char * filename)288 void parse_unicode_data(const char *filename)
289 {
290     FILE *f;
291     char line[1024];
292     char buf1[256];
293     const char *p;
294     int code, lc, uc, last_code;
295     CCInfo *ci, *tab = unicode_db;
296 
297     f = fopen(filename, "rb");
298     if (!f) {
299         perror(filename);
300         exit(1);
301     }
302 
303     last_code = 0;
304     for(;;) {
305         if (!get_line(line, sizeof(line), f))
306             break;
307         p = line;
308         while (isspace(*p))
309             p++;
310         if (*p == '#')
311             continue;
312 
313         p = get_field(line, 0);
314         if (!p)
315             continue;
316         code = strtoul(p, NULL, 16);
317         lc = 0;
318         uc = 0;
319 
320         p = get_field(line, 12);
321         if (p && *p != ';') {
322             uc = strtoul(p, NULL, 16);
323         }
324 
325         p = get_field(line, 13);
326         if (p && *p != ';') {
327             lc = strtoul(p, NULL, 16);
328         }
329         ci = &tab[code];
330         if (uc > 0 || lc > 0) {
331             assert(code <= CHARCODE_MAX);
332             if (uc > 0) {
333                 assert(ci->u_len == 0);
334                 ci->u_len = 1;
335                 ci->u_data[0] = uc;
336             }
337             if (lc > 0) {
338                 assert(ci->l_len == 0);
339                 ci->l_len = 1;
340                 ci->l_data[0] = lc;
341             }
342         }
343 
344         {
345             int i;
346             get_field_buf(buf1, sizeof(buf1), line, 2);
347             i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
348             if (i < 0) {
349                 fprintf(stderr, "General category '%s' not found\n",
350                         buf1);
351                 exit(1);
352             }
353             ci->general_category = i;
354         }
355 
356         p = get_field(line, 3);
357         if (p && *p != ';' && *p != '\0') {
358             int cc;
359             cc = strtoul(p, NULL, 0);
360             if (cc != 0) {
361                 assert(code <= CHARCODE_MAX);
362                 ci->combining_class = cc;
363                 //                printf("%05x: %d\n", code, ci->combining_class);
364             }
365         }
366 
367         p = get_field(line, 5);
368         if (p && *p != ';' && *p != '\0') {
369             int size;
370             assert(code <= CHARCODE_MAX);
371             ci->is_compat = 0;
372             if (*p == '<') {
373                 while (*p != '\0' && *p != '>')
374                     p++;
375                 if (*p == '>')
376                     p++;
377                 ci->is_compat = 1;
378             }
379             size = 0;
380             for(;;) {
381                 while (isspace(*p))
382                     p++;
383                 if (!isxdigit(*p))
384                     break;
385                 add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16));
386             }
387 #if 0
388             {
389                 int i;
390                 static int count, d_count;
391 
392                 printf("%05x: %c", code, ci->is_compat ? 'C': ' ');
393                 for(i = 0; i < ci->decomp_len; i++)
394                     printf(" %05x", ci->decomp_data[i]);
395                 printf("\n");
396                 count++;
397                 d_count += ci->decomp_len;
398                 //                printf("%d %d\n", count, d_count);
399             }
400 #endif
401         }
402 
403         p = get_field(line, 9);
404         if (p && *p == 'Y') {
405             set_prop(code, PROP_Bidi_Mirrored, 1);
406         }
407 
408         /* handle ranges */
409         get_field_buf(buf1, sizeof(buf1), line, 1);
410         if (strstr(buf1, " Last>")) {
411             int i;
412             //            printf("range: 0x%x-%0x\n", last_code, code);
413             assert(ci->decomp_len == 0);
414             assert(ci->script_ext_len == 0);
415             for(i = last_code + 1; i < code; i++) {
416                 unicode_db[i] = *ci;
417             }
418         }
419         last_code = code;
420     }
421 
422     fclose(f);
423 }
424 
parse_special_casing(CCInfo * tab,const char * filename)425 void parse_special_casing(CCInfo *tab, const char *filename)
426 {
427     FILE *f;
428     char line[1024];
429     const char *p;
430     int code;
431     CCInfo *ci;
432 
433     f = fopen(filename, "rb");
434     if (!f) {
435         perror(filename);
436         exit(1);
437     }
438 
439     for(;;) {
440         if (!get_line(line, sizeof(line), f))
441             break;
442         p = line;
443         while (isspace(*p))
444             p++;
445         if (*p == '#')
446             continue;
447 
448         p = get_field(line, 0);
449         if (!p)
450             continue;
451         code = strtoul(p, NULL, 16);
452         assert(code <= CHARCODE_MAX);
453         ci = &tab[code];
454 
455         p = get_field(line, 4);
456         if (p) {
457             /* locale dependent casing */
458             while (isspace(*p))
459                 p++;
460             if (*p != '#' && *p != '\0')
461                 continue;
462         }
463 
464 
465         p = get_field(line, 1);
466         if (p && *p != ';') {
467             ci->l_len = 0;
468             for(;;) {
469                 while (isspace(*p))
470                     p++;
471                 if (*p == ';')
472                     break;
473                 assert(ci->l_len < CC_LEN_MAX);
474                 ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16);
475             }
476 
477             if (ci->l_len == 1 && ci->l_data[0] == code)
478                 ci->l_len = 0;
479         }
480 
481         p = get_field(line, 3);
482         if (p && *p != ';') {
483             ci->u_len = 0;
484             for(;;) {
485                 while (isspace(*p))
486                     p++;
487                 if (*p == ';')
488                     break;
489                 assert(ci->u_len < CC_LEN_MAX);
490                 ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16);
491             }
492 
493             if (ci->u_len == 1 && ci->u_data[0] == code)
494                 ci->u_len = 0;
495         }
496     }
497 
498     fclose(f);
499 }
500 
/*
 * Read the ';'-separated records of 'filename' and store the case folding
 * target (field 2, hexadecimal) in the f_code member of the entry of 'tab'
 * selected by field 0 (hexadecimal code point).
 *
 * Only records whose field 1 starts with 'C' or 'S' (after white space)
 * are used; the others are skipped, as are lines starting with '#'.
 * The process exits if the file cannot be opened.
 */
void parse_case_folding(CCInfo *tab, const char *filename)
{
    char line[1024];
    const char *p;
    int code;
    CCInfo *ci;
    FILE *f = fopen(filename, "rb");

    if (f == NULL) {
        perror(filename);
        exit(1);
    }

    while (get_line(line, sizeof(line), f) != NULL) {
        for (p = line; isspace(*p); p++)
            continue;
        if (*p == '#')
            continue;

        p = get_field(line, 0);
        if (p == NULL)
            continue;
        code = strtoul(p, NULL, 16);
        assert(code <= CHARCODE_MAX);
        ci = tab + code;

        p = get_field(line, 1);
        if (p == NULL)
            continue;
        /* locale dependent casing */
        while (isspace(*p))
            p++;
        if (!(*p == 'C' || *p == 'S'))
            continue;

        p = get_field(line, 2);
        assert(p != 0);
        /* each code point may receive a single folding */
        assert(ci->f_code == 0);
        ci->f_code = strtoul(p, NULL, 16);
        assert(ci->f_code != 0 && ci->f_code != code);
    }

    fclose(f);
}
549 
parse_composition_exclusions(const char * filename)550 void parse_composition_exclusions(const char *filename)
551 {
552     FILE *f;
553     char line[4096], *p;
554     uint32_t c0;
555 
556     f = fopen(filename, "rb");
557     if (!f) {
558         perror(filename);
559         exit(1);
560     }
561 
562     for(;;) {
563         if (!get_line(line, sizeof(line), f))
564             break;
565         p = line;
566         while (isspace(*p))
567             p++;
568         if (*p == '#' || *p == '@' || *p == '\0')
569             continue;
570         c0 = strtoul(p, (char **)&p, 16);
571         assert(c0 > 0 && c0 <= CHARCODE_MAX);
572         unicode_db[c0].is_excluded = TRUE;
573     }
574     fclose(f);
575 }
576 
parse_derived_core_properties(const char * filename)577 void parse_derived_core_properties(const char *filename)
578 {
579     FILE *f;
580     char line[4096], *p, buf[256], *q;
581     uint32_t c0, c1, c;
582     int i;
583 
584     f = fopen(filename, "rb");
585     if (!f) {
586         perror(filename);
587         exit(1);
588     }
589 
590     for(;;) {
591         if (!get_line(line, sizeof(line), f))
592             break;
593         p = line;
594         while (isspace(*p))
595             p++;
596         if (*p == '#' || *p == '@' || *p == '\0')
597             continue;
598         c0 = strtoul(p, (char **)&p, 16);
599         if (*p == '.' && p[1] == '.') {
600             p += 2;
601             c1 = strtoul(p, (char **)&p, 16);
602         } else {
603             c1 = c0;
604         }
605         assert(c1 <= CHARCODE_MAX);
606         p += strspn(p, " \t");
607         if (*p == ';') {
608             p++;
609             p += strspn(p, " \t");
610             q = buf;
611             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
612                 if ((q - buf) < sizeof(buf) - 1)
613                     *q++ = *p;
614                 p++;
615             }
616             *q = '\0';
617             i = find_name(unicode_prop_name,
618                           countof(unicode_prop_name), buf);
619             if (i < 0) {
620                 if (!strcmp(buf, "Grapheme_Link"))
621                     goto next;
622                 fprintf(stderr, "Property not found: %s\n", buf);
623                 exit(1);
624             }
625             for(c = c0; c <= c1; c++) {
626                 set_prop(c, i, 1);
627             }
628 next: ;
629         }
630     }
631     fclose(f);
632 }
633 
parse_derived_norm_properties(const char * filename)634 void parse_derived_norm_properties(const char *filename)
635 {
636     FILE *f;
637     char line[4096], *p, buf[256], *q;
638     uint32_t c0, c1, c;
639 
640     f = fopen(filename, "rb");
641     if (!f) {
642         perror(filename);
643         exit(1);
644     }
645 
646     for(;;) {
647         if (!get_line(line, sizeof(line), f))
648             break;
649         p = line;
650         while (isspace(*p))
651             p++;
652         if (*p == '#' || *p == '@' || *p == '\0')
653             continue;
654         c0 = strtoul(p, (char **)&p, 16);
655         if (*p == '.' && p[1] == '.') {
656             p += 2;
657             c1 = strtoul(p, (char **)&p, 16);
658         } else {
659             c1 = c0;
660         }
661         assert(c1 <= CHARCODE_MAX);
662         p += strspn(p, " \t");
663         if (*p == ';') {
664             p++;
665             p += strspn(p, " \t");
666             q = buf;
667             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
668                 if ((q - buf) < sizeof(buf) - 1)
669                     *q++ = *p;
670                 p++;
671             }
672             *q = '\0';
673             if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
674                 for(c = c0; c <= c1; c++) {
675                     set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
676                 }
677             }
678         }
679     }
680     fclose(f);
681 }
682 
parse_prop_list(const char * filename)683 void parse_prop_list(const char *filename)
684 {
685     FILE *f;
686     char line[4096], *p, buf[256], *q;
687     uint32_t c0, c1, c;
688     int i;
689 
690     f = fopen(filename, "rb");
691     if (!f) {
692         perror(filename);
693         exit(1);
694     }
695 
696     for(;;) {
697         if (!get_line(line, sizeof(line), f))
698             break;
699         p = line;
700         while (isspace(*p))
701             p++;
702         if (*p == '#' || *p == '@' || *p == '\0')
703             continue;
704         c0 = strtoul(p, (char **)&p, 16);
705         if (*p == '.' && p[1] == '.') {
706             p += 2;
707             c1 = strtoul(p, (char **)&p, 16);
708         } else {
709             c1 = c0;
710         }
711         assert(c1 <= CHARCODE_MAX);
712         p += strspn(p, " \t");
713         if (*p == ';') {
714             p++;
715             p += strspn(p, " \t");
716             q = buf;
717             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
718                 if ((q - buf) < sizeof(buf) - 1)
719                     *q++ = *p;
720                 p++;
721             }
722             *q = '\0';
723             i = find_name(unicode_prop_name,
724                           countof(unicode_prop_name), buf);
725             if (i < 0) {
726                 fprintf(stderr, "Property not found: %s\n", buf);
727                 exit(1);
728             }
729             for(c = c0; c <= c1; c++) {
730                 set_prop(c, i, 1);
731             }
732         }
733     }
734     fclose(f);
735 }
736 
parse_scripts(const char * filename)737 void parse_scripts(const char *filename)
738 {
739     FILE *f;
740     char line[4096], *p, buf[256], *q;
741     uint32_t c0, c1, c;
742     int i;
743 
744     f = fopen(filename, "rb");
745     if (!f) {
746         perror(filename);
747         exit(1);
748     }
749 
750     for(;;) {
751         if (!get_line(line, sizeof(line), f))
752             break;
753         p = line;
754         while (isspace(*p))
755             p++;
756         if (*p == '#' || *p == '@' || *p == '\0')
757             continue;
758         c0 = strtoul(p, (char **)&p, 16);
759         if (*p == '.' && p[1] == '.') {
760             p += 2;
761             c1 = strtoul(p, (char **)&p, 16);
762         } else {
763             c1 = c0;
764         }
765         assert(c1 <= CHARCODE_MAX);
766         p += strspn(p, " \t");
767         if (*p == ';') {
768             p++;
769             p += strspn(p, " \t");
770             q = buf;
771             while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
772                 if ((q - buf) < sizeof(buf) - 1)
773                     *q++ = *p;
774                 p++;
775             }
776             *q = '\0';
777             i = find_name(unicode_script_name,
778                           countof(unicode_script_name), buf);
779             if (i < 0) {
780                 fprintf(stderr, "Unknown script: '%s'\n", buf);
781                 exit(1);
782             }
783             for(c = c0; c <= c1; c++)
784                 unicode_db[c].script = i;
785         }
786     }
787     fclose(f);
788 }
789 
parse_script_extensions(const char * filename)790 void parse_script_extensions(const char *filename)
791 {
792     FILE *f;
793     char line[4096], *p, buf[256], *q;
794     uint32_t c0, c1, c;
795     int i;
796     uint8_t script_ext[255];
797     int script_ext_len;
798 
799     f = fopen(filename, "rb");
800     if (!f) {
801         perror(filename);
802         exit(1);
803     }
804 
805     for(;;) {
806         if (!get_line(line, sizeof(line), f))
807             break;
808         p = line;
809         while (isspace(*p))
810             p++;
811         if (*p == '#' || *p == '@' || *p == '\0')
812             continue;
813         c0 = strtoul(p, (char **)&p, 16);
814         if (*p == '.' && p[1] == '.') {
815             p += 2;
816             c1 = strtoul(p, (char **)&p, 16);
817         } else {
818             c1 = c0;
819         }
820         assert(c1 <= CHARCODE_MAX);
821         p += strspn(p, " \t");
822         script_ext_len = 0;
823         if (*p == ';') {
824             p++;
825             for(;;) {
826                 p += strspn(p, " \t");
827                 q = buf;
828                 while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
829                     if ((q - buf) < sizeof(buf) - 1)
830                         *q++ = *p;
831                     p++;
832                 }
833                 *q = '\0';
834                 if (buf[0] == '\0')
835                     break;
836                 i = find_name(unicode_script_short_name,
837                               countof(unicode_script_short_name), buf);
838                 if (i < 0) {
839                     fprintf(stderr, "Script not found: %s\n", buf);
840                     exit(1);
841                 }
842                 assert(script_ext_len < sizeof(script_ext));
843                 script_ext[script_ext_len++] = i;
844             }
845             for(c = c0; c <= c1; c++) {
846                 CCInfo *ci = &unicode_db[c];
847                 ci->script_ext_len = script_ext_len;
848                 ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
849                 for(i = 0; i < script_ext_len; i++)
850                     ci->script_ext[i] = script_ext[i];
851             }
852         }
853     }
854     fclose(f);
855 }
856 
/*
 * Print the case conversion data of 'ci' on one line of stdout:
 * the code point 'i', then the non-empty upper case ("U:") and lower case
 * ("L:") mappings and the non-zero case folding ("F:"), all in hexadecimal.
 */
void dump_cc_info(CCInfo *ci, int i)
{
    const int *d;

    printf("%05x:", i);
    if (ci->u_len != 0) {
        printf(" U:");
        for (d = ci->u_data; d < ci->u_data + ci->u_len; d++)
            printf(" %05x", *d);
    }
    if (ci->l_len != 0) {
        printf(" L:");
        for (d = ci->l_data; d < ci->l_data + ci->l_len; d++)
            printf(" %05x", *d);
    }
    if (ci->f_code != 0)
        printf(" F: %05x", ci->f_code);
    printf("\n");
}
876 
/* Print, with dump_cc_info(), every entry of 'tab' (code points 0 to
   CHARCODE_MAX) that has a case mapping or a case folding. */
void dump_data(CCInfo *tab)
{
    int code;

    for (code = 0; code <= CHARCODE_MAX; code++) {
        CCInfo *ci = tab + code;

        /* nothing to show for this code point */
        if (ci->u_len == 0 && ci->l_len == 0 && ci->f_code == 0)
            continue;
        dump_cc_info(ci, code);
    }
}
888 
is_complicated_case(const CCInfo * ci)889 BOOL is_complicated_case(const CCInfo *ci)
890 {
891     return (ci->u_len > 1 || ci->l_len > 1 ||
892             (ci->u_len > 0 && ci->l_len > 0) ||
893             (ci->f_code != 0) != ci->l_len ||
894             (ci->f_code != 0 && ci->l_data[0] != ci->f_code));
895 }
896 
/* Kinds of runs stored in the case conversion table. The enumeration is
   only declared here when USE_TEST is not defined. */
#ifndef USE_TEST
enum {
    RUN_TYPE_U,
    RUN_TYPE_L,
    RUN_TYPE_UF,
    RUN_TYPE_LF,
    RUN_TYPE_UL,
    RUN_TYPE_LSU,
    RUN_TYPE_U2L_399_EXT2,
    RUN_TYPE_UF_D20,
    RUN_TYPE_UF_D1_EXT,
    RUN_TYPE_U_EXT,
    RUN_TYPE_LF_EXT,
    RUN_TYPE_U_EXT2,
    RUN_TYPE_L_EXT2,
    RUN_TYPE_U_EXT3,
};
#endif

/* printable names of the run types, in the order of the enumeration above */
const char *run_type_str[] = {
    "U", "L", "UF", "LF", "UL", "LSU",
    "U2L_399_EXT2",
    "UF_D20", "UF_D1_EXT",
    "U_EXT", "LF_EXT",
    "U_EXT2", "L_EXT2", "U_EXT3",
};

/* one run of the case conversion table */
typedef struct {
    int code;        /* first code point of the run */
    int len;         /* number of code points in the run */
    int type;        /* RUN_TYPE_x */
    int data;
    int ext_len;     /* number of valid entries in ext_data[] */
    int ext_data[3];
    int data_index;  /* 'data' coming from the table */
} TableEntry;
942 
943 /* code (17), len (7), type (4) */
944 
find_run_type(TableEntry * te,CCInfo * tab,int code)945 void find_run_type(TableEntry *te, CCInfo *tab, int code)
946 {
947     int is_lower, len;
948     CCInfo *ci, *ci1, *ci2;
949 
950     ci = &tab[code];
951     ci1 = &tab[code + 1];
952     ci2 = &tab[code + 2];
953     te->code = code;
954 
955     if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
956         ci->f_code == ci->l_data[0] &&
957         ci->u_len == 0 &&
958 
959         ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
960         ci1->f_code == ci1->l_data[0] &&
961         ci1->u_len == 1 && ci1->u_data[0] == code &&
962 
963         ci2->l_len == 0 &&
964         ci2->f_code == 0 &&
965         ci2->u_len == 1 && ci2->u_data[0] == code) {
966         te->len = 3;
967         te->data = 0;
968         te->type = RUN_TYPE_LSU;
969         return;
970     }
971 
972     if (is_complicated_case(ci)) {
973         len = 1;
974         while (code + len <= CHARCODE_MAX) {
975             ci1 = &tab[code + len];
976             if (ci1->u_len != 1 ||
977                 ci1->u_data[0] != ci->u_data[0] + len ||
978                 ci1->l_len != 0 ||
979                 ci1->f_code != ci1->u_data[0])
980                 break;
981             len++;
982         }
983         if (len > 1) {
984             te->len = len;
985             te->type = RUN_TYPE_UF;
986             te->data = ci->u_data[0];
987             return;
988         }
989 
990         if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
991             ci->f_code == 0 && ci->l_len == 0) {
992             len = 1;
993             while (code + len <= CHARCODE_MAX) {
994                 ci1 = &tab[code + len];
995                 if (!(ci1->u_len == 2 &&
996                     ci1->u_data[1] == 0x399 &&
997                       ci1->u_data[0] == ci->u_data[0] + len &&
998                       ci1->f_code == 0 &&
999                       ci1->l_len == 0))
1000                     break;
1001                 len++;
1002             }
1003             te->len = len;
1004             te->type = RUN_TYPE_U_EXT2;
1005             te->ext_data[0] = ci->u_data[0];
1006             te->ext_data[1] = ci->u_data[1];
1007             te->ext_len = 2;
1008             return;
1009         }
1010 
1011         if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
1012             ci->l_len == 1 && ci->f_code == ci->l_data[0]) {
1013             len = 1;
1014             while (code + len <= CHARCODE_MAX) {
1015                 ci1 = &tab[code + len];
1016                 if (!(ci1->u_len == 2 &&
1017                       ci1->u_data[1] == 0x399 &&
1018                       ci1->u_data[0] == ci->u_data[0] + len &&
1019                       ci1->l_len == 1 &&
1020                       ci1->l_data[0] == ci->l_data[0] + len &&
1021                       ci1->f_code == ci1->l_data[0]))
1022                     break;
1023                 len++;
1024             }
1025             te->len = len;
1026             te->type = RUN_TYPE_U2L_399_EXT2;
1027             te->ext_data[0] = ci->u_data[0];
1028             te->ext_data[1] = ci->l_data[0];
1029             te->ext_len = 2;
1030             return;
1031         }
1032 
1033         if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) {
1034             len = 1;
1035             while (code + len <= CHARCODE_MAX) {
1036                 ci1 = &tab[code + len];
1037                 if (!(ci1->l_len == 1 &&
1038                       ci1->l_data[0] == ci->l_data[0] + len &&
1039                       ci1->u_len == 0 && ci1->f_code == 0))
1040                     break;
1041                 len++;
1042             }
1043             te->len = len;
1044             te->type = RUN_TYPE_L;
1045             te->data = ci->l_data[0];
1046             return;
1047         }
1048 
1049         if (ci->l_len == 0 &&
1050             ci->u_len == 1 &&
1051             ci->u_data[0] < 0x1000 &&
1052             ci->f_code == ci->u_data[0] + 0x20) {
1053             te->len = 1;
1054             te->type = RUN_TYPE_UF_D20;
1055             te->data = ci->u_data[0];
1056         } else if (ci->l_len == 0 &&
1057             ci->u_len == 1 &&
1058             ci->f_code == ci->u_data[0] + 1) {
1059             te->len = 1;
1060             te->type = RUN_TYPE_UF_D1_EXT;
1061             te->ext_data[0] = ci->u_data[0];
1062             te->ext_len = 1;
1063         } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) {
1064             te->len = 1;
1065             te->type = RUN_TYPE_L_EXT2;
1066             te->ext_data[0] = ci->l_data[0];
1067             te->ext_data[1] = ci->l_data[1];
1068             te->ext_len = 2;
1069         } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) {
1070             te->len = 1;
1071             te->type = RUN_TYPE_U_EXT2;
1072             te->ext_data[0] = ci->u_data[0];
1073             te->ext_data[1] = ci->u_data[1];
1074             te->ext_len = 2;
1075         } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) {
1076             te->len = 1;
1077             te->type = RUN_TYPE_U_EXT3;
1078             te->ext_data[0] = ci->u_data[0];
1079             te->ext_data[1] = ci->u_data[1];
1080             te->ext_data[2] = ci->u_data[2];
1081             te->ext_len = 3;
1082         } else {
1083             printf("unsupported encoding case:\n");
1084             dump_cc_info(ci, code);
1085             abort();
1086         }
1087     } else {
1088         /* look for a run of identical conversions */
1089         len = 0;
1090         for(;;) {
1091             if (code >= CHARCODE_MAX || len >= 126)
1092                 break;
1093             ci = &tab[code + len];
1094             ci1 = &tab[code + len + 1];
1095             if (is_complicated_case(ci) || is_complicated_case(ci1)) {
1096                 break;
1097             }
1098             if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
1099                 break;
1100             if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
1101                 break;
1102             len += 2;
1103         }
1104         if (len > 0) {
1105             te->len = len;
1106             te->type = RUN_TYPE_UL;
1107             te->data = 0;
1108             return;
1109         }
1110 
1111         ci = &tab[code];
1112         is_lower = ci->l_len > 0;
1113         len = 1;
1114         while (code + len <= CHARCODE_MAX) {
1115             ci1 = &tab[code + len];
1116             if (is_complicated_case(ci1))
1117                 break;
1118             if (is_lower) {
1119                 if (ci1->l_len != 1 ||
1120                     ci1->l_data[0] != ci->l_data[0] + len)
1121                     break;
1122             } else {
1123                 if (ci1->u_len != 1 ||
1124                     ci1->u_data[0] != ci->u_data[0] + len)
1125                     break;
1126             }
1127             len++;
1128         }
1129         te->len = len;
1130         if (is_lower) {
1131             te->type = RUN_TYPE_LF;
1132             te->data = ci->l_data[0];
1133         } else {
1134             te->type = RUN_TYPE_U;
1135             te->data = ci->u_data[0];
1136         }
1137     }
1138 }
1139 
/* compact case conversion table, filled by build_conv_table() */
TableEntry conv_table[1000];
/* number of entries of conv_table[] currently in use */
int conv_table_len;
/* pool of extra data values referenced by index from the table
   entries; new values are appended by find_ext_data_index() */
int ext_data[1000];
/* number of entries of ext_data[] currently in use */
int ext_data_len;
1144 
dump_case_conv_table1(void)1145 void dump_case_conv_table1(void)
1146 {
1147     int i, j;
1148     const TableEntry *te;
1149 
1150     for(i = 0; i < conv_table_len; i++) {
1151         te = &conv_table[i];
1152         printf("%05x %02x %-10s %05x",
1153                te->code, te->len, run_type_str[te->type], te->data);
1154         for(j = 0; j < te->ext_len; j++) {
1155             printf(" %05x", te->ext_data[j]);
1156         }
1157         printf("\n");
1158     }
1159     printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len);
1160 }
1161 
find_data_index(const TableEntry * conv_table,int len,int data)1162 int find_data_index(const TableEntry *conv_table, int len, int data)
1163 {
1164     int i;
1165     const TableEntry *te;
1166     for(i = 0; i < len; i++) {
1167         te = &conv_table[i];
1168         if (te->code == data)
1169             return i;
1170     }
1171     return -1;
1172 }
1173 
find_ext_data_index(int data)1174 int find_ext_data_index(int data)
1175 {
1176     int i;
1177     for(i = 0; i < ext_data_len; i++) {
1178         if (ext_data[i] == data)
1179             return i;
1180     }
1181     assert(ext_data_len < countof(ext_data));
1182     ext_data[ext_data_len++] = data;
1183     return ext_data_len - 1;
1184 }
1185 
/*
 * Build the compact case conversion table from the per code point
 * information in 'tab' (indexed by code point, 0..CHARCODE_MAX).
 *
 * The result is stored in the globals conv_table[]/conv_table_len and
 * ext_data[]/ext_data_len. The 'data_index' field of every entry is
 * computed here. The program exits if a run of type L or UF refers to
 * a code which does not start any table entry.
 */
void build_conv_table(CCInfo *tab)
{
    int code, i, j;
    CCInfo *ci;
    TableEntry *te;

    /* pass 1: cover every code point having a case mapping with a run */
    te = conv_table;
    code = 0;
    while (code <= CHARCODE_MAX) {
        ci = &tab[code];
        if (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0) {
            assert(te - conv_table < countof(conv_table));
            find_run_type(te, tab, code);
#if 0
            if (te->type == RUN_TYPE_TODO) {
                printf("TODO: ");
                dump_cc_info(ci, code);
            }
#endif
            assert(te->len <= 127);
            /* continue right after the run */
            code += te->len;
            te++;
        } else {
            code++;
        }
    }
    conv_table_len = te - conv_table;

    /* pass 2: for the simple run types, 'data' is looked up as the
       starting code of another table entry */
    for(i = 0; i < conv_table_len; i++) {
        int data_index;

        te = &conv_table[i];
        if (te->type == RUN_TYPE_UF_D20) {
            te->data_index = te->data;
            continue;
        }
        if (te->type != RUN_TYPE_U && te->type != RUN_TYPE_L &&
            te->type != RUN_TYPE_UF && te->type != RUN_TYPE_LF)
            continue;

        data_index = find_data_index(conv_table, conv_table_len, te->data);
        if (data_index >= 0) {
            te->data_index = data_index;
            continue;
        }

        /* no entry starts at 'data': switch to the variant storing the
           value in ext_data[] when there is one */
        if (te->type == RUN_TYPE_U) {
            te->type = RUN_TYPE_U_EXT;
        } else if (te->type == RUN_TYPE_LF) {
            te->type = RUN_TYPE_LF_EXT;
        } else {
            printf("%05x: index not found\n", te->code);
            exit(1);
        }
        te->ext_len = 1;
        te->ext_data[0] = te->data;
    }

    /* The three following passes must stay in this order:
       find_ext_data_index() appends the unknown values to ext_data[],
       and the packed forms only have 4 (resp. 6) bits per index. */

    /* pass 3: three 4 bit indexes */
    for(te = conv_table; te < conv_table + conv_table_len; te++) {
        int p, v;

        if (te->type != RUN_TYPE_U_EXT3)
            continue;
        v = 0;
        for(j = 0; j < 3; j++) {
            p = find_ext_data_index(te->ext_data[j]);
            assert(p < 16);
            v = (v << 4) | p;
        }
        te->data_index = v;
    }

    /* pass 4: two 6 bit indexes */
    for(te = conv_table; te < conv_table + conv_table_len; te++) {
        int p, v;

        switch(te->type) {
        case RUN_TYPE_L_EXT2:
        case RUN_TYPE_U_EXT2:
        case RUN_TYPE_U2L_399_EXT2:
            v = 0;
            for(j = 0; j < 2; j++) {
                p = find_ext_data_index(te->ext_data[j]);
                assert(p < 64);
                v = (v << 6) | p;
            }
            te->data_index = v;
            break;
        default:
            break;
        }
    }

    /* pass 5: a single unpacked index */
    for(te = conv_table; te < conv_table + conv_table_len; te++) {
        switch(te->type) {
        case RUN_TYPE_UF_D1_EXT:
        case RUN_TYPE_U_EXT:
        case RUN_TYPE_LF_EXT:
            te->data_index = find_ext_data_index(te->ext_data[0]);
            break;
        default:
            break;
        }
    }
#ifdef DUMP_CASE_CONV_TABLE
    dump_case_conv_table1();
#endif
}
1291 
dump_case_conv_table(FILE * f)1292 void dump_case_conv_table(FILE *f)
1293 {
1294     int i;
1295     uint32_t v;
1296     const TableEntry *te;
1297 
1298     fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len);
1299     for(i = 0; i < conv_table_len; i++) {
1300         if (i % 4 == 0)
1301             fprintf(f, "\n   ");
1302         te = &conv_table[i];
1303         v = te->code << (32 - 17);
1304         v |= te->len << (32 - 17 - 7);
1305         v |= te->type << (32 - 17 - 7 - 4);
1306         v |= te->data_index >> 8;
1307         fprintf(f, " 0x%08x,", v);
1308     }
1309     fprintf(f, "\n};\n\n");
1310 
1311     fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len);
1312     for(i = 0; i < conv_table_len; i++) {
1313         if (i % 8 == 0)
1314             fprintf(f, "\n   ");
1315         te = &conv_table[i];
1316         fprintf(f, " 0x%02x,", te->data_index & 0xff);
1317     }
1318     fprintf(f, "\n};\n\n");
1319 
1320     fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len);
1321     for(i = 0; i < ext_data_len; i++) {
1322         if (i % 8 == 0)
1323             fprintf(f, "\n   ");
1324         fprintf(f, " 0x%04x,", ext_data[i]);
1325     }
1326     fprintf(f, "\n};\n\n");
1327 }
1328 
/* Compare the first 'n' integers of 'tab1' and 'tab2'. Return 0 if
   they are all equal (or if n <= 0), -1 otherwise. */
int tabcmp(const int *tab1, const int *tab2, int n)
{
    while (n-- > 0) {
        if (*tab1++ != *tab2++)
            return -1;
    }
    return 0;
}
1338 
/* Print "<str>=" followed by the 'len' values of 'buf' in hexadecimal
   (at least 5 digits each) and a newline on stdout. */
void dump_str(const char *str, const int *buf, int len)
{
    const int *p = buf;

    printf("%s=", str);
    while (len-- > 0)
        printf(" %05x", *p++);
    printf("\n");
}
1347 
compute_internal_props(void)1348 void compute_internal_props(void)
1349 {
1350     int i;
1351     BOOL has_ul;
1352 
1353     for(i = 0; i <= CHARCODE_MAX; i++) {
1354         CCInfo *ci = &unicode_db[i];
1355         has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0);
1356         if (has_ul) {
1357             assert(get_prop(i, PROP_Cased));
1358         } else {
1359             set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased));
1360         }
1361         set_prop(i, PROP_ID_Continue1,
1362                  get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1));
1363         set_prop(i, PROP_XID_Start1,
1364                  get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start));
1365         set_prop(i, PROP_XID_Continue1,
1366                  get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue));
1367         set_prop(i, PROP_Changes_When_Titlecased1,
1368                  get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
1369         set_prop(i, PROP_Changes_When_Casefolded1,
1370                  get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_code != 0));
1371         /* XXX: reduce table size (438 bytes) */
1372         set_prop(i, PROP_Changes_When_NFKC_Casefolded1,
1373                  get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_code != 0));
1374 #if 0
1375         /* TEST */
1376 #define M(x) (1U << GCAT_ ## x)
1377         {
1378             int b;
1379             b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >>
1380                  unicode_db[i].general_category) & 1;
1381             set_prop(i, PROP_Cased1,
1382                      get_prop(i, PROP_Case_Ignorable) ^ b);
1383         }
1384 #undef M
1385 #endif
1386     }
1387 }
1388 
/* Write the 'len' bytes of 'tab' to 'f' as the C definition of a
   static const uint8_t array named 'cname', 8 values per line. */
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
{
    int pos;

    fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
    pos = 0;
    while (pos < len) {
        /* start a new line every 8 values */
        if ((pos % 8) == 0)
            fputs("\n   ", f);
        fprintf(f, " 0x%02x,", tab[pos]);
        pos++;
    }
    fputs("\n};\n\n", f);
}
1400 
1401 #define PROP_BLOCK_LEN 32
1402 
build_prop_table(FILE * f,int prop_index,BOOL add_index)1403 void build_prop_table(FILE *f, int prop_index, BOOL add_index)
1404 {
1405     int i, j, n, v, offset, code;
1406     DynBuf dbuf_s, *dbuf = &dbuf_s;
1407     DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
1408     DynBuf dbuf2_s, *dbuf2 = &dbuf2_s;
1409     const uint32_t *buf;
1410     int buf_len, block_end_pos, bit;
1411     char cname[128];
1412 
1413     dbuf_init(dbuf1);
1414 
1415     for(i = 0; i <= CHARCODE_MAX;) {
1416         v = get_prop(i, prop_index);
1417         j = i + 1;
1418         while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) {
1419             j++;
1420         }
1421         n = j - i;
1422         if (j == (CHARCODE_MAX + 1) && v == 0)
1423             break; /* no need to encode last zero run */
1424         //printf("%05x: %d %d\n", i, n, v);
1425         dbuf_put_u32(dbuf1, n - 1);
1426         i += n;
1427     }
1428 
1429     dbuf_init(dbuf);
1430     dbuf_init(dbuf2);
1431     buf = (uint32_t *)dbuf1->buf;
1432     buf_len = dbuf1->size / sizeof(buf[0]);
1433 
1434     /* the first value is assumed to be 0 */
1435     assert(get_prop(0, prop_index) == 0);
1436 
1437     block_end_pos = PROP_BLOCK_LEN;
1438     i = 0;
1439     code = 0;
1440     bit = 0;
1441     while (i < buf_len) {
1442         if (add_index && dbuf->size >= block_end_pos && bit == 0) {
1443             offset = (dbuf->size - block_end_pos);
1444             /* XXX: offset could be larger in case of runs of small
1445                lengths. Could add code to change the encoding to
1446                prevent it at the expense of one byte loss */
1447             assert(offset <= 7);
1448             v = code | (offset << 21);
1449             dbuf_putc(dbuf2, v);
1450             dbuf_putc(dbuf2, v >> 8);
1451             dbuf_putc(dbuf2, v >> 16);
1452             block_end_pos += PROP_BLOCK_LEN;
1453         }
1454 
1455         v = buf[i];
1456         code += v + 1;
1457         bit ^= 1;
1458         if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) {
1459             code += buf[i + 1] + 1;
1460             bit ^= 1;
1461             dbuf_putc(dbuf, (v << 3) | buf[i + 1]);
1462             i += 2;
1463         } else if (v < 128) {
1464             dbuf_putc(dbuf, 0x80 + v);
1465             i++;
1466         } else if (v < (1 << 13)) {
1467             dbuf_putc(dbuf, 0x40 + (v >> 8));
1468             dbuf_putc(dbuf, v);
1469             i++;
1470         } else {
1471             assert(v < (1 << 21));
1472             dbuf_putc(dbuf, 0x60 + (v >> 16));
1473             dbuf_putc(dbuf, v >> 8);
1474             dbuf_putc(dbuf, v);
1475             i++;
1476         }
1477     }
1478 
1479     if (add_index) {
1480         /* last index entry */
1481         v = code;
1482         dbuf_putc(dbuf2, v);
1483         dbuf_putc(dbuf2, v >> 8);
1484         dbuf_putc(dbuf2, v >> 16);
1485     }
1486 
1487 #ifdef DUMP_TABLE_SIZE
1488     printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index],
1489            (int)(dbuf->size + dbuf2->size));
1490 #endif
1491     snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]);
1492     dump_byte_table(f, cname, dbuf->buf, dbuf->size);
1493     if (add_index) {
1494         snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
1495         dump_byte_table(f, cname, dbuf2->buf, dbuf2->size);
1496     }
1497 
1498     dbuf_free(dbuf);
1499     dbuf_free(dbuf1);
1500     dbuf_free(dbuf2);
1501 }
1502 
build_flags_tables(FILE * f)1503 void build_flags_tables(FILE *f)
1504 {
1505     build_prop_table(f, PROP_Cased1, TRUE);
1506     build_prop_table(f, PROP_Case_Ignorable, TRUE);
1507     build_prop_table(f, PROP_ID_Start, TRUE);
1508     build_prop_table(f, PROP_ID_Continue1, TRUE);
1509 }
1510 
/*
 * Write to 'f' the C definition of a static const char array named
 * 'cname' holding the 'len' names of 'tab_name'. Each name is emitted
 * as a string literal followed by "\0"; when the matching entry of
 * 'tab_short_name' is not empty, it is appended to the name after a
 * comma. The "\0" literals are aligned in a column.
 */
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
                     const char **tab_short_name)
{
    int idx, width, widest;
    const char *short_name;

    /* width of the longest "name[,short_name]" */
    widest = 0;
    for(idx = 0; idx < len; idx++) {
        short_name = tab_short_name[idx];
        width = strlen(tab_name[idx]);
        if (*short_name != '\0')
            width += 1 + strlen(short_name);
        if (width > widest)
            widest = width;
    }

    /* generate a sequence of strings terminated by an empty string */
    fprintf(f, "static const char %s[] =\n", cname);
    for(idx = 0; idx < len; idx++) {
        short_name = tab_short_name[idx];
        fprintf(f, "    \"");
        width = fprintf(f, "%s", tab_name[idx]);
        if (*short_name != '\0')
            width += fprintf(f, ",%s", short_name);
        /* pad so that the terminators are aligned */
        fprintf(f, "\"%*s\"\\0\"\n", 1 + widest - width, "");
    }
    fprintf(f, ";\n\n");
}
1538 
build_general_category_table(FILE * f)1539 void build_general_category_table(FILE *f)
1540 {
1541     int i, v, j, n, n1;
1542     DynBuf dbuf_s, *dbuf = &dbuf_s;
1543     int cw_count, cw_len_count[4], cw_start;
1544 
1545     fprintf(f, "typedef enum {\n");
1546     for(i = 0; i < GCAT_COUNT; i++)
1547         fprintf(f, "    UNICODE_GC_%s,\n", unicode_gc_name[i]);
1548     fprintf(f, "    UNICODE_GC_COUNT,\n");
1549     fprintf(f, "} UnicodeGCEnum;\n\n");
1550 
1551     dump_name_table(f, "unicode_gc_name_table",
1552                     unicode_gc_name, GCAT_COUNT,
1553                     unicode_gc_short_name);
1554 
1555 
1556     dbuf_init(dbuf);
1557     cw_count = 0;
1558     for(i = 0; i < 4; i++)
1559         cw_len_count[i] = 0;
1560     for(i = 0; i <= CHARCODE_MAX;) {
1561         v = unicode_db[i].general_category;
1562         j = i + 1;
1563         while (j <= CHARCODE_MAX && unicode_db[j].general_category == v)
1564             j++;
1565         n = j - i;
1566         /* compress Lu/Ll runs */
1567         if (v == GCAT_Lu) {
1568             n1 = 1;
1569             while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) {
1570                 n1++;
1571             }
1572             if (n1 > n) {
1573                 v = 31;
1574                 n = n1;
1575             }
1576         }
1577         //        printf("%05x %05x %d\n", i, n, v);
1578         cw_count++;
1579         n--;
1580         cw_start = dbuf->size;
1581         if (n < 7) {
1582             dbuf_putc(dbuf, (n << 5) | v);
1583         } else if (n < 7 + 128) {
1584             n1 = n - 7;
1585             assert(n1 < 128);
1586             dbuf_putc(dbuf, (0xf << 5) | v);
1587             dbuf_putc(dbuf, n1);
1588         } else if (n < 7 + 128 + (1 << 14)) {
1589             n1 = n - (7 + 128);
1590             assert(n1 < (1 << 14));
1591             dbuf_putc(dbuf, (0xf << 5) | v);
1592             dbuf_putc(dbuf, (n1 >> 8) + 128);
1593             dbuf_putc(dbuf, n1);
1594         } else {
1595             n1 = n - (7 + 128 + (1 << 14));
1596             assert(n1 < (1 << 22));
1597             dbuf_putc(dbuf, (0xf << 5) | v);
1598             dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
1599             dbuf_putc(dbuf, n1 >> 8);
1600             dbuf_putc(dbuf, n1);
1601         }
1602         cw_len_count[dbuf->size - cw_start - 1]++;
1603         i += n + 1;
1604     }
1605 #ifdef DUMP_TABLE_SIZE
1606     printf("general category: %d entries [",
1607            cw_count);
1608     for(i = 0; i < 4; i++)
1609         printf(" %d", cw_len_count[i]);
1610     printf(" ], length=%d bytes\n", (int)dbuf->size);
1611 #endif
1612 
1613     dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size);
1614 
1615     dbuf_free(dbuf);
1616 }
1617 
build_script_table(FILE * f)1618 void build_script_table(FILE *f)
1619 {
1620     int i, v, j, n, n1, type;
1621     DynBuf dbuf_s, *dbuf = &dbuf_s;
1622     int cw_count, cw_len_count[4], cw_start;
1623 
1624     fprintf(f, "typedef enum {\n");
1625     for(i = 0; i < SCRIPT_COUNT; i++)
1626         fprintf(f, "    UNICODE_SCRIPT_%s,\n", unicode_script_name[i]);
1627     fprintf(f, "    UNICODE_SCRIPT_COUNT,\n");
1628     fprintf(f, "} UnicodeScriptEnum;\n\n");
1629 
1630     i = 1;
1631     dump_name_table(f, "unicode_script_name_table",
1632                     unicode_script_name + i, SCRIPT_COUNT - i,
1633                     unicode_script_short_name + i);
1634 
1635     dbuf_init(dbuf);
1636     cw_count = 0;
1637     for(i = 0; i < 4; i++)
1638         cw_len_count[i] = 0;
1639     for(i = 0; i <= CHARCODE_MAX;) {
1640         v = unicode_db[i].script;
1641         j = i + 1;
1642         while (j <= CHARCODE_MAX && unicode_db[j].script == v)
1643             j++;
1644         n = j - i;
1645         if (v == 0 && j == (CHARCODE_MAX + 1))
1646             break;
1647         //        printf("%05x %05x %d\n", i, n, v);
1648         cw_count++;
1649         n--;
1650         cw_start = dbuf->size;
1651         if (v == 0)
1652             type = 0;
1653         else
1654             type = 1;
1655         if (n < 96) {
1656             dbuf_putc(dbuf, n | (type << 7));
1657         } else if (n < 96 + (1 << 12)) {
1658             n1 = n - 96;
1659             assert(n1 < (1 << 12));
1660             dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7));
1661             dbuf_putc(dbuf, n1);
1662         } else {
1663             n1 = n - (96 + (1 << 12));
1664             assert(n1 < (1 << 20));
1665             dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7));
1666             dbuf_putc(dbuf, n1 >> 8);
1667             dbuf_putc(dbuf, n1);
1668         }
1669         if (type != 0)
1670             dbuf_putc(dbuf, v);
1671 
1672         cw_len_count[dbuf->size - cw_start - 1]++;
1673         i += n + 1;
1674     }
1675 #if defined(DUMP_TABLE_SIZE)
1676     printf("script: %d entries [",
1677            cw_count);
1678     for(i = 0; i < 4; i++)
1679         printf(" %d", cw_len_count[i]);
1680     printf(" ], length=%d bytes\n", (int)dbuf->size);
1681 #endif
1682 
1683     dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size);
1684 
1685     dbuf_free(dbuf);
1686 }
1687 
build_script_ext_table(FILE * f)1688 void build_script_ext_table(FILE *f)
1689 {
1690     int i, j, n, n1, script_ext_len;
1691     DynBuf dbuf_s, *dbuf = &dbuf_s;
1692     int cw_count;
1693 
1694     dbuf_init(dbuf);
1695     cw_count = 0;
1696     for(i = 0; i <= CHARCODE_MAX;) {
1697         script_ext_len = unicode_db[i].script_ext_len;
1698         j = i + 1;
1699         while (j <= CHARCODE_MAX &&
1700                unicode_db[j].script_ext_len == script_ext_len &&
1701                !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext,
1702                        script_ext_len)) {
1703             j++;
1704         }
1705         n = j - i;
1706         cw_count++;
1707         n--;
1708         if (n < 128) {
1709             dbuf_putc(dbuf, n);
1710         } else if (n < 128 + (1 << 14)) {
1711             n1 = n - 128;
1712             assert(n1 < (1 << 14));
1713             dbuf_putc(dbuf, (n1 >> 8) + 128);
1714             dbuf_putc(dbuf, n1);
1715         } else {
1716             n1 = n - (128 + (1 << 14));
1717             assert(n1 < (1 << 22));
1718             dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
1719             dbuf_putc(dbuf, n1 >> 8);
1720             dbuf_putc(dbuf, n1);
1721         }
1722         dbuf_putc(dbuf, script_ext_len);
1723         for(j = 0; j < script_ext_len; j++)
1724             dbuf_putc(dbuf, unicode_db[i].script_ext[j]);
1725         i += n + 1;
1726     }
1727 #ifdef DUMP_TABLE_SIZE
1728     printf("script_ext: %d entries",
1729            cw_count);
1730     printf(", length=%d bytes\n", (int)dbuf->size);
1731 #endif
1732 
1733     dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size);
1734 
1735     dbuf_free(dbuf);
1736 }
1737 
1738 /* the following properties are synthetized so no table is necessary */
1739 #define PROP_TABLE_COUNT PROP_ASCII
1740 
build_prop_list_table(FILE * f)1741 void build_prop_list_table(FILE *f)
1742 {
1743     int i;
1744 
1745     for(i = 0; i < PROP_TABLE_COUNT; i++) {
1746         if (i == PROP_ID_Start ||
1747             i == PROP_Case_Ignorable ||
1748             i == PROP_ID_Continue1) {
1749             /* already generated */
1750         } else {
1751             build_prop_table(f, i, FALSE);
1752         }
1753     }
1754 
1755     fprintf(f, "typedef enum {\n");
1756     for(i = 0; i < PROP_COUNT; i++)
1757         fprintf(f, "    UNICODE_PROP_%s,\n", unicode_prop_name[i]);
1758     fprintf(f, "    UNICODE_PROP_COUNT,\n");
1759     fprintf(f, "} UnicodePropertyEnum;\n\n");
1760 
1761     i = PROP_ASCII_Hex_Digit;
1762     dump_name_table(f, "unicode_prop_name_table",
1763                     unicode_prop_name + i, PROP_XID_Start - i + 1,
1764                     unicode_prop_short_name + i);
1765 
1766     fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n");
1767     for(i = 0; i < PROP_TABLE_COUNT; i++) {
1768         fprintf(f, "    unicode_prop_%s_table,\n", unicode_prop_name[i]);
1769     }
1770     fprintf(f, "};\n\n");
1771 
1772     fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n");
1773     for(i = 0; i < PROP_TABLE_COUNT; i++) {
1774         fprintf(f, "    countof(unicode_prop_%s_table),\n", unicode_prop_name[i]);
1775     }
1776     fprintf(f, "};\n\n");
1777 }
1778 
1779 #ifdef USE_TEST
/* Thin wrapper around lre_case_conv(): convert 'c' with the
   conversion 'conv_type', store the result in 'res' and return the
   value returned by lre_case_conv(). */
int check_conv(uint32_t *res, uint32_t c, int conv_type)
{
    int len;

    len = lre_case_conv(res, c, conv_type);
    return len;
}
1784 
check_case_conv(void)1785 void check_case_conv(void)
1786 {
1787     CCInfo *tab = unicode_db;
1788     uint32_t res[3];
1789     int l, error;
1790     CCInfo ci_s, *ci1, *ci = &ci_s;
1791     int code;
1792 
1793     for(code = 0; code <= CHARCODE_MAX; code++) {
1794         ci1 = &tab[code];
1795         *ci = *ci1;
1796         if (ci->l_len == 0) {
1797             ci->l_len = 1;
1798             ci->l_data[0] = code;
1799         }
1800         if (ci->u_len == 0) {
1801             ci->u_len = 1;
1802             ci->u_data[0] = code;
1803         }
1804         if (ci->f_code == 0)
1805             ci->f_code = code;
1806 
1807         error = 0;
1808         l = check_conv(res, code, 0);
1809         if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) {
1810             printf("ERROR: L\n");
1811             error++;
1812         }
1813         l = check_conv(res, code, 1);
1814         if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) {
1815             printf("ERROR: U\n");
1816             error++;
1817         }
1818         l = check_conv(res, code, 2);
1819         if (l != 1 || res[0] != ci->f_code) {
1820             printf("ERROR: F\n");
1821             error++;
1822         }
1823         if (error) {
1824             dump_cc_info(ci, code);
1825             exit(1);
1826         }
1827     }
1828 }
1829 
1830 #ifdef PROFILE
get_time_ns(void)1831 static int64_t get_time_ns(void)
1832 {
1833     struct timespec ts;
1834     clock_gettime(CLOCK_MONOTONIC, &ts);
1835     return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
1836 }
1837 #endif
1838 
1839 
check_flags(void)1840 void check_flags(void)
1841 {
1842     int c;
1843     BOOL flag_ref, flag;
1844     for(c = 0; c <= CHARCODE_MAX; c++) {
1845         flag_ref = get_prop(c, PROP_Cased);
1846         flag = lre_is_cased(c);
1847         if (flag != flag_ref) {
1848             printf("ERROR: c=%05x cased=%d ref=%d\n",
1849                    c, flag, flag_ref);
1850             exit(1);
1851         }
1852 
1853         flag_ref = get_prop(c, PROP_Case_Ignorable);
1854         flag = lre_is_case_ignorable(c);
1855         if (flag != flag_ref) {
1856             printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
1857                    c, flag, flag_ref);
1858             exit(1);
1859         }
1860 
1861         flag_ref = get_prop(c, PROP_ID_Start);
1862         flag = lre_is_id_start(c);
1863         if (flag != flag_ref) {
1864             printf("ERROR: c=%05x id_start=%d ref=%d\n",
1865                    c, flag, flag_ref);
1866             exit(1);
1867         }
1868 
1869         flag_ref = get_prop(c, PROP_ID_Continue);
1870         flag = lre_is_id_continue(c);
1871         if (flag != flag_ref) {
1872             printf("ERROR: c=%05x id_cont=%d ref=%d\n",
1873                    c, flag, flag_ref);
1874             exit(1);
1875         }
1876     }
1877 #ifdef PROFILE
1878     {
1879         int64_t ti, count;
1880         ti = get_time_ns();
1881         count = 0;
1882         for(c = 0x20; c <= 0xffff; c++) {
1883             flag_ref = get_prop(c, PROP_ID_Start);
1884             flag = lre_is_id_start(c);
1885             assert(flag == flag_ref);
1886             count++;
1887         }
1888         ti = get_time_ns() - ti;
1889         printf("flags time=%0.1f ns/char\n",
1890                (double)ti / count);
1891     }
1892 #endif
1893 }
1894 
1895 #endif
1896 
1897 #define CC_BLOCK_LEN 32
1898 
build_cc_table(FILE * f)1899 void build_cc_table(FILE *f)
1900 {
1901     int i, cc, n, cc_table_len, type, n1;
1902     DynBuf dbuf_s, *dbuf = &dbuf_s;
1903     DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
1904     int cw_len_tab[3], cw_start, block_end_pos;
1905     uint32_t v;
1906 
1907     dbuf_init(dbuf);
1908     dbuf_init(dbuf1);
1909     cc_table_len = 0;
1910     for(i = 0; i < countof(cw_len_tab); i++)
1911         cw_len_tab[i] = 0;
1912     block_end_pos = CC_BLOCK_LEN;
1913     for(i = 0; i <= CHARCODE_MAX;) {
1914         cc = unicode_db[i].combining_class;
1915         assert(cc <= 255);
1916         /* check increasing values */
1917         n = 1;
1918         while ((i + n) <= CHARCODE_MAX &&
1919                unicode_db[i + n].combining_class == (cc + n))
1920             n++;
1921         if (n >= 2) {
1922             type = 1;
1923         } else {
1924             type = 0;
1925             n = 1;
1926             while ((i + n) <= CHARCODE_MAX &&
1927                    unicode_db[i + n].combining_class == cc)
1928                 n++;
1929         }
1930         /* no need to encode the last run */
1931         if (cc == 0 && (i + n - 1) == CHARCODE_MAX)
1932             break;
1933 #ifdef DUMP_CC_TABLE
1934         printf("%05x %6d %d %d\n", i, n, type, cc);
1935 #endif
1936         if (type == 0) {
1937             if (cc == 0)
1938                 type = 2;
1939             else if (cc == 230)
1940                 type = 3;
1941         }
1942         n1 = n - 1;
1943 
1944         /* add an entry to the index if necessary */
1945         if (dbuf->size >= block_end_pos) {
1946             v = i | ((dbuf->size - block_end_pos) << 21);
1947             dbuf_putc(dbuf1, v);
1948             dbuf_putc(dbuf1, v >> 8);
1949             dbuf_putc(dbuf1, v >> 16);
1950             block_end_pos += CC_BLOCK_LEN;
1951         }
1952         cw_start = dbuf->size;
1953         if (n1 < 48) {
1954             dbuf_putc(dbuf, n1 | (type << 6));
1955         } else if (n1 < 48 + (1 << 11)) {
1956             n1 -= 48;
1957             dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6));
1958             dbuf_putc(dbuf, n1);
1959         } else {
1960             n1 -= 48 + (1 << 11);
1961             assert(n1 < (1 << 20));
1962             dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6));
1963             dbuf_putc(dbuf, n1 >> 8);
1964             dbuf_putc(dbuf, n1);
1965         }
1966         cw_len_tab[dbuf->size - cw_start - 1]++;
1967         if (type == 0 || type == 1)
1968             dbuf_putc(dbuf, cc);
1969         cc_table_len++;
1970         i += n;
1971     }
1972 
1973     /* last index entry */
1974     v = i;
1975     dbuf_putc(dbuf1, v);
1976     dbuf_putc(dbuf1, v >> 8);
1977     dbuf_putc(dbuf1, v >> 16);
1978 
1979     dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
1980     dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
1981 
1982 #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
1983     printf("CC table: size=%d (%d entries) [",
1984            (int)(dbuf->size + dbuf1->size),
1985            cc_table_len);
1986     for(i = 0; i < countof(cw_len_tab); i++)
1987         printf(" %d", cw_len_tab[i]);
1988     printf(" ]\n");
1989 #endif
1990     dbuf_free(dbuf);
1991     dbuf_free(dbuf1);
1992 }
1993 
1994 /* maximum length of decomposition: 18 chars (1), then 8 */
#ifndef USE_TEST
/* Encodings available for one run of the decomposition table.
   The numeric values are written to the generated table and the code
   in this file relies on the declaration order (range tests with '<='
   and 'first type of a family + offset' arithmetic), so the
   enumerators must stay in this order. */
typedef enum {
    DECOMP_TYPE_C1 = 0, /* 16 bit char */

    /* 16 bit char table, one type per decomposition length */
    DECOMP_TYPE_L1,
    DECOMP_TYPE_L2,
    DECOMP_TYPE_L3,
    DECOMP_TYPE_L4,
    DECOMP_TYPE_L5, /* XXX: not used */
    DECOMP_TYPE_L6, /* XXX: could remove */
    DECOMP_TYPE_L7, /* XXX: could remove */

    /* 18 bit char table */
    DECOMP_TYPE_LL1,
    DECOMP_TYPE_LL2,

    /* 8 bit char table */
    DECOMP_TYPE_S1,
    DECOMP_TYPE_S2,
    DECOMP_TYPE_S3,
    DECOMP_TYPE_S4,
    DECOMP_TYPE_S5,

    /* increment 16 bit char value */
    DECOMP_TYPE_I1,
    DECOMP_TYPE_I2_0,
    DECOMP_TYPE_I2_1,
    DECOMP_TYPE_I3_1,
    DECOMP_TYPE_I3_2,
    DECOMP_TYPE_I4_1,
    DECOMP_TYPE_I4_2,

    /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B1,
    DECOMP_TYPE_B2,
    DECOMP_TYPE_B3,
    DECOMP_TYPE_B4,
    DECOMP_TYPE_B5,
    DECOMP_TYPE_B6,
    DECOMP_TYPE_B7,
    DECOMP_TYPE_B8,
    DECOMP_TYPE_B18,

    DECOMP_TYPE_LS2,    /* 16 bit char followed by a short code */
    DECOMP_TYPE_PAT3,   /* shared first and last char, 16 bit middle char */
    DECOMP_TYPE_S2_UL,  /* upper/lower case pairs, two short codes */
    DECOMP_TYPE_LS2_UL, /* upper/lower case pairs, 16 bit char + short code */
} DecompTypeEnum;
#endif
2034 
/* Printable name of each decomposition type, indexed by the
   DECOMP_TYPE_x value (one row per encoding family). */
const char *decomp_type_str[] = {
    "C1",
    "L1", "L2", "L3", "L4", "L5", "L6", "L7",
    "LL1", "LL2",
    "S1", "S2", "S3", "S4", "S5",
    "I1", "I2_0", "I2_1", "I3_1", "I3_2", "I4_1", "I4_2",
    "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B18",
    "LS2",
    "PAT3",
    "S2_UL",
    "LS2_UL",
};
2072 
2073 const int decomp_incr_tab[4][4] = {
2074     { DECOMP_TYPE_I1, 0, -1 },
2075     { DECOMP_TYPE_I2_0, 0, 1, -1 },
2076     { DECOMP_TYPE_I3_1, 1, 2, -1 },
2077     { DECOMP_TYPE_I4_1, 1, 2, -1 },
2078 };
2079 
/*
  entry size:
  field  bits
  code   18
  len    7
  type   6
  compat 1
  index  16
  total  48
*/
2090 
/* Best encoding found for the run of code points starting at 'code'.
   An entry whose 'len' is 0 does not start a run. */
typedef struct {
    /* first code point of the run */
    int code;
    /* number of consecutive code points covered by the run */
    uint8_t len;
    /* encoding of the run (DECOMP_TYPE_x value) */
    uint8_t type;
    /* decomposition length shared by the entries of the run */
    uint8_t c_len;
    /* base character, only meaningful for the DECOMP_TYPE_Bx encodings */
    uint16_t c_min;
    /* offset of the run data in the data buffer, or the decomposed
       character itself for DECOMP_TYPE_C1 */
    uint16_t data_index;
    /* size in bytes from this entry to the end */
    int cost;
} DecompEntry;
2100 
get_decomp_run_size(const DecompEntry * de)2101 int get_decomp_run_size(const DecompEntry *de)
2102 {
2103     int s;
2104     s = 6;
2105     if (de->type <= DECOMP_TYPE_C1) {
2106         /* nothing more */
2107     } else if (de->type <= DECOMP_TYPE_L7) {
2108         s += de->len * de->c_len * 2;
2109     } else if (de->type <= DECOMP_TYPE_LL2) {
2110         /* 18 bits per char */
2111         s += (de->len * de->c_len * 18 + 7) / 8;
2112     } else if (de->type <= DECOMP_TYPE_S5) {
2113         s += de->len * de->c_len;
2114     } else if (de->type <= DECOMP_TYPE_I4_2) {
2115         s += de->c_len * 2;
2116     } else if (de->type <= DECOMP_TYPE_B18) {
2117         s += 2 + de->len * de->c_len;
2118     } else if (de->type <= DECOMP_TYPE_LS2) {
2119         s += de->len * 3;
2120     } else if (de->type <= DECOMP_TYPE_PAT3) {
2121         s += 4 + de->len * 2;
2122     } else if (de->type <= DECOMP_TYPE_S2_UL) {
2123         s += de->len;
2124     } else if (de->type <= DECOMP_TYPE_LS2_UL) {
2125         s += (de->len / 2) * 3;
2126     } else {
2127         abort();
2128     }
2129     return s;
2130 }
2131 
/* characters outside the two contiguous ranges that also have a short code */
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

/* Map 'c' to its short (8 bit) code:
     c < 0x80                 -> c
     0x300 <= c < 0x350       -> 0x80 + (c - 0x300)
     unicode_short_table[k]   -> 0xd0 + k
   return -1 if not found */
int get_short_code(int c)
{
    size_t k;

    if (c < 0x80)
        return c;
    if (c >= 0x300 && c < 0x350)
        return c - 0x300 + 0x80;
    for(k = 0; k < sizeof(unicode_short_table) / sizeof(unicode_short_table[0]); k++) {
        if (c == unicode_short_table[k])
            return (int)k + 0x80 + 0x50;
    }
    return -1;
}
2150 
is_short(int code)2151 static BOOL is_short(int code)
2152 {
2153     return get_short_code(code) >= 0;
2154 }
2155 
is_short_tab(const int * tab,int len)2156 static BOOL is_short_tab(const int *tab, int len)
2157 {
2158     int i;
2159     for(i = 0; i < len; i++) {
2160         if (!is_short(tab[i]))
2161             return FALSE;
2162     }
2163     return TRUE;
2164 }
2165 
is_16bit(const int * tab,int len)2166 static BOOL is_16bit(const int *tab, int len)
2167 {
2168     int i;
2169     for(i = 0; i < len; i++) {
2170         if (tab[i] > 0xffff)
2171             return FALSE;
2172     }
2173     return TRUE;
2174 }
2175 
/* Simplified upper to lower case mapping: +0x20 for Latin1 (c < 0x100)
   and Cyrillic (0x410..0x42f), +1 for every other character. */
static uint32_t to_lower_simple(uint32_t c)
{
    const int is_latin1 = c < 0x100;
    const int is_cyrillic = c >= 0x410 && c <= 0x42f;
    const uint32_t delta = (is_latin1 || is_cyrillic) ? 0x20 : 1;

    return c + delta;
}
2185 
2186 /* select best encoding with dynamic programming */
find_decomp_run(DecompEntry * tab_de,int i)2187 void find_decomp_run(DecompEntry *tab_de, int i)
2188 {
2189     DecompEntry de_s, *de = &de_s;
2190     CCInfo *ci, *ci1, *ci2;
2191     int l, j, n, len_max;
2192 
2193     ci = &unicode_db[i];
2194     l = ci->decomp_len;
2195     if (l == 0) {
2196         tab_de[i].cost = tab_de[i + 1].cost;
2197         return;
2198     }
2199 
2200     /* the offset for the compose table has only 6 bits, so we must
2201        limit if it can be used by the compose table */
2202     if (!ci->is_compat && !ci->is_excluded && l == 2)
2203         len_max = 64;
2204     else
2205         len_max = 127;
2206 
2207     tab_de[i].cost = 0x7fffffff;
2208 
2209     if (!is_16bit(ci->decomp_data, l)) {
2210         assert(l <= 2);
2211 
2212         n = 1;
2213         for(;;) {
2214             de->code = i;
2215             de->len = n;
2216             de->type = DECOMP_TYPE_LL1 + l - 1;
2217             de->c_len = l;
2218             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2219             if (de->cost < tab_de[i].cost) {
2220                 tab_de[i] = *de;
2221             }
2222             if (!((i + n) <= CHARCODE_MAX && n < len_max))
2223                 break;
2224             ci1 = &unicode_db[i + n];
2225             /* Note: we accept a hole */
2226             if (!(ci1->decomp_len == 0 ||
2227                   (ci1->decomp_len == l &&
2228                    ci1->is_compat == ci->is_compat)))
2229                 break;
2230             n++;
2231         }
2232         return;
2233     }
2234 
2235     if (l <= 7) {
2236         n = 1;
2237         for(;;) {
2238             de->code = i;
2239             de->len = n;
2240             if (l == 1 && n == 1) {
2241                 de->type = DECOMP_TYPE_C1;
2242             } else {
2243                 assert(l <= 8);
2244                 de->type = DECOMP_TYPE_L1 + l - 1;
2245             }
2246             de->c_len = l;
2247             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2248             if (de->cost < tab_de[i].cost) {
2249                 tab_de[i] = *de;
2250             }
2251 
2252             if (!((i + n) <= CHARCODE_MAX && n < len_max))
2253                 break;
2254             ci1 = &unicode_db[i + n];
2255             /* Note: we accept a hole */
2256             if (!(ci1->decomp_len == 0 ||
2257                   (ci1->decomp_len == l &&
2258                    ci1->is_compat == ci->is_compat &&
2259                    is_16bit(ci1->decomp_data, l))))
2260                 break;
2261             n++;
2262         }
2263     }
2264 
2265     if (l <= 8 || l == 18) {
2266         int c_min, c_max, c;
2267         c_min = c_max = -1;
2268         n = 1;
2269         for(;;) {
2270             ci1 = &unicode_db[i + n - 1];
2271             for(j = 0; j < l; j++) {
2272                 c = ci1->decomp_data[j];
2273                 if (c == 0x20) {
2274                     /* we accept space for Arabic */
2275                 } else if (c_min == -1) {
2276                     c_min = c_max = c;
2277                 } else {
2278                     c_min = min_int(c_min, c);
2279                     c_max = max_int(c_max, c);
2280                 }
2281             }
2282             if ((c_max - c_min) > 254)
2283                 break;
2284             de->code = i;
2285             de->len = n;
2286             if (l == 18)
2287                 de->type = DECOMP_TYPE_B18;
2288             else
2289                 de->type = DECOMP_TYPE_B1 + l - 1;
2290             de->c_len = l;
2291             de->c_min = c_min;
2292             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2293             if (de->cost < tab_de[i].cost) {
2294                 tab_de[i] = *de;
2295             }
2296             if (!((i + n) <= CHARCODE_MAX && n < len_max))
2297                 break;
2298             ci1 = &unicode_db[i + n];
2299             if (!(ci1->decomp_len == l &&
2300                   ci1->is_compat == ci->is_compat))
2301                 break;
2302             n++;
2303         }
2304     }
2305 
2306     /* find an ascii run */
2307     if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
2308         n = 1;
2309         for(;;) {
2310             de->code = i;
2311             de->len = n;
2312             de->type = DECOMP_TYPE_S1 + l - 1;
2313             de->c_len = l;
2314             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2315             if (de->cost < tab_de[i].cost) {
2316                 tab_de[i] = *de;
2317             }
2318 
2319             if (!((i + n) <= CHARCODE_MAX && n < len_max))
2320                 break;
2321             ci1 = &unicode_db[i + n];
2322             /* Note: we accept a hole */
2323             if (!(ci1->decomp_len == 0 ||
2324                   (ci1->decomp_len == l &&
2325                    ci1->is_compat == ci->is_compat &&
2326                    is_short_tab(ci1->decomp_data, l))))
2327                 break;
2328             n++;
2329         }
2330     }
2331 
2332     /* check if a single char is increasing */
2333     if (l <= 4) {
2334         int idx1, idx;
2335 
2336         for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) {
2337             n = 1;
2338             for(;;) {
2339                 de->code = i;
2340                 de->len = n;
2341                 de->type = decomp_incr_tab[l - 1][0] + idx1 - 1;
2342                 de->c_len = l;
2343                 de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2344                 if (de->cost < tab_de[i].cost) {
2345                     tab_de[i] = *de;
2346                 }
2347 
2348                 if (!((i + n) <= CHARCODE_MAX && n < len_max))
2349                     break;
2350                 ci1 = &unicode_db[i + n];
2351                 if (!(ci1->decomp_len == l &&
2352                       ci1->is_compat == ci->is_compat))
2353                     goto next1;
2354                 for(j = 0; j < l; j++) {
2355                     if (j == idx) {
2356                         if (ci1->decomp_data[j] != ci->decomp_data[j] + n)
2357                             goto next1;
2358                     } else {
2359                         if (ci1->decomp_data[j] != ci->decomp_data[j])
2360                             goto next1;
2361                     }
2362                 }
2363                 n++;
2364             }
2365         next1: ;
2366         }
2367     }
2368 
2369     if (l == 3) {
2370         n = 1;
2371         for(;;) {
2372             de->code = i;
2373             de->len = n;
2374             de->type = DECOMP_TYPE_PAT3;
2375             de->c_len = l;
2376             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2377             if (de->cost < tab_de[i].cost) {
2378                 tab_de[i] = *de;
2379             }
2380             if (!((i + n) <= CHARCODE_MAX && n < len_max))
2381                 break;
2382             ci1 = &unicode_db[i + n];
2383             if (!(ci1->decomp_len == l &&
2384                   ci1->is_compat == ci->is_compat &&
2385                   ci1->decomp_data[1] <= 0xffff &&
2386                   ci1->decomp_data[0] == ci->decomp_data[0] &&
2387                   ci1->decomp_data[l - 1] == ci->decomp_data[l - 1]))
2388                 break;
2389             n++;
2390         }
2391     }
2392 
2393     if (l == 2 && is_short(ci->decomp_data[1])) {
2394         n = 1;
2395         for(;;) {
2396             de->code = i;
2397             de->len = n;
2398             de->type = DECOMP_TYPE_LS2;
2399             de->c_len = l;
2400             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2401             if (de->cost < tab_de[i].cost) {
2402                 tab_de[i] = *de;
2403             }
2404             if (!((i + n) <= CHARCODE_MAX && n < len_max))
2405                 break;
2406             ci1 = &unicode_db[i + n];
2407             if (!(ci1->decomp_len == 0 ||
2408                   (ci1->decomp_len == l &&
2409                    ci1->is_compat == ci->is_compat &&
2410                    ci1->decomp_data[0] <= 0xffff &&
2411                    is_short(ci1->decomp_data[1]))))
2412                 break;
2413             n++;
2414         }
2415     }
2416 
2417     if (l == 2) {
2418         BOOL is_16bit;
2419 
2420         n = 0;
2421         is_16bit = FALSE;
2422         for(;;) {
2423             if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max))
2424                 break;
2425             ci1 = &unicode_db[i + n];
2426             if (!(ci1->decomp_len == l &&
2427                   ci1->is_compat == ci->is_compat &&
2428                   is_short(ci1->decomp_data[1])))
2429                 break;
2430             if (!is_16bit && !is_short(ci1->decomp_data[0]))
2431                 is_16bit = TRUE;
2432             ci2 = &unicode_db[i + n + 1];
2433             if (!(ci2->decomp_len == l &&
2434                   ci2->is_compat == ci->is_compat &&
2435                   ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0])  &&
2436                   ci2->decomp_data[1] == ci1->decomp_data[1]))
2437                 break;
2438             n += 2;
2439             de->code = i;
2440             de->len = n;
2441             de->type = DECOMP_TYPE_S2_UL + is_16bit;
2442             de->c_len = l;
2443             de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2444             if (de->cost < tab_de[i].cost) {
2445                 tab_de[i] = *de;
2446             }
2447         }
2448     }
2449 }
2450 
/* Store the 16 bit value 'c' in data_buf at offset *pidx, low byte
   first, and advance *pidx by 2. */
void put16(uint8_t *data_buf, int *pidx, uint16_t c)
{
    uint8_t *dst = data_buf + *pidx;

    dst[0] = (uint8_t)(c & 0xff);
    dst[1] = (uint8_t)(c >> 8);
    *pidx += 2;
}
2459 
add_decomp_data(uint8_t * data_buf,int * pidx,DecompEntry * de)2460 void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
2461 {
2462     int i, j, idx, c;
2463     CCInfo *ci;
2464 
2465     idx = *pidx;
2466     de->data_index = idx;
2467     if (de->type <= DECOMP_TYPE_C1) {
2468         ci = &unicode_db[de->code];
2469         assert(ci->decomp_len == 1);
2470         de->data_index = ci->decomp_data[0];
2471     } else if (de->type <= DECOMP_TYPE_L7) {
2472         for(i = 0; i < de->len; i++) {
2473             ci = &unicode_db[de->code + i];
2474             for(j = 0; j < de->c_len; j++) {
2475                 if (ci->decomp_len == 0)
2476                     c = 0;
2477                 else
2478                     c = ci->decomp_data[j];
2479                 put16(data_buf, &idx,  c);
2480             }
2481         }
2482     } else if (de->type <= DECOMP_TYPE_LL2) {
2483         int n, p, k;
2484         n = (de->len * de->c_len * 18 + 7) / 8;
2485         p = de->len * de->c_len * 2;
2486         memset(data_buf + idx, 0, n);
2487         k = 0;
2488         for(i = 0; i < de->len; i++) {
2489             ci = &unicode_db[de->code + i];
2490             for(j = 0; j < de->c_len; j++) {
2491                 if (ci->decomp_len == 0)
2492                     c = 0;
2493                 else
2494                     c = ci->decomp_data[j];
2495                 data_buf[idx + k * 2] = c;
2496                 data_buf[idx + k * 2 + 1] = c >> 8;
2497                 data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2);
2498                 k++;
2499             }
2500         }
2501         idx += n;
2502     } else if (de->type <= DECOMP_TYPE_S5) {
2503         for(i = 0; i < de->len; i++) {
2504             ci = &unicode_db[de->code + i];
2505             for(j = 0; j < de->c_len; j++) {
2506                 if (ci->decomp_len == 0)
2507                     c = 0;
2508                 else
2509                     c = ci->decomp_data[j];
2510                 c = get_short_code(c);
2511                 assert(c >= 0);
2512                 data_buf[idx++] = c;
2513             }
2514         }
2515     } else if (de->type <= DECOMP_TYPE_I4_2) {
2516         ci = &unicode_db[de->code];
2517         assert(ci->decomp_len == de->c_len);
2518         for(j = 0; j < de->c_len; j++)
2519             put16(data_buf, &idx, ci->decomp_data[j]);
2520     } else if (de->type <= DECOMP_TYPE_B18) {
2521         c = de->c_min;
2522         data_buf[idx++] = c;
2523         data_buf[idx++] = c >> 8;
2524         for(i = 0; i < de->len; i++) {
2525             ci = &unicode_db[de->code + i];
2526             for(j = 0; j < de->c_len; j++) {
2527                 assert(ci->decomp_len == de->c_len);
2528                 c = ci->decomp_data[j];
2529                 if (c == 0x20) {
2530                     c = 0xff;
2531                 } else {
2532                     c -= de->c_min;
2533                     assert((uint32_t)c <= 254);
2534                 }
2535                 data_buf[idx++] = c;
2536             }
2537         }
2538     } else if (de->type <= DECOMP_TYPE_LS2) {
2539         assert(de->c_len == 2);
2540         for(i = 0; i < de->len; i++) {
2541             ci = &unicode_db[de->code + i];
2542             if (ci->decomp_len == 0)
2543                 c = 0;
2544             else
2545                 c = ci->decomp_data[0];
2546             put16(data_buf, &idx,  c);
2547 
2548             if (ci->decomp_len == 0)
2549                 c = 0;
2550             else
2551                 c = ci->decomp_data[1];
2552             c = get_short_code(c);
2553             assert(c >= 0);
2554             data_buf[idx++] = c;
2555         }
2556     } else if (de->type <= DECOMP_TYPE_PAT3) {
2557         ci = &unicode_db[de->code];
2558         assert(ci->decomp_len == 3);
2559         put16(data_buf, &idx,  ci->decomp_data[0]);
2560         put16(data_buf, &idx,  ci->decomp_data[2]);
2561         for(i = 0; i < de->len; i++) {
2562             ci = &unicode_db[de->code + i];
2563             assert(ci->decomp_len == 3);
2564             put16(data_buf, &idx,  ci->decomp_data[1]);
2565         }
2566     } else if (de->type <= DECOMP_TYPE_S2_UL) {
2567         for(i = 0; i < de->len; i += 2) {
2568             ci = &unicode_db[de->code + i];
2569             c = ci->decomp_data[0];
2570             c = get_short_code(c);
2571             assert(c >= 0);
2572             data_buf[idx++] = c;
2573             c = ci->decomp_data[1];
2574             c = get_short_code(c);
2575             assert(c >= 0);
2576             data_buf[idx++] = c;
2577         }
2578     } else if (de->type <= DECOMP_TYPE_LS2_UL) {
2579         for(i = 0; i < de->len; i += 2) {
2580             ci = &unicode_db[de->code + i];
2581             c = ci->decomp_data[0];
2582             put16(data_buf, &idx,  c);
2583             c = ci->decomp_data[1];
2584             c = get_short_code(c);
2585             assert(c >= 0);
2586             data_buf[idx++] = c;
2587         }
2588     } else {
2589         abort();
2590     }
2591     *pidx = idx;
2592 }
2593 
#if 0
/* debug helper (disabled): print every decomposition character above
   0xffff, one per line */
void dump_large_char(void)
{
    int code, k;
    CCInfo *ci;

    for(code = 0; code <= CHARCODE_MAX; code++) {
        ci = &unicode_db[code];
        for(k = 0; k < ci->decomp_len; k++) {
            if (ci->decomp_data[k] <= 0xffff)
                continue;
            printf("%05x\n", ci->decomp_data[k]);
        }
    }
}
#endif
2607 
2608 void build_compose_table(FILE *f, const DecompEntry *tab_de);
2609 
build_decompose_table(FILE * f)2610 void build_decompose_table(FILE *f)
2611 {
2612     int i, array_len, code_max, data_len, count;
2613     DecompEntry *tab_de, de_s, *de = &de_s;
2614     uint8_t *data_buf;
2615 
2616     code_max = CHARCODE_MAX;
2617 
2618     tab_de = mallocz((code_max + 2) * sizeof(*tab_de));
2619 
2620     for(i = code_max; i >= 0; i--) {
2621         find_decomp_run(tab_de, i);
2622     }
2623 
2624     /* build the data buffer */
2625     data_buf = malloc(100000);
2626     data_len = 0;
2627     array_len = 0;
2628     for(i = 0; i <= code_max; i++) {
2629         de = &tab_de[i];
2630         if (de->len != 0) {
2631             add_decomp_data(data_buf, &data_len, de);
2632             i += de->len - 1;
2633             array_len++;
2634         }
2635     }
2636 
2637 #ifdef DUMP_DECOMP_TABLE
2638     /* dump */
2639     {
2640         int size, size1;
2641 
2642         printf("START LEN   TYPE  L C SIZE\n");
2643         size = 0;
2644         for(i = 0; i <= code_max; i++) {
2645             de = &tab_de[i];
2646             if (de->len != 0) {
2647                 size1 = get_decomp_run_size(de);
2648                 printf("%05x %3d %6s %2d %1d %4d\n", i, de->len,
2649                        decomp_type_str[de->type], de->c_len,
2650                        unicode_db[i].is_compat, size1);
2651                 i += de->len - 1;
2652                 size += size1;
2653             }
2654         }
2655 
2656         printf("array_len=%d estimated size=%d bytes actual=%d bytes\n",
2657                array_len, size, array_len * 6 + data_len);
2658     }
2659 #endif
2660 
2661     fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {",
2662             array_len);
2663     count = 0;
2664     for(i = 0; i <= code_max; i++) {
2665         de = &tab_de[i];
2666         if (de->len != 0) {
2667             uint32_t v;
2668             if (count++ % 4 == 0)
2669                 fprintf(f, "\n   ");
2670             v = (de->code << (32 - 18)) |
2671                 (de->len << (32 - 18 - 7)) |
2672                 (de->type << (32 - 18 - 7 - 6)) |
2673                 unicode_db[de->code].is_compat;
2674             fprintf(f, " 0x%08x,", v);
2675             i += de->len - 1;
2676         }
2677     }
2678     fprintf(f, "\n};\n\n");
2679 
2680     fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {",
2681             array_len);
2682     count = 0;
2683     for(i = 0; i <= code_max; i++) {
2684         de = &tab_de[i];
2685         if (de->len != 0) {
2686             if (count++ % 8 == 0)
2687                 fprintf(f, "\n   ");
2688             fprintf(f, " 0x%04x,", de->data_index);
2689             i += de->len - 1;
2690         }
2691     }
2692     fprintf(f, "\n};\n\n");
2693 
2694     fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {",
2695             data_len);
2696     for(i = 0; i < data_len; i++) {
2697         if (i % 8 == 0)
2698             fprintf(f, "\n   ");
2699         fprintf(f, " 0x%02x,", data_buf[i]);
2700     }
2701     fprintf(f, "\n};\n\n");
2702 
2703     build_compose_table(f, tab_de);
2704 
2705     free(data_buf);
2706 
2707     free(tab_de);
2708 }
2709 
/* One composition: the pair of characters c[0], c[1] composes to p */
typedef struct {
    uint32_t c[2];
    uint32_t p;
} ComposeEntry;

/* maximum number of entries of the composition table */
#define COMPOSE_LEN_MAX 10000

/* qsort() comparison function: order ComposeEntry values by c[0], then
   by c[1]; 'p' is ignored. Return -1, 0 or 1. */
static int ce_cmp(const void *p1, const void *p2)
{
    const ComposeEntry *lhs = p1;
    const ComposeEntry *rhs = p2;
    int order;

    order = (lhs->c[0] > rhs->c[0]) - (lhs->c[0] < rhs->c[0]);
    if (order == 0)
        order = (lhs->c[1] > rhs->c[1]) - (lhs->c[1] < rhs->c[1]);
    return order;
}
2731 
2732 
get_decomp_pos(const DecompEntry * tab_de,int c)2733 static int get_decomp_pos(const DecompEntry *tab_de, int c)
2734 {
2735     int i, v, k;
2736     const DecompEntry *de;
2737 
2738     k = 0;
2739     for(i = 0; i <= CHARCODE_MAX; i++) {
2740         de = &tab_de[i];
2741         if (de->len != 0) {
2742             if (c >= de->code && c < de->code + de->len) {
2743                 v = c - de->code;
2744                 assert(v < 64);
2745                 v |= k << 6;
2746                 assert(v < 65536);
2747                 return v;
2748             }
2749             i += de->len - 1;
2750             k++;
2751         }
2752     }
2753     return -1;
2754 }
2755 
build_compose_table(FILE * f,const DecompEntry * tab_de)2756 void build_compose_table(FILE *f, const DecompEntry *tab_de)
2757 {
2758     int i, v, tab_ce_len;
2759     ComposeEntry *ce, *tab_ce;
2760 
2761     tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX);
2762     tab_ce_len = 0;
2763     for(i = 0; i <= CHARCODE_MAX; i++) {
2764         CCInfo *ci = &unicode_db[i];
2765         if (ci->decomp_len == 2 && !ci->is_compat &&
2766             !ci->is_excluded) {
2767             assert(tab_ce_len < COMPOSE_LEN_MAX);
2768             ce = &tab_ce[tab_ce_len++];
2769             ce->c[0] = ci->decomp_data[0];
2770             ce->c[1] = ci->decomp_data[1];
2771             ce->p = i;
2772         }
2773     }
2774     qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp);
2775 
2776 #if 0
2777     {
2778         printf("tab_ce_len=%d\n", tab_ce_len);
2779         for(i = 0; i < tab_ce_len; i++) {
2780             ce = &tab_ce[i];
2781             printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p);
2782         }
2783     }
2784 #endif
2785 
2786     fprintf(f, "static const uint16_t unicode_comp_table[%u] = {",
2787             tab_ce_len);
2788     for(i = 0; i < tab_ce_len; i++) {
2789         if (i % 8 == 0)
2790             fprintf(f, "\n   ");
2791         v = get_decomp_pos(tab_de, tab_ce[i].p);
2792         if (v < 0) {
2793             printf("ERROR: entry for c=%04x not found\n",
2794                    tab_ce[i].p);
2795             exit(1);
2796         }
2797         fprintf(f, " 0x%04x,", v);
2798     }
2799     fprintf(f, "\n};\n\n");
2800 
2801     free(tab_ce);
2802 }
2803 
2804 #ifdef USE_TEST
check_decompose_table(void)2805 void check_decompose_table(void)
2806 {
2807     int c;
2808     CCInfo *ci;
2809     int res[UNICODE_DECOMP_LEN_MAX], *ref;
2810     int len, ref_len, is_compat;
2811 
2812     for(is_compat = 0; is_compat <= 1; is_compat++) {
2813         for(c = 0; c < CHARCODE_MAX; c++) {
2814             ci = &unicode_db[c];
2815             ref_len = ci->decomp_len;
2816             ref = ci->decomp_data;
2817             if (!is_compat && ci->is_compat) {
2818                 ref_len = 0;
2819             }
2820             len = unicode_decomp_char((uint32_t *)res, c, is_compat);
2821             if (len != ref_len ||
2822                 tabcmp(res, ref, ref_len) != 0) {
2823                 printf("ERROR c=%05x compat=%d\n", c, is_compat);
2824                 dump_str("res", res, len);
2825                 dump_str("ref", ref, ref_len);
2826                 exit(1);
2827             }
2828         }
2829     }
2830 }
2831 
check_compose_table(void)2832 void check_compose_table(void)
2833 {
2834     int i, p;
2835     /* XXX: we don't test all the cases */
2836 
2837     for(i = 0; i <= CHARCODE_MAX; i++) {
2838         CCInfo *ci = &unicode_db[i];
2839         if (ci->decomp_len == 2 && !ci->is_compat &&
2840             !ci->is_excluded) {
2841             p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]);
2842             if (p != i) {
2843                 printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n",
2844                        ci->decomp_data[0], ci->decomp_data[1], p, i);
2845                 exit(1);
2846             }
2847         }
2848     }
2849 
2850 
2851 
2852 }
2853 
2854 #endif
2855 
2856 
2857 
2858 #ifdef USE_TEST
2859 
/* Check that the result buf1/len1 is equal to the reference buf2/len2.
   Otherwise print the test number 'num', 'msg', the input string and
   both strings, then exit. */
void check_str(const char *msg, int num, const int *in_buf, int in_len,
               const int *buf1, int len1,
               const int *buf2, int len2)
{
    if (len1 == len2 && tabcmp(buf1, buf2, len1) == 0)
        return;

    printf("%d: ERROR %s:\n", num, msg);
    dump_str(" in", in_buf, in_len);
    dump_str("res", buf1, len1);
    dump_str("ref", buf2, len2);
    exit(1);
}
2872 
check_cc_table(void)2873 void check_cc_table(void)
2874 {
2875     int cc, cc_ref, c;
2876 
2877     for(c = 0; c <= CHARCODE_MAX; c++) {
2878         cc_ref = unicode_db[c].combining_class;
2879         cc = unicode_get_cc(c);
2880         if (cc != cc_ref) {
2881             printf("ERROR: c=%04x cc=%d cc_ref=%d\n",
2882                    c, cc, cc_ref);
2883             exit(1);
2884         }
2885     }
2886 #ifdef PROFILE
2887     {
2888         int64_t ti, count;
2889 
2890         ti = get_time_ns();
2891         count = 0;
2892         /* only do it on meaningful chars */
2893         for(c = 0x20; c <= 0xffff; c++) {
2894             cc_ref = unicode_db[c].combining_class;
2895             cc = unicode_get_cc(c);
2896             count++;
2897         }
2898         ti = get_time_ns() - ti;
2899         printf("cc time=%0.1f ns/char\n",
2900                (double)ti / count);
2901     }
2902 #endif
2903 }
2904 
normalization_test(const char * filename)2905 void normalization_test(const char *filename)
2906 {
2907     FILE *f;
2908     char line[4096], *p;
2909     int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str;
2910     int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len;
2911     int *buf, buf_len, pos;
2912 
2913     f = fopen(filename, "rb");
2914     if (!f) {
2915         perror(filename);
2916         exit(1);
2917     }
2918     pos = 0;
2919     for(;;) {
2920         if (!get_line(line, sizeof(line), f))
2921             break;
2922         pos++;
2923         p = line;
2924         while (isspace(*p))
2925             p++;
2926         if (*p == '#' || *p == '@')
2927             continue;
2928         in_str = get_field_str(&in_len, p, 0);
2929         nfc_str = get_field_str(&nfc_len, p, 1);
2930         nfd_str = get_field_str(&nfd_len, p, 2);
2931         nfkc_str = get_field_str(&nfkc_len, p, 3);
2932         nfkd_str = get_field_str(&nfkd_len, p, 4);
2933 
2934         //        dump_str("in", in_str, in_len);
2935 
2936         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL);
2937         check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len);
2938         free(buf);
2939 
2940         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL);
2941         check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len);
2942         free(buf);
2943 
2944         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL);
2945         check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len);
2946         free(buf);
2947 
2948         buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL);
2949         check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len);
2950         free(buf);
2951 
2952         free(in_str);
2953         free(nfc_str);
2954         free(nfd_str);
2955         free(nfkc_str);
2956         free(nfkd_str);
2957     }
2958     fclose(f);
2959 }
2960 #endif
2961 
main(int argc,char ** argv)2962 int main(int argc, char **argv)
2963 {
2964     const char *unicode_db_path, *outfilename;
2965     char filename[1024];
2966 
2967     if (argc < 2) {
2968         printf("usage: %s unicode_db_path [output_file]\n"
2969                "\n"
2970                "If no output_file is given, a self test is done using the current unicode library\n",
2971                argv[0]);
2972         exit(1);
2973     }
2974     unicode_db_path = argv[1];
2975     outfilename = NULL;
2976     if (argc >= 3)
2977         outfilename = argv[2];
2978 
2979     unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
2980 
2981     snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
2982 
2983     parse_unicode_data(filename);
2984 
2985     snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path);
2986     parse_special_casing(unicode_db, filename);
2987 
2988     snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path);
2989     parse_case_folding(unicode_db, filename);
2990 
2991     snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path);
2992     parse_composition_exclusions(filename);
2993 
2994     snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path);
2995     parse_derived_core_properties(filename);
2996 
2997     snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path);
2998     parse_derived_norm_properties(filename);
2999 
3000     snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path);
3001     parse_prop_list(filename);
3002 
3003     snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path);
3004     parse_scripts(filename);
3005 
3006     snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt",
3007              unicode_db_path);
3008     parse_script_extensions(filename);
3009 
3010     snprintf(filename, sizeof(filename), "%s/emoji-data.txt",
3011              unicode_db_path);
3012     parse_prop_list(filename);
3013 
3014     //    dump_data(unicode_db);
3015 
3016     build_conv_table(unicode_db);
3017 
3018     //    dump_table();
3019 
3020     if (!outfilename) {
3021 #ifdef USE_TEST
3022         check_case_conv();
3023         check_flags();
3024         check_decompose_table();
3025         check_compose_table();
3026         check_cc_table();
3027         snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path);
3028         normalization_test(filename);
3029 #else
3030         fprintf(stderr, "Tests are not compiled\n");
3031         exit(1);
3032 #endif
3033     } else
3034     {
3035         FILE *fo = fopen(outfilename, "wb");
3036 
3037         if (!fo) {
3038             perror(outfilename);
3039             exit(1);
3040         }
3041         fprintf(fo,
3042                 "/* Compressed unicode tables */\n"
3043                 "/* Automatically generated file - do not edit */\n"
3044                 "\n"
3045                 "#include <stdint.h>\n"
3046                 "\n");
3047         dump_case_conv_table(fo);
3048         compute_internal_props();
3049         build_flags_tables(fo);
3050         fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n");
3051         build_cc_table(fo);
3052         build_decompose_table(fo);
3053         build_general_category_table(fo);
3054         build_script_table(fo);
3055         build_script_ext_table(fo);
3056         build_prop_list_table(fo);
3057         fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
3058         fclose(fo);
3059     }
3060     return 0;
3061 }
3062