1 /*
2 * Copyright (c) 2014 SGI.
3 * All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 /* Generator for a compact trie for unicode normalization */
20
21 #include <sys/types.h>
22 #include <stddef.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <assert.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <errno.h>
29
30 /* Default names of the in- and output files. */
31
32 #define AGE_NAME "DerivedAge.txt"
33 #define CCC_NAME "DerivedCombiningClass.txt"
34 #define PROP_NAME "DerivedCoreProperties.txt"
35 #define DATA_NAME "UnicodeData.txt"
36 #define FOLD_NAME "CaseFolding.txt"
37 #define NORM_NAME "NormalizationCorrections.txt"
38 #define TEST_NAME "NormalizationTest.txt"
39 #define UTF8_NAME "utf8data.h"
40
41 const char *age_name = AGE_NAME;
42 const char *ccc_name = CCC_NAME;
43 const char *prop_name = PROP_NAME;
44 const char *data_name = DATA_NAME;
45 const char *fold_name = FOLD_NAME;
46 const char *norm_name = NORM_NAME;
47 const char *test_name = TEST_NAME;
48 const char *utf8_name = UTF8_NAME;
49
50 int verbose = 0;
51
52 /* An arbitrary line size limit on input lines. */
53
54 #define LINESIZE 1024
55 char line[LINESIZE];
56 char buf0[LINESIZE];
57 char buf1[LINESIZE];
58 char buf2[LINESIZE];
59 char buf3[LINESIZE];
60
61 const char *argv0;
62
63 /* ------------------------------------------------------------------ */
64
65 /*
66 * Unicode version numbers consist of three parts: major, minor, and a
67 * revision. These numbers are packed into an unsigned int to obtain
68 * a single version number.
69 *
70 * To save space in the generated trie, the unicode version is not
71 * stored directly, instead we calculate a generation number from the
72 * unicode versions seen in the DerivedAge file, and use that as an
73 * index into a table of unicode versions.
74 */
75 #define UNICODE_MAJ_SHIFT (16)
76 #define UNICODE_MIN_SHIFT (8)
77
78 #define UNICODE_MAJ_MAX ((unsigned short)-1)
79 #define UNICODE_MIN_MAX ((unsigned char)-1)
80 #define UNICODE_REV_MAX ((unsigned char)-1)
81
82 #define UNICODE_AGE(MAJ,MIN,REV) \
83 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
84 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
85 ((unsigned int)(REV)))
86
87 unsigned int *ages;
88 int ages_count;
89
90 unsigned int unicode_maxage;
91
age_valid(unsigned int major,unsigned int minor,unsigned int revision)92 static int age_valid(unsigned int major, unsigned int minor,
93 unsigned int revision)
94 {
95 if (major > UNICODE_MAJ_MAX)
96 return 0;
97 if (minor > UNICODE_MIN_MAX)
98 return 0;
99 if (revision > UNICODE_REV_MAX)
100 return 0;
101 return 1;
102 }
103
104 /* ------------------------------------------------------------------ */
105
106 /*
107 * utf8trie_t
108 *
109 * A compact binary tree, used to decode UTF-8 characters.
110 *
111 * Internal nodes are one byte for the node itself, and up to three
112 * bytes for an offset into the tree. The first byte contains the
113 * following information:
114 * NEXTBYTE - flag - advance to next byte if set
115 * BITNUM - 3 bit field - the bit number to tested
116 * OFFLEN - 2 bit field - number of bytes in the offset
117 * if offlen == 0 (non-branching node)
118 * RIGHTPATH - 1 bit field - set if the following node is for the
119 * right-hand path (tested bit is set)
120 * TRIENODE - 1 bit field - set if the following node is an internal
121 * node, otherwise it is a leaf node
122 * if offlen != 0 (branching node)
123 * LEFTNODE - 1 bit field - set if the left-hand node is internal
124 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
125 *
126 * Due to the way utf8 works, there cannot be branching nodes with
127 * NEXTBYTE set, and moreover those nodes always have a righthand
128 * descendant.
129 */
130 typedef unsigned char utf8trie_t;
131 #define BITNUM 0x07
132 #define NEXTBYTE 0x08
133 #define OFFLEN 0x30
134 #define OFFLEN_SHIFT 4
135 #define RIGHTPATH 0x40
136 #define TRIENODE 0x80
137 #define RIGHTNODE 0x40
138 #define LEFTNODE 0x80
139
140 /*
141 * utf8leaf_t
142 *
143 * The leaves of the trie are embedded in the trie, and so the same
144 * underlying datatype, unsigned char.
145 *
146 * leaf[0]: The unicode version, stored as a generation number that is
147 * an index into utf8agetab[]. With this we can filter code
148 * points based on the unicode version in which they were
149 * defined. The CCC of a non-defined code point is 0.
150 * leaf[1]: Canonical Combining Class. During normalization, we need
151 * to do a stable sort into ascending order of all characters
152 * with a non-zero CCC that occur between two characters with
153 * a CCC of 0, or at the begin or end of a string.
154 * The unicode standard guarantees that all CCC values are
155 * between 0 and 254 inclusive, which leaves 255 available as
156 * a special value.
157 * Code points with CCC 0 are known as stoppers.
158 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
159 * start of a NUL-terminated string that is the decomposition
160 * of the character.
161 * The CCC of a decomposable character is the same as the CCC
162 * of the first character of its decomposition.
163 * Some characters decompose as the empty string: these are
164 * characters with the Default_Ignorable_Code_Point property.
165 * These do affect normalization, as they all have CCC 0.
166 *
167 * The decompositions in the trie have been fully expanded.
168 *
169 * Casefolding, if applicable, is also done using decompositions.
170 */
171 typedef unsigned char utf8leaf_t;
172
173 #define LEAF_GEN(LEAF) ((LEAF)[0])
174 #define LEAF_CCC(LEAF) ((LEAF)[1])
175 #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
176
177 #define MAXGEN (255)
178
179 #define MINCCC (0)
180 #define MAXCCC (254)
181 #define STOPPER (0)
182 #define DECOMPOSE (255)
183 #define HANGUL ((char)(255))
184
185 #define UTF8HANGULLEAF (12)
186
187 struct tree;
188 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
189 const char *, size_t);
190 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
191
192 unsigned char *utf8data;
193 size_t utf8data_size;
194
195 utf8trie_t *nfkdi;
196 utf8trie_t *nfkdicf;
197
198 /* ------------------------------------------------------------------ */
199
200 /*
201 * UTF8 valid ranges.
202 *
203 * The UTF-8 encoding spreads the bits of a 32bit word over several
204 * bytes. This table gives the ranges that can be held and how they'd
205 * be represented.
206 *
207 * 0x00000000 0x0000007F: 0xxxxxxx
208 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
209 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
210 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
211 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
212 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
213 *
214 * There is an additional requirement on UTF-8, in that only the
215 * shortest representation of a 32bit value is to be used. A decoder
216 * must not decode sequences that do not satisfy this requirement.
217 * Thus the allowed ranges have a lower bound.
218 *
219 * 0x00000000 0x0000007F: 0xxxxxxx
220 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
221 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
222 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
223 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
224 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
225 *
226 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
227 * 17 planes of 65536 values. This limits the sequences actually seen
228 * even more, to just the following.
229 *
230 * 0 - 0x7f: 0 0x7f
231 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
232 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
233 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
234 *
235 * Even within those ranges not all values are allowed: the surrogates
236 * 0xd800 - 0xdfff should never be seen.
237 *
238 * Note that the longest sequence seen with valid usage is 4 bytes,
239 * the same a single UTF-32 character. This makes the UTF-8
240 * representation of Unicode strictly smaller than UTF-32.
241 *
242 * The shortest sequence requirement was introduced by:
243 * Corrigendum #1: UTF-8 Shortest Form
244 * It can be found here:
245 * http://www.unicode.org/versions/corrigendum1.html
246 *
247 */
248
249 #define UTF8_2_BITS 0xC0
250 #define UTF8_3_BITS 0xE0
251 #define UTF8_4_BITS 0xF0
252 #define UTF8_N_BITS 0x80
253 #define UTF8_2_MASK 0xE0
254 #define UTF8_3_MASK 0xF0
255 #define UTF8_4_MASK 0xF8
256 #define UTF8_N_MASK 0xC0
257 #define UTF8_V_MASK 0x3F
258 #define UTF8_V_SHIFT 6
259
utf8encode(char * str,unsigned int val)260 static int utf8encode(char *str, unsigned int val)
261 {
262 int len;
263
264 if (val < 0x80) {
265 str[0] = val;
266 len = 1;
267 } else if (val < 0x800) {
268 str[1] = val & UTF8_V_MASK;
269 str[1] |= UTF8_N_BITS;
270 val >>= UTF8_V_SHIFT;
271 str[0] = val;
272 str[0] |= UTF8_2_BITS;
273 len = 2;
274 } else if (val < 0x10000) {
275 str[2] = val & UTF8_V_MASK;
276 str[2] |= UTF8_N_BITS;
277 val >>= UTF8_V_SHIFT;
278 str[1] = val & UTF8_V_MASK;
279 str[1] |= UTF8_N_BITS;
280 val >>= UTF8_V_SHIFT;
281 str[0] = val;
282 str[0] |= UTF8_3_BITS;
283 len = 3;
284 } else if (val < 0x110000) {
285 str[3] = val & UTF8_V_MASK;
286 str[3] |= UTF8_N_BITS;
287 val >>= UTF8_V_SHIFT;
288 str[2] = val & UTF8_V_MASK;
289 str[2] |= UTF8_N_BITS;
290 val >>= UTF8_V_SHIFT;
291 str[1] = val & UTF8_V_MASK;
292 str[1] |= UTF8_N_BITS;
293 val >>= UTF8_V_SHIFT;
294 str[0] = val;
295 str[0] |= UTF8_4_BITS;
296 len = 4;
297 } else {
298 printf("%#x: illegal val\n", val);
299 len = 0;
300 }
301 return len;
302 }
303
utf8decode(const char * str)304 static unsigned int utf8decode(const char *str)
305 {
306 const unsigned char *s = (const unsigned char*)str;
307 unsigned int unichar = 0;
308
309 if (*s < 0x80) {
310 unichar = *s;
311 } else if (*s < UTF8_3_BITS) {
312 unichar = *s++ & 0x1F;
313 unichar <<= UTF8_V_SHIFT;
314 unichar |= *s & 0x3F;
315 } else if (*s < UTF8_4_BITS) {
316 unichar = *s++ & 0x0F;
317 unichar <<= UTF8_V_SHIFT;
318 unichar |= *s++ & 0x3F;
319 unichar <<= UTF8_V_SHIFT;
320 unichar |= *s & 0x3F;
321 } else {
322 unichar = *s++ & 0x0F;
323 unichar <<= UTF8_V_SHIFT;
324 unichar |= *s++ & 0x3F;
325 unichar <<= UTF8_V_SHIFT;
326 unichar |= *s++ & 0x3F;
327 unichar <<= UTF8_V_SHIFT;
328 unichar |= *s & 0x3F;
329 }
330 return unichar;
331 }
332
utf32valid(unsigned int unichar)333 static int utf32valid(unsigned int unichar)
334 {
335 return unichar < 0x110000;
336 }
337
338 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
339
340 #define NODE 1
341 #define LEAF 0
342
343 struct tree {
344 void *root;
345 int childnode;
346 const char *type;
347 unsigned int maxage;
348 struct tree *next;
349 int (*leaf_equal)(void *, void *);
350 void (*leaf_print)(void *, int);
351 int (*leaf_mark)(void *);
352 int (*leaf_size)(void *);
353 int *(*leaf_index)(struct tree *, void *);
354 unsigned char *(*leaf_emit)(void *, unsigned char *);
355 int leafindex[0x110000];
356 int index;
357 };
358
359 struct node {
360 int index;
361 int offset;
362 int mark;
363 int size;
364 struct node *parent;
365 void *left;
366 void *right;
367 unsigned char bitnum;
368 unsigned char nextbyte;
369 unsigned char leftnode;
370 unsigned char rightnode;
371 unsigned int keybits;
372 unsigned int keymask;
373 };
374
375 /*
376 * Example lookup function for a tree.
377 */
lookup(struct tree * tree,const char * key)378 static void *lookup(struct tree *tree, const char *key)
379 {
380 struct node *node;
381 void *leaf = NULL;
382
383 node = tree->root;
384 while (!leaf && node) {
385 if (node->nextbyte)
386 key++;
387 if (*key & (1 << (node->bitnum & 7))) {
388 /* Right leg */
389 if (node->rightnode == NODE) {
390 node = node->right;
391 } else if (node->rightnode == LEAF) {
392 leaf = node->right;
393 } else {
394 node = NULL;
395 }
396 } else {
397 /* Left leg */
398 if (node->leftnode == NODE) {
399 node = node->left;
400 } else if (node->leftnode == LEAF) {
401 leaf = node->left;
402 } else {
403 node = NULL;
404 }
405 }
406 }
407
408 return leaf;
409 }
410
411 /*
412 * A simple non-recursive tree walker: keep track of visits to the
413 * left and right branches in the leftmask and rightmask.
414 */
tree_walk(struct tree * tree)415 static void tree_walk(struct tree *tree)
416 {
417 struct node *node;
418 unsigned int leftmask;
419 unsigned int rightmask;
420 unsigned int bitmask;
421 int indent = 1;
422 int nodes, singletons, leaves;
423
424 nodes = singletons = leaves = 0;
425
426 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
427 if (tree->childnode == LEAF) {
428 assert(tree->root);
429 tree->leaf_print(tree->root, indent);
430 leaves = 1;
431 } else {
432 assert(tree->childnode == NODE);
433 node = tree->root;
434 leftmask = rightmask = 0;
435 while (node) {
436 printf("%*snode @ %p bitnum %d nextbyte %d"
437 " left %p right %p mask %x bits %x\n",
438 indent, "", node,
439 node->bitnum, node->nextbyte,
440 node->left, node->right,
441 node->keymask, node->keybits);
442 nodes += 1;
443 if (!(node->left && node->right))
444 singletons += 1;
445
446 while (node) {
447 bitmask = 1 << node->bitnum;
448 if ((leftmask & bitmask) == 0) {
449 leftmask |= bitmask;
450 if (node->leftnode == LEAF) {
451 assert(node->left);
452 tree->leaf_print(node->left,
453 indent+1);
454 leaves += 1;
455 } else if (node->left) {
456 assert(node->leftnode == NODE);
457 indent += 1;
458 node = node->left;
459 break;
460 }
461 }
462 if ((rightmask & bitmask) == 0) {
463 rightmask |= bitmask;
464 if (node->rightnode == LEAF) {
465 assert(node->right);
466 tree->leaf_print(node->right,
467 indent+1);
468 leaves += 1;
469 } else if (node->right) {
470 assert(node->rightnode == NODE);
471 indent += 1;
472 node = node->right;
473 break;
474 }
475 }
476 leftmask &= ~bitmask;
477 rightmask &= ~bitmask;
478 node = node->parent;
479 indent -= 1;
480 }
481 }
482 }
483 printf("nodes %d leaves %d singletons %d\n",
484 nodes, leaves, singletons);
485 }
486
487 /*
488 * Allocate an initialize a new internal node.
489 */
alloc_node(struct node * parent)490 static struct node *alloc_node(struct node *parent)
491 {
492 struct node *node;
493 int bitnum;
494
495 node = malloc(sizeof(*node));
496 node->left = node->right = NULL;
497 node->parent = parent;
498 node->leftnode = NODE;
499 node->rightnode = NODE;
500 node->keybits = 0;
501 node->keymask = 0;
502 node->mark = 0;
503 node->index = 0;
504 node->offset = -1;
505 node->size = 4;
506
507 if (node->parent) {
508 bitnum = parent->bitnum;
509 if ((bitnum & 7) == 0) {
510 node->bitnum = bitnum + 7 + 8;
511 node->nextbyte = 1;
512 } else {
513 node->bitnum = bitnum - 1;
514 node->nextbyte = 0;
515 }
516 } else {
517 node->bitnum = 7;
518 node->nextbyte = 0;
519 }
520
521 return node;
522 }
523
524 /*
525 * Insert a new leaf into the tree, and collapse any subtrees that are
526 * fully populated and end in identical leaves. A nextbyte tagged
527 * internal node will not be removed to preserve the tree's integrity.
528 * Note that due to the structure of utf8, no nextbyte tagged node
529 * will be a candidate for removal.
530 */
insert(struct tree * tree,char * key,int keylen,void * leaf)531 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
532 {
533 struct node *node;
534 struct node *parent;
535 void **cursor;
536 int keybits;
537
538 assert(keylen >= 1 && keylen <= 4);
539
540 node = NULL;
541 cursor = &tree->root;
542 keybits = 8 * keylen;
543
544 /* Insert, creating path along the way. */
545 while (keybits) {
546 if (!*cursor)
547 *cursor = alloc_node(node);
548 node = *cursor;
549 if (node->nextbyte)
550 key++;
551 if (*key & (1 << (node->bitnum & 7)))
552 cursor = &node->right;
553 else
554 cursor = &node->left;
555 keybits--;
556 }
557 *cursor = leaf;
558
559 /* Merge subtrees if possible. */
560 while (node) {
561 if (*key & (1 << (node->bitnum & 7)))
562 node->rightnode = LEAF;
563 else
564 node->leftnode = LEAF;
565 if (node->nextbyte)
566 break;
567 if (node->leftnode == NODE || node->rightnode == NODE)
568 break;
569 assert(node->left);
570 assert(node->right);
571 /* Compare */
572 if (! tree->leaf_equal(node->left, node->right))
573 break;
574 /* Keep left, drop right leaf. */
575 leaf = node->left;
576 /* Check in parent */
577 parent = node->parent;
578 if (!parent) {
579 /* root of tree! */
580 tree->root = leaf;
581 tree->childnode = LEAF;
582 } else if (parent->left == node) {
583 parent->left = leaf;
584 parent->leftnode = LEAF;
585 if (parent->right) {
586 parent->keymask = 0;
587 parent->keybits = 0;
588 } else {
589 parent->keymask |= (1 << node->bitnum);
590 }
591 } else if (parent->right == node) {
592 parent->right = leaf;
593 parent->rightnode = LEAF;
594 if (parent->left) {
595 parent->keymask = 0;
596 parent->keybits = 0;
597 } else {
598 parent->keymask |= (1 << node->bitnum);
599 parent->keybits |= (1 << node->bitnum);
600 }
601 } else {
602 /* internal tree error */
603 assert(0);
604 }
605 free(node);
606 node = parent;
607 }
608
609 /* Propagate keymasks up along singleton chains. */
610 while (node) {
611 parent = node->parent;
612 if (!parent)
613 break;
614 /* Nix the mask for parents with two children. */
615 if (node->keymask == 0) {
616 parent->keymask = 0;
617 parent->keybits = 0;
618 } else if (parent->left && parent->right) {
619 parent->keymask = 0;
620 parent->keybits = 0;
621 } else {
622 assert((parent->keymask & node->keymask) == 0);
623 parent->keymask |= node->keymask;
624 parent->keymask |= (1 << parent->bitnum);
625 parent->keybits |= node->keybits;
626 if (parent->right)
627 parent->keybits |= (1 << parent->bitnum);
628 }
629 node = parent;
630 }
631
632 return 0;
633 }
634
635 /*
636 * Prune internal nodes.
637 *
638 * Fully populated subtrees that end at the same leaf have already
639 * been collapsed. There are still internal nodes that have for both
640 * their left and right branches a sequence of singletons that make
641 * identical choices and end in identical leaves. The keymask and
642 * keybits collected in the nodes describe the choices made in these
643 * singleton chains. When they are identical for the left and right
644 * branch of a node, and the two leaves comare identical, the node in
645 * question can be removed.
646 *
647 * Note that nodes with the nextbyte tag set will not be removed by
648 * this to ensure tree integrity. Note as well that the structure of
649 * utf8 ensures that these nodes would not have been candidates for
650 * removal in any case.
651 */
prune(struct tree * tree)652 static void prune(struct tree *tree)
653 {
654 struct node *node;
655 struct node *left;
656 struct node *right;
657 struct node *parent;
658 void *leftleaf;
659 void *rightleaf;
660 unsigned int leftmask;
661 unsigned int rightmask;
662 unsigned int bitmask;
663 int count;
664
665 if (verbose > 0)
666 printf("Pruning %s_%x\n", tree->type, tree->maxage);
667
668 count = 0;
669 if (tree->childnode == LEAF)
670 return;
671 if (!tree->root)
672 return;
673
674 leftmask = rightmask = 0;
675 node = tree->root;
676 while (node) {
677 if (node->nextbyte)
678 goto advance;
679 if (node->leftnode == LEAF)
680 goto advance;
681 if (node->rightnode == LEAF)
682 goto advance;
683 if (!node->left)
684 goto advance;
685 if (!node->right)
686 goto advance;
687 left = node->left;
688 right = node->right;
689 if (left->keymask == 0)
690 goto advance;
691 if (right->keymask == 0)
692 goto advance;
693 if (left->keymask != right->keymask)
694 goto advance;
695 if (left->keybits != right->keybits)
696 goto advance;
697 leftleaf = NULL;
698 while (!leftleaf) {
699 assert(left->left || left->right);
700 if (left->leftnode == LEAF)
701 leftleaf = left->left;
702 else if (left->rightnode == LEAF)
703 leftleaf = left->right;
704 else if (left->left)
705 left = left->left;
706 else if (left->right)
707 left = left->right;
708 else
709 assert(0);
710 }
711 rightleaf = NULL;
712 while (!rightleaf) {
713 assert(right->left || right->right);
714 if (right->leftnode == LEAF)
715 rightleaf = right->left;
716 else if (right->rightnode == LEAF)
717 rightleaf = right->right;
718 else if (right->left)
719 right = right->left;
720 else if (right->right)
721 right = right->right;
722 else
723 assert(0);
724 }
725 if (! tree->leaf_equal(leftleaf, rightleaf))
726 goto advance;
727 /*
728 * This node has identical singleton-only subtrees.
729 * Remove it.
730 */
731 parent = node->parent;
732 left = node->left;
733 right = node->right;
734 if (parent->left == node)
735 parent->left = left;
736 else if (parent->right == node)
737 parent->right = left;
738 else
739 assert(0);
740 left->parent = parent;
741 left->keymask |= (1 << node->bitnum);
742 node->left = NULL;
743 while (node) {
744 bitmask = 1 << node->bitnum;
745 leftmask &= ~bitmask;
746 rightmask &= ~bitmask;
747 if (node->leftnode == NODE && node->left) {
748 left = node->left;
749 free(node);
750 count++;
751 node = left;
752 } else if (node->rightnode == NODE && node->right) {
753 right = node->right;
754 free(node);
755 count++;
756 node = right;
757 } else {
758 node = NULL;
759 }
760 }
761 /* Propagate keymasks up along singleton chains. */
762 node = parent;
763 /* Force re-check */
764 bitmask = 1 << node->bitnum;
765 leftmask &= ~bitmask;
766 rightmask &= ~bitmask;
767 for (;;) {
768 if (node->left && node->right)
769 break;
770 if (node->left) {
771 left = node->left;
772 node->keymask |= left->keymask;
773 node->keybits |= left->keybits;
774 }
775 if (node->right) {
776 right = node->right;
777 node->keymask |= right->keymask;
778 node->keybits |= right->keybits;
779 }
780 node->keymask |= (1 << node->bitnum);
781 node = node->parent;
782 /* Force re-check */
783 bitmask = 1 << node->bitnum;
784 leftmask &= ~bitmask;
785 rightmask &= ~bitmask;
786 }
787 advance:
788 bitmask = 1 << node->bitnum;
789 if ((leftmask & bitmask) == 0 &&
790 node->leftnode == NODE &&
791 node->left) {
792 leftmask |= bitmask;
793 node = node->left;
794 } else if ((rightmask & bitmask) == 0 &&
795 node->rightnode == NODE &&
796 node->right) {
797 rightmask |= bitmask;
798 node = node->right;
799 } else {
800 leftmask &= ~bitmask;
801 rightmask &= ~bitmask;
802 node = node->parent;
803 }
804 }
805 if (verbose > 0)
806 printf("Pruned %d nodes\n", count);
807 }
808
809 /*
810 * Mark the nodes in the tree that lead to leaves that must be
811 * emitted.
812 */
mark_nodes(struct tree * tree)813 static void mark_nodes(struct tree *tree)
814 {
815 struct node *node;
816 struct node *n;
817 unsigned int leftmask;
818 unsigned int rightmask;
819 unsigned int bitmask;
820 int marked;
821
822 marked = 0;
823 if (verbose > 0)
824 printf("Marking %s_%x\n", tree->type, tree->maxage);
825 if (tree->childnode == LEAF)
826 goto done;
827
828 assert(tree->childnode == NODE);
829 node = tree->root;
830 leftmask = rightmask = 0;
831 while (node) {
832 bitmask = 1 << node->bitnum;
833 if ((leftmask & bitmask) == 0) {
834 leftmask |= bitmask;
835 if (node->leftnode == LEAF) {
836 assert(node->left);
837 if (tree->leaf_mark(node->left)) {
838 n = node;
839 while (n && !n->mark) {
840 marked++;
841 n->mark = 1;
842 n = n->parent;
843 }
844 }
845 } else if (node->left) {
846 assert(node->leftnode == NODE);
847 node = node->left;
848 continue;
849 }
850 }
851 if ((rightmask & bitmask) == 0) {
852 rightmask |= bitmask;
853 if (node->rightnode == LEAF) {
854 assert(node->right);
855 if (tree->leaf_mark(node->right)) {
856 n = node;
857 while (n && !n->mark) {
858 marked++;
859 n->mark = 1;
860 n = n->parent;
861 }
862 }
863 } else if (node->right) {
864 assert(node->rightnode == NODE);
865 node = node->right;
866 continue;
867 }
868 }
869 leftmask &= ~bitmask;
870 rightmask &= ~bitmask;
871 node = node->parent;
872 }
873
874 /* second pass: left siblings and singletons */
875
876 assert(tree->childnode == NODE);
877 node = tree->root;
878 leftmask = rightmask = 0;
879 while (node) {
880 bitmask = 1 << node->bitnum;
881 if ((leftmask & bitmask) == 0) {
882 leftmask |= bitmask;
883 if (node->leftnode == LEAF) {
884 assert(node->left);
885 if (tree->leaf_mark(node->left)) {
886 n = node;
887 while (n && !n->mark) {
888 marked++;
889 n->mark = 1;
890 n = n->parent;
891 }
892 }
893 } else if (node->left) {
894 assert(node->leftnode == NODE);
895 node = node->left;
896 if (!node->mark && node->parent->mark) {
897 marked++;
898 node->mark = 1;
899 }
900 continue;
901 }
902 }
903 if ((rightmask & bitmask) == 0) {
904 rightmask |= bitmask;
905 if (node->rightnode == LEAF) {
906 assert(node->right);
907 if (tree->leaf_mark(node->right)) {
908 n = node;
909 while (n && !n->mark) {
910 marked++;
911 n->mark = 1;
912 n = n->parent;
913 }
914 }
915 } else if (node->right) {
916 assert(node->rightnode == NODE);
917 node = node->right;
918 if (!node->mark && node->parent->mark &&
919 !node->parent->left) {
920 marked++;
921 node->mark = 1;
922 }
923 continue;
924 }
925 }
926 leftmask &= ~bitmask;
927 rightmask &= ~bitmask;
928 node = node->parent;
929 }
930 done:
931 if (verbose > 0)
932 printf("Marked %d nodes\n", marked);
933 }
934
935 /*
936 * Compute the index of each node and leaf, which is the offset in the
937 * emitted trie. These values must be pre-computed because relative
938 * offsets between nodes are used to navigate the tree.
939 */
index_nodes(struct tree * tree,int index)940 static int index_nodes(struct tree *tree, int index)
941 {
942 struct node *node;
943 unsigned int leftmask;
944 unsigned int rightmask;
945 unsigned int bitmask;
946 int count;
947 int indent;
948
949 /* Align to a cache line (or half a cache line?). */
950 while (index % 64)
951 index++;
952 tree->index = index;
953 indent = 1;
954 count = 0;
955
956 if (verbose > 0)
957 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
958 if (tree->childnode == LEAF) {
959 index += tree->leaf_size(tree->root);
960 goto done;
961 }
962
963 assert(tree->childnode == NODE);
964 node = tree->root;
965 leftmask = rightmask = 0;
966 while (node) {
967 if (!node->mark)
968 goto skip;
969 count++;
970 if (node->index != index)
971 node->index = index;
972 index += node->size;
973 skip:
974 while (node) {
975 bitmask = 1 << node->bitnum;
976 if (node->mark && (leftmask & bitmask) == 0) {
977 leftmask |= bitmask;
978 if (node->leftnode == LEAF) {
979 assert(node->left);
980 *tree->leaf_index(tree, node->left) =
981 index;
982 index += tree->leaf_size(node->left);
983 count++;
984 } else if (node->left) {
985 assert(node->leftnode == NODE);
986 indent += 1;
987 node = node->left;
988 break;
989 }
990 }
991 if (node->mark && (rightmask & bitmask) == 0) {
992 rightmask |= bitmask;
993 if (node->rightnode == LEAF) {
994 assert(node->right);
995 *tree->leaf_index(tree, node->right) = index;
996 index += tree->leaf_size(node->right);
997 count++;
998 } else if (node->right) {
999 assert(node->rightnode == NODE);
1000 indent += 1;
1001 node = node->right;
1002 break;
1003 }
1004 }
1005 leftmask &= ~bitmask;
1006 rightmask &= ~bitmask;
1007 node = node->parent;
1008 indent -= 1;
1009 }
1010 }
1011 done:
1012 /* Round up to a multiple of 16 */
1013 while (index % 16)
1014 index++;
1015 if (verbose > 0)
1016 printf("Final index %d\n", index);
1017 return index;
1018 }
1019
1020 /*
1021 * Mark the nodes in a subtree, helper for size_nodes().
1022 */
mark_subtree(struct node * node)1023 static int mark_subtree(struct node *node)
1024 {
1025 int changed;
1026
1027 if (!node || node->mark)
1028 return 0;
1029 node->mark = 1;
1030 node->index = node->parent->index;
1031 changed = 1;
1032 if (node->leftnode == NODE)
1033 changed += mark_subtree(node->left);
1034 if (node->rightnode == NODE)
1035 changed += mark_subtree(node->right);
1036 return changed;
1037 }
1038
1039 /*
1040 * Compute the size of nodes and leaves. We start by assuming that
1041 * each node needs to store a three-byte offset. The indexes of the
1042 * nodes are calculated based on that, and then this function is
1043 * called to see if the sizes of some nodes can be reduced. This is
1044 * repeated until no more changes are seen.
1045 */
size_nodes(struct tree * tree)1046 static int size_nodes(struct tree *tree)
1047 {
1048 struct tree *next;
1049 struct node *node;
1050 struct node *right;
1051 struct node *n;
1052 unsigned int leftmask;
1053 unsigned int rightmask;
1054 unsigned int bitmask;
1055 unsigned int pathbits;
1056 unsigned int pathmask;
1057 unsigned int nbit;
1058 int changed;
1059 int offset;
1060 int size;
1061 int indent;
1062
1063 indent = 1;
1064 changed = 0;
1065 size = 0;
1066
1067 if (verbose > 0)
1068 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1069 if (tree->childnode == LEAF)
1070 goto done;
1071
1072 assert(tree->childnode == NODE);
1073 pathbits = 0;
1074 pathmask = 0;
1075 node = tree->root;
1076 leftmask = rightmask = 0;
1077 while (node) {
1078 if (!node->mark)
1079 goto skip;
1080 offset = 0;
1081 if (!node->left || !node->right) {
1082 size = 1;
1083 } else {
1084 if (node->rightnode == NODE) {
1085 /*
1086 * If the right node is not marked,
1087 * look for a corresponding node in
1088 * the next tree. Such a node need
1089 * not exist.
1090 */
1091 right = node->right;
1092 next = tree->next;
1093 while (!right->mark) {
1094 assert(next);
1095 n = next->root;
1096 while (n->bitnum != node->bitnum) {
1097 nbit = 1 << n->bitnum;
1098 if (!(pathmask & nbit))
1099 break;
1100 if (pathbits & nbit) {
1101 if (n->rightnode == LEAF)
1102 break;
1103 n = n->right;
1104 } else {
1105 if (n->leftnode == LEAF)
1106 break;
1107 n = n->left;
1108 }
1109 }
1110 if (n->bitnum != node->bitnum)
1111 break;
1112 n = n->right;
1113 right = n;
1114 next = next->next;
1115 }
1116 /* Make sure the right node is marked. */
1117 if (!right->mark)
1118 changed += mark_subtree(right);
1119 offset = right->index - node->index;
1120 } else {
1121 offset = *tree->leaf_index(tree, node->right);
1122 offset -= node->index;
1123 }
1124 assert(offset >= 0);
1125 assert(offset <= 0xffffff);
1126 if (offset <= 0xff) {
1127 size = 2;
1128 } else if (offset <= 0xffff) {
1129 size = 3;
1130 } else { /* offset <= 0xffffff */
1131 size = 4;
1132 }
1133 }
1134 if (node->size != size || node->offset != offset) {
1135 node->size = size;
1136 node->offset = offset;
1137 changed++;
1138 }
1139 skip:
1140 while (node) {
1141 bitmask = 1 << node->bitnum;
1142 pathmask |= bitmask;
1143 if (node->mark && (leftmask & bitmask) == 0) {
1144 leftmask |= bitmask;
1145 if (node->leftnode == LEAF) {
1146 assert(node->left);
1147 } else if (node->left) {
1148 assert(node->leftnode == NODE);
1149 indent += 1;
1150 node = node->left;
1151 break;
1152 }
1153 }
1154 if (node->mark && (rightmask & bitmask) == 0) {
1155 rightmask |= bitmask;
1156 pathbits |= bitmask;
1157 if (node->rightnode == LEAF) {
1158 assert(node->right);
1159 } else if (node->right) {
1160 assert(node->rightnode == NODE);
1161 indent += 1;
1162 node = node->right;
1163 break;
1164 }
1165 }
1166 leftmask &= ~bitmask;
1167 rightmask &= ~bitmask;
1168 pathmask &= ~bitmask;
1169 pathbits &= ~bitmask;
1170 node = node->parent;
1171 indent -= 1;
1172 }
1173 }
1174 done:
1175 if (verbose > 0)
1176 printf("Found %d changes\n", changed);
1177 return changed;
1178 }
1179
1180 /*
1181 * Emit a trie for the given tree into the data array.
1182 */
emit(struct tree * tree,unsigned char * data)1183 static void emit(struct tree *tree, unsigned char *data)
1184 {
1185 struct node *node;
1186 unsigned int leftmask;
1187 unsigned int rightmask;
1188 unsigned int bitmask;
1189 int offlen;
1190 int offset;
1191 int index;
1192 int indent;
1193 int size;
1194 int bytes;
1195 int leaves;
1196 int nodes[4];
1197 unsigned char byte;
1198
1199 nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1200 leaves = 0;
1201 bytes = 0;
1202 index = tree->index;
1203 data += index;
1204 indent = 1;
1205 if (verbose > 0)
1206 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1207 if (tree->childnode == LEAF) {
1208 assert(tree->root);
1209 tree->leaf_emit(tree->root, data);
1210 size = tree->leaf_size(tree->root);
1211 index += size;
1212 leaves++;
1213 goto done;
1214 }
1215
1216 assert(tree->childnode == NODE);
1217 node = tree->root;
1218 leftmask = rightmask = 0;
1219 while (node) {
1220 if (!node->mark)
1221 goto skip;
1222 assert(node->offset != -1);
1223 assert(node->index == index);
1224
1225 byte = 0;
1226 if (node->nextbyte)
1227 byte |= NEXTBYTE;
1228 byte |= (node->bitnum & BITNUM);
1229 if (node->left && node->right) {
1230 if (node->leftnode == NODE)
1231 byte |= LEFTNODE;
1232 if (node->rightnode == NODE)
1233 byte |= RIGHTNODE;
1234 if (node->offset <= 0xff)
1235 offlen = 1;
1236 else if (node->offset <= 0xffff)
1237 offlen = 2;
1238 else
1239 offlen = 3;
1240 nodes[offlen]++;
1241 offset = node->offset;
1242 byte |= offlen << OFFLEN_SHIFT;
1243 *data++ = byte;
1244 index++;
1245 while (offlen--) {
1246 *data++ = offset & 0xff;
1247 index++;
1248 offset >>= 8;
1249 }
1250 } else if (node->left) {
1251 if (node->leftnode == NODE)
1252 byte |= TRIENODE;
1253 nodes[0]++;
1254 *data++ = byte;
1255 index++;
1256 } else if (node->right) {
1257 byte |= RIGHTNODE;
1258 if (node->rightnode == NODE)
1259 byte |= TRIENODE;
1260 nodes[0]++;
1261 *data++ = byte;
1262 index++;
1263 } else {
1264 assert(0);
1265 }
1266 skip:
1267 while (node) {
1268 bitmask = 1 << node->bitnum;
1269 if (node->mark && (leftmask & bitmask) == 0) {
1270 leftmask |= bitmask;
1271 if (node->leftnode == LEAF) {
1272 assert(node->left);
1273 data = tree->leaf_emit(node->left,
1274 data);
1275 size = tree->leaf_size(node->left);
1276 index += size;
1277 bytes += size;
1278 leaves++;
1279 } else if (node->left) {
1280 assert(node->leftnode == NODE);
1281 indent += 1;
1282 node = node->left;
1283 break;
1284 }
1285 }
1286 if (node->mark && (rightmask & bitmask) == 0) {
1287 rightmask |= bitmask;
1288 if (node->rightnode == LEAF) {
1289 assert(node->right);
1290 data = tree->leaf_emit(node->right,
1291 data);
1292 size = tree->leaf_size(node->right);
1293 index += size;
1294 bytes += size;
1295 leaves++;
1296 } else if (node->right) {
1297 assert(node->rightnode == NODE);
1298 indent += 1;
1299 node = node->right;
1300 break;
1301 }
1302 }
1303 leftmask &= ~bitmask;
1304 rightmask &= ~bitmask;
1305 node = node->parent;
1306 indent -= 1;
1307 }
1308 }
1309 done:
1310 if (verbose > 0) {
1311 printf("Emitted %d (%d) leaves",
1312 leaves, bytes);
1313 printf(" %d (%d+%d+%d+%d) nodes",
1314 nodes[0] + nodes[1] + nodes[2] + nodes[3],
1315 nodes[0], nodes[1], nodes[2], nodes[3]);
1316 printf(" %d total\n", index - tree->index);
1317 }
1318 }
1319
1320 /* ------------------------------------------------------------------ */
1321
1322 /*
1323 * Unicode data.
1324 *
1325 * We need to keep track of the Canonical Combining Class, the Age,
1326 * and decompositions for a code point.
1327 *
1328 * For the Age, we store the index into the ages table. Effectively
1329 * this is a generation number that the table maps to a unicode
1330 * version.
1331 *
1332 * The correction field is used to indicate that this entry is in the
1333 * corrections array, which contains decompositions that were
1334 * corrected in later revisions. The value of the correction field is
1335 * the Unicode version in which the mapping was corrected.
1336 */
1337 struct unicode_data {
1338 unsigned int code;
1339 int ccc;
1340 int gen;
1341 int correction;
1342 unsigned int *utf32nfkdi;
1343 unsigned int *utf32nfkdicf;
1344 char *utf8nfkdi;
1345 char *utf8nfkdicf;
1346 };
1347
1348 struct unicode_data unicode_data[0x110000];
1349 struct unicode_data *corrections;
1350 int corrections_count;
1351
1352 struct tree *nfkdi_tree;
1353 struct tree *nfkdicf_tree;
1354
1355 struct tree *trees;
1356 int trees_count;
1357
1358 /*
1359 * Check the corrections array to see if this entry was corrected at
1360 * some point.
1361 */
corrections_lookup(struct unicode_data * u)1362 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1363 {
1364 int i;
1365
1366 for (i = 0; i != corrections_count; i++)
1367 if (u->code == corrections[i].code)
1368 return &corrections[i];
1369 return u;
1370 }
1371
nfkdi_equal(void * l,void * r)1372 static int nfkdi_equal(void *l, void *r)
1373 {
1374 struct unicode_data *left = l;
1375 struct unicode_data *right = r;
1376
1377 if (left->gen != right->gen)
1378 return 0;
1379 if (left->ccc != right->ccc)
1380 return 0;
1381 if (left->utf8nfkdi && right->utf8nfkdi &&
1382 strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1383 return 1;
1384 if (left->utf8nfkdi || right->utf8nfkdi)
1385 return 0;
1386 return 1;
1387 }
1388
nfkdicf_equal(void * l,void * r)1389 static int nfkdicf_equal(void *l, void *r)
1390 {
1391 struct unicode_data *left = l;
1392 struct unicode_data *right = r;
1393
1394 if (left->gen != right->gen)
1395 return 0;
1396 if (left->ccc != right->ccc)
1397 return 0;
1398 if (left->utf8nfkdicf && right->utf8nfkdicf &&
1399 strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0)
1400 return 1;
1401 if (left->utf8nfkdicf && right->utf8nfkdicf)
1402 return 0;
1403 if (left->utf8nfkdicf || right->utf8nfkdicf)
1404 return 0;
1405 if (left->utf8nfkdi && right->utf8nfkdi &&
1406 strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1407 return 1;
1408 if (left->utf8nfkdi || right->utf8nfkdi)
1409 return 0;
1410 return 1;
1411 }
1412
nfkdi_print(void * l,int indent)1413 static void nfkdi_print(void *l, int indent)
1414 {
1415 struct unicode_data *leaf = l;
1416
1417 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1418 leaf->code, leaf->ccc, leaf->gen);
1419 if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1420 printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1421 else if (leaf->utf8nfkdi)
1422 printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1423 printf("\n");
1424 }
1425
nfkdicf_print(void * l,int indent)1426 static void nfkdicf_print(void *l, int indent)
1427 {
1428 struct unicode_data *leaf = l;
1429
1430 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1431 leaf->code, leaf->ccc, leaf->gen);
1432 if (leaf->utf8nfkdicf)
1433 printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
1434 else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1435 printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1436 else if (leaf->utf8nfkdi)
1437 printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1438 printf("\n");
1439 }
1440
nfkdi_mark(void * l)1441 static int nfkdi_mark(void *l)
1442 {
1443 return 1;
1444 }
1445
nfkdicf_mark(void * l)1446 static int nfkdicf_mark(void *l)
1447 {
1448 struct unicode_data *leaf = l;
1449
1450 if (leaf->utf8nfkdicf)
1451 return 1;
1452 return 0;
1453 }
1454
correction_mark(void * l)1455 static int correction_mark(void *l)
1456 {
1457 struct unicode_data *leaf = l;
1458
1459 return leaf->correction;
1460 }
1461
nfkdi_size(void * l)1462 static int nfkdi_size(void *l)
1463 {
1464 struct unicode_data *leaf = l;
1465
1466 int size = 2;
1467 if (HANGUL_SYLLABLE(leaf->code))
1468 size += 1;
1469 else if (leaf->utf8nfkdi)
1470 size += strlen(leaf->utf8nfkdi) + 1;
1471 return size;
1472 }
1473
nfkdicf_size(void * l)1474 static int nfkdicf_size(void *l)
1475 {
1476 struct unicode_data *leaf = l;
1477
1478 int size = 2;
1479 if (HANGUL_SYLLABLE(leaf->code))
1480 size += 1;
1481 else if (leaf->utf8nfkdicf)
1482 size += strlen(leaf->utf8nfkdicf) + 1;
1483 else if (leaf->utf8nfkdi)
1484 size += strlen(leaf->utf8nfkdi) + 1;
1485 return size;
1486 }
1487
nfkdi_index(struct tree * tree,void * l)1488 static int *nfkdi_index(struct tree *tree, void *l)
1489 {
1490 struct unicode_data *leaf = l;
1491
1492 return &tree->leafindex[leaf->code];
1493 }
1494
nfkdicf_index(struct tree * tree,void * l)1495 static int *nfkdicf_index(struct tree *tree, void *l)
1496 {
1497 struct unicode_data *leaf = l;
1498
1499 return &tree->leafindex[leaf->code];
1500 }
1501
nfkdi_emit(void * l,unsigned char * data)1502 static unsigned char *nfkdi_emit(void *l, unsigned char *data)
1503 {
1504 struct unicode_data *leaf = l;
1505 unsigned char *s;
1506
1507 *data++ = leaf->gen;
1508 if (HANGUL_SYLLABLE(leaf->code)) {
1509 *data++ = DECOMPOSE;
1510 *data++ = HANGUL;
1511 } else if (leaf->utf8nfkdi) {
1512 *data++ = DECOMPOSE;
1513 s = (unsigned char*)leaf->utf8nfkdi;
1514 while ((*data++ = *s++) != 0)
1515 ;
1516 } else {
1517 *data++ = leaf->ccc;
1518 }
1519 return data;
1520 }
1521
nfkdicf_emit(void * l,unsigned char * data)1522 static unsigned char *nfkdicf_emit(void *l, unsigned char *data)
1523 {
1524 struct unicode_data *leaf = l;
1525 unsigned char *s;
1526
1527 *data++ = leaf->gen;
1528 if (HANGUL_SYLLABLE(leaf->code)) {
1529 *data++ = DECOMPOSE;
1530 *data++ = HANGUL;
1531 } else if (leaf->utf8nfkdicf) {
1532 *data++ = DECOMPOSE;
1533 s = (unsigned char*)leaf->utf8nfkdicf;
1534 while ((*data++ = *s++) != 0)
1535 ;
1536 } else if (leaf->utf8nfkdi) {
1537 *data++ = DECOMPOSE;
1538 s = (unsigned char*)leaf->utf8nfkdi;
1539 while ((*data++ = *s++) != 0)
1540 ;
1541 } else {
1542 *data++ = leaf->ccc;
1543 }
1544 return data;
1545 }
1546
utf8_create(struct unicode_data * data)1547 static void utf8_create(struct unicode_data *data)
1548 {
1549 char utf[18*4+1];
1550 char *u;
1551 unsigned int *um;
1552 int i;
1553
1554 if (data->utf8nfkdi) {
1555 assert(data->utf8nfkdi[0] == HANGUL);
1556 return;
1557 }
1558
1559 u = utf;
1560 um = data->utf32nfkdi;
1561 if (um) {
1562 for (i = 0; um[i]; i++)
1563 u += utf8encode(u, um[i]);
1564 *u = '\0';
1565 data->utf8nfkdi = strdup(utf);
1566 }
1567 u = utf;
1568 um = data->utf32nfkdicf;
1569 if (um) {
1570 for (i = 0; um[i]; i++)
1571 u += utf8encode(u, um[i]);
1572 *u = '\0';
1573 if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf))
1574 data->utf8nfkdicf = strdup(utf);
1575 }
1576 }
1577
utf8_init(void)1578 static void utf8_init(void)
1579 {
1580 unsigned int unichar;
1581 int i;
1582
1583 for (unichar = 0; unichar != 0x110000; unichar++)
1584 utf8_create(&unicode_data[unichar]);
1585
1586 for (i = 0; i != corrections_count; i++)
1587 utf8_create(&corrections[i]);
1588 }
1589
trees_init(void)1590 static void trees_init(void)
1591 {
1592 struct unicode_data *data;
1593 unsigned int maxage;
1594 unsigned int nextage;
1595 int count;
1596 int i;
1597 int j;
1598
1599 /* Count the number of different ages. */
1600 count = 0;
1601 nextage = (unsigned int)-1;
1602 do {
1603 maxage = nextage;
1604 nextage = 0;
1605 for (i = 0; i <= corrections_count; i++) {
1606 data = &corrections[i];
1607 if (nextage < data->correction &&
1608 data->correction < maxage)
1609 nextage = data->correction;
1610 }
1611 count++;
1612 } while (nextage);
1613
1614 /* Two trees per age: nfkdi and nfkdicf */
1615 trees_count = count * 2;
1616 trees = calloc(trees_count, sizeof(struct tree));
1617
1618 /* Assign ages to the trees. */
1619 count = trees_count;
1620 nextage = (unsigned int)-1;
1621 do {
1622 maxage = nextage;
1623 trees[--count].maxage = maxage;
1624 trees[--count].maxage = maxage;
1625 nextage = 0;
1626 for (i = 0; i <= corrections_count; i++) {
1627 data = &corrections[i];
1628 if (nextage < data->correction &&
1629 data->correction < maxage)
1630 nextage = data->correction;
1631 }
1632 } while (nextage);
1633
1634 /* The ages assigned above are off by one. */
1635 for (i = 0; i != trees_count; i++) {
1636 j = 0;
1637 while (ages[j] < trees[i].maxage)
1638 j++;
1639 trees[i].maxage = ages[j-1];
1640 }
1641
1642 /* Set up the forwarding between trees. */
1643 trees[trees_count-2].next = &trees[trees_count-1];
1644 trees[trees_count-1].leaf_mark = nfkdi_mark;
1645 trees[trees_count-2].leaf_mark = nfkdicf_mark;
1646 for (i = 0; i != trees_count-2; i += 2) {
1647 trees[i].next = &trees[trees_count-2];
1648 trees[i].leaf_mark = correction_mark;
1649 trees[i+1].next = &trees[trees_count-1];
1650 trees[i+1].leaf_mark = correction_mark;
1651 }
1652
1653 /* Assign the callouts. */
1654 for (i = 0; i != trees_count; i += 2) {
1655 trees[i].type = "nfkdicf";
1656 trees[i].leaf_equal = nfkdicf_equal;
1657 trees[i].leaf_print = nfkdicf_print;
1658 trees[i].leaf_size = nfkdicf_size;
1659 trees[i].leaf_index = nfkdicf_index;
1660 trees[i].leaf_emit = nfkdicf_emit;
1661
1662 trees[i+1].type = "nfkdi";
1663 trees[i+1].leaf_equal = nfkdi_equal;
1664 trees[i+1].leaf_print = nfkdi_print;
1665 trees[i+1].leaf_size = nfkdi_size;
1666 trees[i+1].leaf_index = nfkdi_index;
1667 trees[i+1].leaf_emit = nfkdi_emit;
1668 }
1669
1670 /* Finish init. */
1671 for (i = 0; i != trees_count; i++)
1672 trees[i].childnode = NODE;
1673 }
1674
trees_populate(void)1675 static void trees_populate(void)
1676 {
1677 struct unicode_data *data;
1678 unsigned int unichar;
1679 char keyval[4];
1680 int keylen;
1681 int i;
1682
1683 for (i = 0; i != trees_count; i++) {
1684 if (verbose > 0) {
1685 printf("Populating %s_%x\n",
1686 trees[i].type, trees[i].maxage);
1687 }
1688 for (unichar = 0; unichar != 0x110000; unichar++) {
1689 if (unicode_data[unichar].gen < 0)
1690 continue;
1691 keylen = utf8encode(keyval, unichar);
1692 data = corrections_lookup(&unicode_data[unichar]);
1693 if (data->correction <= trees[i].maxage)
1694 data = &unicode_data[unichar];
1695 insert(&trees[i], keyval, keylen, data);
1696 }
1697 }
1698 }
1699
trees_reduce(void)1700 static void trees_reduce(void)
1701 {
1702 int i;
1703 int size;
1704 int changed;
1705
1706 for (i = 0; i != trees_count; i++)
1707 prune(&trees[i]);
1708 for (i = 0; i != trees_count; i++)
1709 mark_nodes(&trees[i]);
1710 do {
1711 size = 0;
1712 for (i = 0; i != trees_count; i++)
1713 size = index_nodes(&trees[i], size);
1714 changed = 0;
1715 for (i = 0; i != trees_count; i++)
1716 changed += size_nodes(&trees[i]);
1717 } while (changed);
1718
1719 utf8data = calloc(size, 1);
1720 utf8data_size = size;
1721 for (i = 0; i != trees_count; i++)
1722 emit(&trees[i], utf8data);
1723
1724 if (verbose > 0) {
1725 for (i = 0; i != trees_count; i++) {
1726 printf("%s_%x idx %d\n",
1727 trees[i].type, trees[i].maxage, trees[i].index);
1728 }
1729 }
1730
1731 nfkdi = utf8data + trees[trees_count-1].index;
1732 nfkdicf = utf8data + trees[trees_count-2].index;
1733
1734 nfkdi_tree = &trees[trees_count-1];
1735 nfkdicf_tree = &trees[trees_count-2];
1736 }
1737
verify(struct tree * tree)1738 static void verify(struct tree *tree)
1739 {
1740 struct unicode_data *data;
1741 utf8leaf_t *leaf;
1742 unsigned int unichar;
1743 char key[4];
1744 unsigned char hangul[UTF8HANGULLEAF];
1745 int report;
1746 int nocf;
1747
1748 if (verbose > 0)
1749 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1750 nocf = strcmp(tree->type, "nfkdicf");
1751
1752 for (unichar = 0; unichar != 0x110000; unichar++) {
1753 report = 0;
1754 data = corrections_lookup(&unicode_data[unichar]);
1755 if (data->correction <= tree->maxage)
1756 data = &unicode_data[unichar];
1757 utf8encode(key,unichar);
1758 leaf = utf8lookup(tree, hangul, key);
1759
1760 if (!leaf) {
1761 if (data->gen != -1)
1762 report++;
1763 if (unichar < 0xd800 || unichar > 0xdfff)
1764 report++;
1765 } else {
1766 if (unichar >= 0xd800 && unichar <= 0xdfff)
1767 report++;
1768 if (data->gen == -1)
1769 report++;
1770 if (data->gen != LEAF_GEN(leaf))
1771 report++;
1772 if (LEAF_CCC(leaf) == DECOMPOSE) {
1773 if (HANGUL_SYLLABLE(data->code)) {
1774 if (data->utf8nfkdi[0] != HANGUL)
1775 report++;
1776 } else if (nocf) {
1777 if (!data->utf8nfkdi) {
1778 report++;
1779 } else if (strcmp(data->utf8nfkdi,
1780 LEAF_STR(leaf))) {
1781 report++;
1782 }
1783 } else {
1784 if (!data->utf8nfkdicf &&
1785 !data->utf8nfkdi) {
1786 report++;
1787 } else if (data->utf8nfkdicf) {
1788 if (strcmp(data->utf8nfkdicf,
1789 LEAF_STR(leaf)))
1790 report++;
1791 } else if (strcmp(data->utf8nfkdi,
1792 LEAF_STR(leaf))) {
1793 report++;
1794 }
1795 }
1796 } else if (data->ccc != LEAF_CCC(leaf)) {
1797 report++;
1798 }
1799 }
1800 if (report) {
1801 printf("%X code %X gen %d ccc %d"
1802 " nfkdi -> \"%s\"",
1803 unichar, data->code, data->gen,
1804 data->ccc,
1805 data->utf8nfkdi);
1806 if (leaf) {
1807 printf(" gen %d ccc %d"
1808 " nfkdi -> \"%s\"",
1809 LEAF_GEN(leaf),
1810 LEAF_CCC(leaf),
1811 LEAF_CCC(leaf) == DECOMPOSE ?
1812 LEAF_STR(leaf) : "");
1813 }
1814 printf("\n");
1815 }
1816 }
1817 }
1818
trees_verify(void)1819 static void trees_verify(void)
1820 {
1821 int i;
1822
1823 for (i = 0; i != trees_count; i++)
1824 verify(&trees[i]);
1825 }
1826
1827 /* ------------------------------------------------------------------ */
1828
help(void)1829 static void help(void)
1830 {
1831 printf("Usage: %s [options]\n", argv0);
1832 printf("\n");
1833 printf("This program creates an a data trie used for parsing and\n");
1834 printf("normalization of UTF-8 strings. The trie is derived from\n");
1835 printf("a set of input files from the Unicode character database\n");
1836 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1837 printf("\n");
1838 printf("The generated tree supports two normalization forms:\n");
1839 printf("\n");
1840 printf("\tnfkdi:\n");
1841 printf("\t- Apply unicode normalization form NFKD.\n");
1842 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1843 printf("\n");
1844 printf("\tnfkdicf:\n");
1845 printf("\t- Apply unicode normalization form NFKD.\n");
1846 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1847 printf("\t- Apply a full casefold (C + F).\n");
1848 printf("\n");
1849 printf("These forms were chosen as being most useful when dealing\n");
1850 printf("with file names: NFKD catches most cases where characters\n");
1851 printf("should be considered equivalent. The ignorables are mostly\n");
1852 printf("invisible, making names hard to type.\n");
1853 printf("\n");
1854 printf("The options to specify the files to be used are listed\n");
1855 printf("below with their default values, which are the names used\n");
1856 printf("by version 11.0.0 of the Unicode Character Database.\n");
1857 printf("\n");
1858 printf("The input files:\n");
1859 printf("\t-a %s\n", AGE_NAME);
1860 printf("\t-c %s\n", CCC_NAME);
1861 printf("\t-p %s\n", PROP_NAME);
1862 printf("\t-d %s\n", DATA_NAME);
1863 printf("\t-f %s\n", FOLD_NAME);
1864 printf("\t-n %s\n", NORM_NAME);
1865 printf("\n");
1866 printf("Additionally, the generated tables are tested using:\n");
1867 printf("\t-t %s\n", TEST_NAME);
1868 printf("\n");
1869 printf("Finally, the output file:\n");
1870 printf("\t-o %s\n", UTF8_NAME);
1871 printf("\n");
1872 }
1873
usage(void)1874 static void usage(void)
1875 {
1876 help();
1877 exit(1);
1878 }
1879
open_fail(const char * name,int error)1880 static void open_fail(const char *name, int error)
1881 {
1882 printf("Error %d opening %s: %s\n", error, name, strerror(error));
1883 exit(1);
1884 }
1885
file_fail(const char * filename)1886 static void file_fail(const char *filename)
1887 {
1888 printf("Error parsing %s\n", filename);
1889 exit(1);
1890 }
1891
line_fail(const char * filename,const char * line)1892 static void line_fail(const char *filename, const char *line)
1893 {
1894 printf("Error parsing %s:%s\n", filename, line);
1895 exit(1);
1896 }
1897
1898 /* ------------------------------------------------------------------ */
1899
print_utf32(unsigned int * utf32str)1900 static void print_utf32(unsigned int *utf32str)
1901 {
1902 int i;
1903
1904 for (i = 0; utf32str[i]; i++)
1905 printf(" %X", utf32str[i]);
1906 }
1907
print_utf32nfkdi(unsigned int unichar)1908 static void print_utf32nfkdi(unsigned int unichar)
1909 {
1910 printf(" %X ->", unichar);
1911 print_utf32(unicode_data[unichar].utf32nfkdi);
1912 printf("\n");
1913 }
1914
print_utf32nfkdicf(unsigned int unichar)1915 static void print_utf32nfkdicf(unsigned int unichar)
1916 {
1917 printf(" %X ->", unichar);
1918 print_utf32(unicode_data[unichar].utf32nfkdicf);
1919 printf("\n");
1920 }
1921
1922 /* ------------------------------------------------------------------ */
1923
age_init(void)1924 static void age_init(void)
1925 {
1926 FILE *file;
1927 unsigned int first;
1928 unsigned int last;
1929 unsigned int unichar;
1930 unsigned int major;
1931 unsigned int minor;
1932 unsigned int revision;
1933 int gen;
1934 int count;
1935 int ret;
1936
1937 if (verbose > 0)
1938 printf("Parsing %s\n", age_name);
1939
1940 file = fopen(age_name, "r");
1941 if (!file)
1942 open_fail(age_name, errno);
1943 count = 0;
1944
1945 gen = 0;
1946 while (fgets(line, LINESIZE, file)) {
1947 ret = sscanf(line, "# Age=V%d_%d_%d",
1948 &major, &minor, &revision);
1949 if (ret == 3) {
1950 ages_count++;
1951 if (verbose > 1)
1952 printf(" Age V%d_%d_%d\n",
1953 major, minor, revision);
1954 if (!age_valid(major, minor, revision))
1955 line_fail(age_name, line);
1956 continue;
1957 }
1958 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1959 if (ret == 2) {
1960 ages_count++;
1961 if (verbose > 1)
1962 printf(" Age V%d_%d\n", major, minor);
1963 if (!age_valid(major, minor, 0))
1964 line_fail(age_name, line);
1965 continue;
1966 }
1967 }
1968
1969 /* We must have found something above. */
1970 if (verbose > 1)
1971 printf("%d age entries\n", ages_count);
1972 if (ages_count == 0 || ages_count > MAXGEN)
1973 file_fail(age_name);
1974
1975 /* There is a 0 entry. */
1976 ages_count++;
1977 ages = calloc(ages_count + 1, sizeof(*ages));
1978 /* And a guard entry. */
1979 ages[ages_count] = (unsigned int)-1;
1980
1981 rewind(file);
1982 count = 0;
1983 gen = 0;
1984 while (fgets(line, LINESIZE, file)) {
1985 ret = sscanf(line, "# Age=V%d_%d_%d",
1986 &major, &minor, &revision);
1987 if (ret == 3) {
1988 ages[++gen] =
1989 UNICODE_AGE(major, minor, revision);
1990 if (verbose > 1)
1991 printf(" Age V%d_%d_%d = gen %d\n",
1992 major, minor, revision, gen);
1993 if (!age_valid(major, minor, revision))
1994 line_fail(age_name, line);
1995 continue;
1996 }
1997 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1998 if (ret == 2) {
1999 ages[++gen] = UNICODE_AGE(major, minor, 0);
2000 if (verbose > 1)
2001 printf(" Age V%d_%d = %d\n",
2002 major, minor, gen);
2003 if (!age_valid(major, minor, 0))
2004 line_fail(age_name, line);
2005 continue;
2006 }
2007 ret = sscanf(line, "%X..%X ; %d.%d #",
2008 &first, &last, &major, &minor);
2009 if (ret == 4) {
2010 for (unichar = first; unichar <= last; unichar++)
2011 unicode_data[unichar].gen = gen;
2012 count += 1 + last - first;
2013 if (verbose > 1)
2014 printf(" %X..%X gen %d\n", first, last, gen);
2015 if (!utf32valid(first) || !utf32valid(last))
2016 line_fail(age_name, line);
2017 continue;
2018 }
2019 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2020 if (ret == 3) {
2021 unicode_data[unichar].gen = gen;
2022 count++;
2023 if (verbose > 1)
2024 printf(" %X gen %d\n", unichar, gen);
2025 if (!utf32valid(unichar))
2026 line_fail(age_name, line);
2027 continue;
2028 }
2029 }
2030 unicode_maxage = ages[gen];
2031 fclose(file);
2032
2033 /* Nix surrogate block */
2034 if (verbose > 1)
2035 printf(" Removing surrogate block D800..DFFF\n");
2036 for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2037 unicode_data[unichar].gen = -1;
2038
2039 if (verbose > 0)
2040 printf("Found %d entries\n", count);
2041 if (count == 0)
2042 file_fail(age_name);
2043 }
2044
ccc_init(void)2045 static void ccc_init(void)
2046 {
2047 FILE *file;
2048 unsigned int first;
2049 unsigned int last;
2050 unsigned int unichar;
2051 unsigned int value;
2052 int count;
2053 int ret;
2054
2055 if (verbose > 0)
2056 printf("Parsing %s\n", ccc_name);
2057
2058 file = fopen(ccc_name, "r");
2059 if (!file)
2060 open_fail(ccc_name, errno);
2061
2062 count = 0;
2063 while (fgets(line, LINESIZE, file)) {
2064 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2065 if (ret == 3) {
2066 for (unichar = first; unichar <= last; unichar++) {
2067 unicode_data[unichar].ccc = value;
2068 count++;
2069 }
2070 if (verbose > 1)
2071 printf(" %X..%X ccc %d\n", first, last, value);
2072 if (!utf32valid(first) || !utf32valid(last))
2073 line_fail(ccc_name, line);
2074 continue;
2075 }
2076 ret = sscanf(line, "%X ; %d #", &unichar, &value);
2077 if (ret == 2) {
2078 unicode_data[unichar].ccc = value;
2079 count++;
2080 if (verbose > 1)
2081 printf(" %X ccc %d\n", unichar, value);
2082 if (!utf32valid(unichar))
2083 line_fail(ccc_name, line);
2084 continue;
2085 }
2086 }
2087 fclose(file);
2088
2089 if (verbose > 0)
2090 printf("Found %d entries\n", count);
2091 if (count == 0)
2092 file_fail(ccc_name);
2093 }
2094
nfkdi_init(void)2095 static void nfkdi_init(void)
2096 {
2097 FILE *file;
2098 unsigned int unichar;
2099 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2100 char *s;
2101 unsigned int *um;
2102 int count;
2103 int i;
2104 int ret;
2105
2106 if (verbose > 0)
2107 printf("Parsing %s\n", data_name);
2108 file = fopen(data_name, "r");
2109 if (!file)
2110 open_fail(data_name, errno);
2111
2112 count = 0;
2113 while (fgets(line, LINESIZE, file)) {
2114 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2115 &unichar, buf0);
2116 if (ret != 2)
2117 continue;
2118 if (!utf32valid(unichar))
2119 line_fail(data_name, line);
2120
2121 s = buf0;
2122 /* skip over <tag> */
2123 if (*s == '<')
2124 while (*s++ != ' ')
2125 ;
2126 /* decode the decomposition into UTF-32 */
2127 i = 0;
2128 while (*s) {
2129 mapping[i] = strtoul(s, &s, 16);
2130 if (!utf32valid(mapping[i]))
2131 line_fail(data_name, line);
2132 i++;
2133 }
2134 mapping[i++] = 0;
2135
2136 um = malloc(i * sizeof(unsigned int));
2137 memcpy(um, mapping, i * sizeof(unsigned int));
2138 unicode_data[unichar].utf32nfkdi = um;
2139
2140 if (verbose > 1)
2141 print_utf32nfkdi(unichar);
2142 count++;
2143 }
2144 fclose(file);
2145 if (verbose > 0)
2146 printf("Found %d entries\n", count);
2147 if (count == 0)
2148 file_fail(data_name);
2149 }
2150
nfkdicf_init(void)2151 static void nfkdicf_init(void)
2152 {
2153 FILE *file;
2154 unsigned int unichar;
2155 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2156 char status;
2157 char *s;
2158 unsigned int *um;
2159 int i;
2160 int count;
2161 int ret;
2162
2163 if (verbose > 0)
2164 printf("Parsing %s\n", fold_name);
2165 file = fopen(fold_name, "r");
2166 if (!file)
2167 open_fail(fold_name, errno);
2168
2169 count = 0;
2170 while (fgets(line, LINESIZE, file)) {
2171 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2172 if (ret != 3)
2173 continue;
2174 if (!utf32valid(unichar))
2175 line_fail(fold_name, line);
2176 /* Use the C+F casefold. */
2177 if (status != 'C' && status != 'F')
2178 continue;
2179 s = buf0;
2180 if (*s == '<')
2181 while (*s++ != ' ')
2182 ;
2183 i = 0;
2184 while (*s) {
2185 mapping[i] = strtoul(s, &s, 16);
2186 if (!utf32valid(mapping[i]))
2187 line_fail(fold_name, line);
2188 i++;
2189 }
2190 mapping[i++] = 0;
2191
2192 um = malloc(i * sizeof(unsigned int));
2193 memcpy(um, mapping, i * sizeof(unsigned int));
2194 unicode_data[unichar].utf32nfkdicf = um;
2195
2196 if (verbose > 1)
2197 print_utf32nfkdicf(unichar);
2198 count++;
2199 }
2200 fclose(file);
2201 if (verbose > 0)
2202 printf("Found %d entries\n", count);
2203 if (count == 0)
2204 file_fail(fold_name);
2205 }
2206
ignore_init(void)2207 static void ignore_init(void)
2208 {
2209 FILE *file;
2210 unsigned int unichar;
2211 unsigned int first;
2212 unsigned int last;
2213 unsigned int *um;
2214 int count;
2215 int ret;
2216
2217 if (verbose > 0)
2218 printf("Parsing %s\n", prop_name);
2219 file = fopen(prop_name, "r");
2220 if (!file)
2221 open_fail(prop_name, errno);
2222 assert(file);
2223 count = 0;
2224 while (fgets(line, LINESIZE, file)) {
2225 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2226 if (ret == 3) {
2227 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2228 continue;
2229 if (!utf32valid(first) || !utf32valid(last))
2230 line_fail(prop_name, line);
2231 for (unichar = first; unichar <= last; unichar++) {
2232 free(unicode_data[unichar].utf32nfkdi);
2233 um = malloc(sizeof(unsigned int));
2234 *um = 0;
2235 unicode_data[unichar].utf32nfkdi = um;
2236 free(unicode_data[unichar].utf32nfkdicf);
2237 um = malloc(sizeof(unsigned int));
2238 *um = 0;
2239 unicode_data[unichar].utf32nfkdicf = um;
2240 count++;
2241 }
2242 if (verbose > 1)
2243 printf(" %X..%X Default_Ignorable_Code_Point\n",
2244 first, last);
2245 continue;
2246 }
2247 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2248 if (ret == 2) {
2249 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2250 continue;
2251 if (!utf32valid(unichar))
2252 line_fail(prop_name, line);
2253 free(unicode_data[unichar].utf32nfkdi);
2254 um = malloc(sizeof(unsigned int));
2255 *um = 0;
2256 unicode_data[unichar].utf32nfkdi = um;
2257 free(unicode_data[unichar].utf32nfkdicf);
2258 um = malloc(sizeof(unsigned int));
2259 *um = 0;
2260 unicode_data[unichar].utf32nfkdicf = um;
2261 if (verbose > 1)
2262 printf(" %X Default_Ignorable_Code_Point\n",
2263 unichar);
2264 count++;
2265 continue;
2266 }
2267 }
2268 fclose(file);
2269
2270 if (verbose > 0)
2271 printf("Found %d entries\n", count);
2272 if (count == 0)
2273 file_fail(prop_name);
2274 }
2275
corrections_init(void)2276 static void corrections_init(void)
2277 {
2278 FILE *file;
2279 unsigned int unichar;
2280 unsigned int major;
2281 unsigned int minor;
2282 unsigned int revision;
2283 unsigned int age;
2284 unsigned int *um;
2285 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2286 char *s;
2287 int i;
2288 int count;
2289 int ret;
2290
2291 if (verbose > 0)
2292 printf("Parsing %s\n", norm_name);
2293 file = fopen(norm_name, "r");
2294 if (!file)
2295 open_fail(norm_name, errno);
2296
2297 count = 0;
2298 while (fgets(line, LINESIZE, file)) {
2299 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2300 &unichar, buf0, buf1,
2301 &major, &minor, &revision);
2302 if (ret != 6)
2303 continue;
2304 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2305 line_fail(norm_name, line);
2306 count++;
2307 }
2308 corrections = calloc(count, sizeof(struct unicode_data));
2309 corrections_count = count;
2310 rewind(file);
2311
2312 count = 0;
2313 while (fgets(line, LINESIZE, file)) {
2314 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2315 &unichar, buf0, buf1,
2316 &major, &minor, &revision);
2317 if (ret != 6)
2318 continue;
2319 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2320 line_fail(norm_name, line);
2321 corrections[count] = unicode_data[unichar];
2322 assert(corrections[count].code == unichar);
2323 age = UNICODE_AGE(major, minor, revision);
2324 corrections[count].correction = age;
2325
2326 i = 0;
2327 s = buf0;
2328 while (*s) {
2329 mapping[i] = strtoul(s, &s, 16);
2330 if (!utf32valid(mapping[i]))
2331 line_fail(norm_name, line);
2332 i++;
2333 }
2334 mapping[i++] = 0;
2335
2336 um = malloc(i * sizeof(unsigned int));
2337 memcpy(um, mapping, i * sizeof(unsigned int));
2338 corrections[count].utf32nfkdi = um;
2339
2340 if (verbose > 1)
2341 printf(" %X -> %s -> %s V%d_%d_%d\n",
2342 unichar, buf0, buf1, major, minor, revision);
2343 count++;
2344 }
2345 fclose(file);
2346
2347 if (verbose > 0)
2348 printf("Found %d entries\n", count);
2349 if (count == 0)
2350 file_fail(norm_name);
2351 }
2352
2353 /* ------------------------------------------------------------------ */
2354
2355 /*
2356 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2357 *
2358 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2359 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2360 *
2361 * SBase = 0xAC00
2362 * LBase = 0x1100
2363 * VBase = 0x1161
2364 * TBase = 0x11A7
2365 * LCount = 19
2366 * VCount = 21
2367 * TCount = 28
2368 * NCount = 588 (VCount * TCount)
2369 * SCount = 11172 (LCount * NCount)
2370 *
2371 * Decomposition:
2372 * SIndex = s - SBase
2373 *
2374 * LV (Canonical/Full)
2375 * LIndex = SIndex / NCount
2376 * VIndex = (Sindex % NCount) / TCount
2377 * LPart = LBase + LIndex
2378 * VPart = VBase + VIndex
2379 *
2380 * LVT (Canonical)
2381 * LVIndex = (SIndex / TCount) * TCount
2382 * TIndex = (Sindex % TCount)
2383 * LVPart = SBase + LVIndex
2384 * TPart = TBase + TIndex
2385 *
2386 * LVT (Full)
2387 * LIndex = SIndex / NCount
2388 * VIndex = (Sindex % NCount) / TCount
2389 * TIndex = (Sindex % TCount)
2390 * LPart = LBase + LIndex
2391 * VPart = VBase + VIndex
2392 * if (TIndex == 0) {
2393 * d = <LPart, VPart>
2394 * } else {
2395 * TPart = TBase + TIndex
2396 * d = <LPart, VPart, TPart>
2397 * }
2398 *
2399 */
2400
hangul_decompose(void)2401 static void hangul_decompose(void)
2402 {
2403 unsigned int sb = 0xAC00;
2404 unsigned int lb = 0x1100;
2405 unsigned int vb = 0x1161;
2406 unsigned int tb = 0x11a7;
2407 /* unsigned int lc = 19; */
2408 unsigned int vc = 21;
2409 unsigned int tc = 28;
2410 unsigned int nc = (vc * tc);
2411 /* unsigned int sc = (lc * nc); */
2412 unsigned int unichar;
2413 unsigned int mapping[4];
2414 unsigned int *um;
2415 int count;
2416 int i;
2417
2418 if (verbose > 0)
2419 printf("Decomposing hangul\n");
2420 /* Hangul */
2421 count = 0;
2422 for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2423 unsigned int si = unichar - sb;
2424 unsigned int li = si / nc;
2425 unsigned int vi = (si % nc) / tc;
2426 unsigned int ti = si % tc;
2427
2428 i = 0;
2429 mapping[i++] = lb + li;
2430 mapping[i++] = vb + vi;
2431 if (ti)
2432 mapping[i++] = tb + ti;
2433 mapping[i++] = 0;
2434
2435 assert(!unicode_data[unichar].utf32nfkdi);
2436 um = malloc(i * sizeof(unsigned int));
2437 memcpy(um, mapping, i * sizeof(unsigned int));
2438 unicode_data[unichar].utf32nfkdi = um;
2439
2440 assert(!unicode_data[unichar].utf32nfkdicf);
2441 um = malloc(i * sizeof(unsigned int));
2442 memcpy(um, mapping, i * sizeof(unsigned int));
2443 unicode_data[unichar].utf32nfkdicf = um;
2444
2445 /*
2446 * Add a cookie as a reminder that the hangul syllable
2447 * decompositions must not be stored in the generated
2448 * trie.
2449 */
2450 unicode_data[unichar].utf8nfkdi = malloc(2);
2451 unicode_data[unichar].utf8nfkdi[0] = HANGUL;
2452 unicode_data[unichar].utf8nfkdi[1] = '\0';
2453
2454 if (verbose > 1)
2455 print_utf32nfkdi(unichar);
2456
2457 count++;
2458 }
2459 if (verbose > 0)
2460 printf("Created %d entries\n", count);
2461 }
2462
nfkdi_decompose(void)2463 static void nfkdi_decompose(void)
2464 {
2465 unsigned int unichar;
2466 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2467 unsigned int *um;
2468 unsigned int *dc;
2469 int count;
2470 int i;
2471 int j;
2472 int ret;
2473
2474 if (verbose > 0)
2475 printf("Decomposing nfkdi\n");
2476
2477 count = 0;
2478 for (unichar = 0; unichar != 0x110000; unichar++) {
2479 if (!unicode_data[unichar].utf32nfkdi)
2480 continue;
2481 for (;;) {
2482 ret = 1;
2483 i = 0;
2484 um = unicode_data[unichar].utf32nfkdi;
2485 while (*um) {
2486 dc = unicode_data[*um].utf32nfkdi;
2487 if (dc) {
2488 for (j = 0; dc[j]; j++)
2489 mapping[i++] = dc[j];
2490 ret = 0;
2491 } else {
2492 mapping[i++] = *um;
2493 }
2494 um++;
2495 }
2496 mapping[i++] = 0;
2497 if (ret)
2498 break;
2499 free(unicode_data[unichar].utf32nfkdi);
2500 um = malloc(i * sizeof(unsigned int));
2501 memcpy(um, mapping, i * sizeof(unsigned int));
2502 unicode_data[unichar].utf32nfkdi = um;
2503 }
2504 /* Add this decomposition to nfkdicf if there is no entry. */
2505 if (!unicode_data[unichar].utf32nfkdicf) {
2506 um = malloc(i * sizeof(unsigned int));
2507 memcpy(um, mapping, i * sizeof(unsigned int));
2508 unicode_data[unichar].utf32nfkdicf = um;
2509 }
2510 if (verbose > 1)
2511 print_utf32nfkdi(unichar);
2512 count++;
2513 }
2514 if (verbose > 0)
2515 printf("Processed %d entries\n", count);
2516 }
2517
nfkdicf_decompose(void)2518 static void nfkdicf_decompose(void)
2519 {
2520 unsigned int unichar;
2521 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2522 unsigned int *um;
2523 unsigned int *dc;
2524 int count;
2525 int i;
2526 int j;
2527 int ret;
2528
2529 if (verbose > 0)
2530 printf("Decomposing nfkdicf\n");
2531 count = 0;
2532 for (unichar = 0; unichar != 0x110000; unichar++) {
2533 if (!unicode_data[unichar].utf32nfkdicf)
2534 continue;
2535 for (;;) {
2536 ret = 1;
2537 i = 0;
2538 um = unicode_data[unichar].utf32nfkdicf;
2539 while (*um) {
2540 dc = unicode_data[*um].utf32nfkdicf;
2541 if (dc) {
2542 for (j = 0; dc[j]; j++)
2543 mapping[i++] = dc[j];
2544 ret = 0;
2545 } else {
2546 mapping[i++] = *um;
2547 }
2548 um++;
2549 }
2550 mapping[i++] = 0;
2551 if (ret)
2552 break;
2553 free(unicode_data[unichar].utf32nfkdicf);
2554 um = malloc(i * sizeof(unsigned int));
2555 memcpy(um, mapping, i * sizeof(unsigned int));
2556 unicode_data[unichar].utf32nfkdicf = um;
2557 }
2558 if (verbose > 1)
2559 print_utf32nfkdicf(unichar);
2560 count++;
2561 }
2562 if (verbose > 0)
2563 printf("Processed %d entries\n", count);
2564 }
2565
2566 /* ------------------------------------------------------------------ */
2567
2568 int utf8agemax(struct tree *, const char *);
2569 int utf8nagemax(struct tree *, const char *, size_t);
2570 int utf8agemin(struct tree *, const char *);
2571 int utf8nagemin(struct tree *, const char *, size_t);
2572 ssize_t utf8len(struct tree *, const char *);
2573 ssize_t utf8nlen(struct tree *, const char *, size_t);
2574 struct utf8cursor;
2575 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2576 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2577 int utf8byte(struct utf8cursor *);
2578
2579 /*
2580 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2581 *
2582 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2583 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2584 *
2585 * SBase = 0xAC00
2586 * LBase = 0x1100
2587 * VBase = 0x1161
2588 * TBase = 0x11A7
2589 * LCount = 19
2590 * VCount = 21
2591 * TCount = 28
2592 * NCount = 588 (VCount * TCount)
2593 * SCount = 11172 (LCount * NCount)
2594 *
2595 * Decomposition:
2596 * SIndex = s - SBase
2597 *
2598 * LV (Canonical/Full)
2599 * LIndex = SIndex / NCount
2600 * VIndex = (Sindex % NCount) / TCount
2601 * LPart = LBase + LIndex
2602 * VPart = VBase + VIndex
2603 *
2604 * LVT (Canonical)
2605 * LVIndex = (SIndex / TCount) * TCount
2606 * TIndex = (Sindex % TCount)
2607 * LVPart = SBase + LVIndex
2608 * TPart = TBase + TIndex
2609 *
2610 * LVT (Full)
2611 * LIndex = SIndex / NCount
2612 * VIndex = (Sindex % NCount) / TCount
2613 * TIndex = (Sindex % TCount)
2614 * LPart = LBase + LIndex
2615 * VPart = VBase + VIndex
2616 * if (TIndex == 0) {
2617 * d = <LPart, VPart>
2618 * } else {
2619 * TPart = TBase + TIndex
2620 * d = <LPart, VPart, TPart>
2621 * }
2622 */
2623
2624 /* Constants */
2625 #define SB (0xAC00)
2626 #define LB (0x1100)
2627 #define VB (0x1161)
2628 #define TB (0x11A7)
2629 #define LC (19)
2630 #define VC (21)
2631 #define TC (28)
2632 #define NC (VC * TC)
2633 #define SC (LC * NC)
2634
2635 /* Algorithmic decomposition of hangul syllable. */
utf8hangul(const char * str,unsigned char * hangul)2636 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2637 {
2638 unsigned int si;
2639 unsigned int li;
2640 unsigned int vi;
2641 unsigned int ti;
2642 unsigned char *h;
2643
2644 /* Calculate the SI, LI, VI, and TI values. */
2645 si = utf8decode(str) - SB;
2646 li = si / NC;
2647 vi = (si % NC) / TC;
2648 ti = si % TC;
2649
2650 /* Fill in base of leaf. */
2651 h = hangul;
2652 LEAF_GEN(h) = 2;
2653 LEAF_CCC(h) = DECOMPOSE;
2654 h += 2;
2655
2656 /* Add LPart, a 3-byte UTF-8 sequence. */
2657 h += utf8encode((char *)h, li + LB);
2658
2659 /* Add VPart, a 3-byte UTF-8 sequence. */
2660 h += utf8encode((char *)h, vi + VB);
2661
2662 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2663 if (ti)
2664 h += utf8encode((char *)h, ti + TB);
2665
2666 /* Terminate string. */
2667 h[0] = '\0';
2668
2669 return hangul;
2670 }
2671
2672 /*
2673 * Use trie to scan s, touching at most len bytes.
2674 * Returns the leaf if one exists, NULL otherwise.
2675 *
2676 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2677 * is well-formed and corresponds to a known unicode code point. The
2678 * shorthand for this will be "is valid UTF-8 unicode".
2679 */
utf8nlookup(struct tree * tree,unsigned char * hangul,const char * s,size_t len)2680 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2681 const char *s, size_t len)
2682 {
2683 utf8trie_t *trie = utf8data + tree->index;
2684 int offlen;
2685 int offset;
2686 int mask;
2687 int node;
2688
2689 if (!tree)
2690 return NULL;
2691 if (len == 0)
2692 return NULL;
2693 node = 1;
2694 while (node) {
2695 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2696 if (*trie & NEXTBYTE) {
2697 if (--len == 0)
2698 return NULL;
2699 s++;
2700 }
2701 mask = 1 << (*trie & BITNUM);
2702 if (*s & mask) {
2703 /* Right leg */
2704 if (offlen) {
2705 /* Right node at offset of trie */
2706 node = (*trie & RIGHTNODE);
2707 offset = trie[offlen];
2708 while (--offlen) {
2709 offset <<= 8;
2710 offset |= trie[offlen];
2711 }
2712 trie += offset;
2713 } else if (*trie & RIGHTPATH) {
2714 /* Right node after this node */
2715 node = (*trie & TRIENODE);
2716 trie++;
2717 } else {
2718 /* No right node. */
2719 return NULL;
2720 }
2721 } else {
2722 /* Left leg */
2723 if (offlen) {
2724 /* Left node after this node. */
2725 node = (*trie & LEFTNODE);
2726 trie += offlen + 1;
2727 } else if (*trie & RIGHTPATH) {
2728 /* No left node. */
2729 return NULL;
2730 } else {
2731 /* Left node after this node */
2732 node = (*trie & TRIENODE);
2733 trie++;
2734 }
2735 }
2736 }
2737 /*
2738 * Hangul decomposition is done algorithmically. These are the
2739 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2740 * always 3 bytes long, so s has been advanced twice, and the
2741 * start of the sequence is at s-2.
2742 */
2743 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2744 trie = utf8hangul(s - 2, hangul);
2745 return trie;
2746 }
2747
2748 /*
2749 * Use trie to scan s.
2750 * Returns the leaf if one exists, NULL otherwise.
2751 *
2752 * Forwards to trie_nlookup().
2753 */
utf8lookup(struct tree * tree,unsigned char * hangul,const char * s)2754 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2755 const char *s)
2756 {
2757 return utf8nlookup(tree, hangul, s, (size_t)-1);
2758 }
2759
2760 /*
2761 * Return the number of bytes used by the current UTF-8 sequence.
2762 * Assumes the input points to the first byte of a valid UTF-8
2763 * sequence.
2764 */
utf8clen(const char * s)2765 static inline int utf8clen(const char *s)
2766 {
2767 unsigned char c = *s;
2768 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2769 }
2770
2771 /*
2772 * Maximum age of any character in s.
2773 * Return -1 if s is not valid UTF-8 unicode.
2774 * Return 0 if only non-assigned code points are used.
2775 */
utf8agemax(struct tree * tree,const char * s)2776 int utf8agemax(struct tree *tree, const char *s)
2777 {
2778 utf8leaf_t *leaf;
2779 int age = 0;
2780 int leaf_age;
2781 unsigned char hangul[UTF8HANGULLEAF];
2782
2783 if (!tree)
2784 return -1;
2785
2786 while (*s) {
2787 leaf = utf8lookup(tree, hangul, s);
2788 if (!leaf)
2789 return -1;
2790 leaf_age = ages[LEAF_GEN(leaf)];
2791 if (leaf_age <= tree->maxage && leaf_age > age)
2792 age = leaf_age;
2793 s += utf8clen(s);
2794 }
2795 return age;
2796 }
2797
2798 /*
2799 * Minimum age of any character in s.
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if non-assigned code points are used.
2802 */
utf8agemin(struct tree * tree,const char * s)2803 int utf8agemin(struct tree *tree, const char *s)
2804 {
2805 utf8leaf_t *leaf;
2806 int age;
2807 int leaf_age;
2808 unsigned char hangul[UTF8HANGULLEAF];
2809
2810 if (!tree)
2811 return -1;
2812 age = tree->maxage;
2813 while (*s) {
2814 leaf = utf8lookup(tree, hangul, s);
2815 if (!leaf)
2816 return -1;
2817 leaf_age = ages[LEAF_GEN(leaf)];
2818 if (leaf_age <= tree->maxage && leaf_age < age)
2819 age = leaf_age;
2820 s += utf8clen(s);
2821 }
2822 return age;
2823 }
2824
2825 /*
2826 * Maximum age of any character in s, touch at most len bytes.
2827 * Return -1 if s is not valid UTF-8 unicode.
2828 */
utf8nagemax(struct tree * tree,const char * s,size_t len)2829 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2830 {
2831 utf8leaf_t *leaf;
2832 int age = 0;
2833 int leaf_age;
2834 unsigned char hangul[UTF8HANGULLEAF];
2835
2836 if (!tree)
2837 return -1;
2838
2839 while (len && *s) {
2840 leaf = utf8nlookup(tree, hangul, s, len);
2841 if (!leaf)
2842 return -1;
2843 leaf_age = ages[LEAF_GEN(leaf)];
2844 if (leaf_age <= tree->maxage && leaf_age > age)
2845 age = leaf_age;
2846 len -= utf8clen(s);
2847 s += utf8clen(s);
2848 }
2849 return age;
2850 }
2851
2852 /*
2853 * Maximum age of any character in s, touch at most len bytes.
2854 * Return -1 if s is not valid UTF-8 unicode.
2855 */
utf8nagemin(struct tree * tree,const char * s,size_t len)2856 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2857 {
2858 utf8leaf_t *leaf;
2859 int leaf_age;
2860 int age;
2861 unsigned char hangul[UTF8HANGULLEAF];
2862
2863 if (!tree)
2864 return -1;
2865 age = tree->maxage;
2866 while (len && *s) {
2867 leaf = utf8nlookup(tree, hangul, s, len);
2868 if (!leaf)
2869 return -1;
2870 leaf_age = ages[LEAF_GEN(leaf)];
2871 if (leaf_age <= tree->maxage && leaf_age < age)
2872 age = leaf_age;
2873 len -= utf8clen(s);
2874 s += utf8clen(s);
2875 }
2876 return age;
2877 }
2878
2879 /*
2880 * Length of the normalization of s.
2881 * Return -1 if s is not valid UTF-8 unicode.
2882 *
2883 * A string of Default_Ignorable_Code_Point has length 0.
2884 */
utf8len(struct tree * tree,const char * s)2885 ssize_t utf8len(struct tree *tree, const char *s)
2886 {
2887 utf8leaf_t *leaf;
2888 size_t ret = 0;
2889 unsigned char hangul[UTF8HANGULLEAF];
2890
2891 if (!tree)
2892 return -1;
2893 while (*s) {
2894 leaf = utf8lookup(tree, hangul, s);
2895 if (!leaf)
2896 return -1;
2897 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2898 ret += utf8clen(s);
2899 else if (LEAF_CCC(leaf) == DECOMPOSE)
2900 ret += strlen(LEAF_STR(leaf));
2901 else
2902 ret += utf8clen(s);
2903 s += utf8clen(s);
2904 }
2905 return ret;
2906 }
2907
2908 /*
2909 * Length of the normalization of s, touch at most len bytes.
2910 * Return -1 if s is not valid UTF-8 unicode.
2911 */
utf8nlen(struct tree * tree,const char * s,size_t len)2912 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2913 {
2914 utf8leaf_t *leaf;
2915 size_t ret = 0;
2916 unsigned char hangul[UTF8HANGULLEAF];
2917
2918 if (!tree)
2919 return -1;
2920 while (len && *s) {
2921 leaf = utf8nlookup(tree, hangul, s, len);
2922 if (!leaf)
2923 return -1;
2924 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925 ret += utf8clen(s);
2926 else if (LEAF_CCC(leaf) == DECOMPOSE)
2927 ret += strlen(LEAF_STR(leaf));
2928 else
2929 ret += utf8clen(s);
2930 len -= utf8clen(s);
2931 s += utf8clen(s);
2932 }
2933 return ret;
2934 }
2935
2936 /*
2937 * Cursor structure used by the normalizer.
2938 */
2939 struct utf8cursor {
2940 struct tree *tree;
2941 const char *s;
2942 const char *p;
2943 const char *ss;
2944 const char *sp;
2945 unsigned int len;
2946 unsigned int slen;
2947 short int ccc;
2948 short int nccc;
2949 unsigned int unichar;
2950 unsigned char hangul[UTF8HANGULLEAF];
2951 };
2952
2953 /*
2954 * Set up an utf8cursor for use by utf8byte().
2955 *
2956 * s : string.
2957 * len : length of s.
2958 * u8c : pointer to cursor.
2959 * trie : utf8trie_t to use for normalization.
2960 *
2961 * Returns -1 on error, 0 on success.
2962 */
utf8ncursor(struct utf8cursor * u8c,struct tree * tree,const char * s,size_t len)2963 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2964 size_t len)
2965 {
2966 if (!tree)
2967 return -1;
2968 if (!s)
2969 return -1;
2970 u8c->tree = tree;
2971 u8c->s = s;
2972 u8c->p = NULL;
2973 u8c->ss = NULL;
2974 u8c->sp = NULL;
2975 u8c->len = len;
2976 u8c->slen = 0;
2977 u8c->ccc = STOPPER;
2978 u8c->nccc = STOPPER;
2979 u8c->unichar = 0;
2980 /* Check we didn't clobber the maximum length. */
2981 if (u8c->len != len)
2982 return -1;
2983 /* The first byte of s may not be an utf8 continuation. */
2984 if (len > 0 && (*s & 0xC0) == 0x80)
2985 return -1;
2986 return 0;
2987 }
2988
2989 /*
2990 * Set up an utf8cursor for use by utf8byte().
2991 *
2992 * s : NUL-terminated string.
2993 * u8c : pointer to cursor.
2994 * trie : utf8trie_t to use for normalization.
2995 *
2996 * Returns -1 on error, 0 on success.
2997 */
utf8cursor(struct utf8cursor * u8c,struct tree * tree,const char * s)2998 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
2999 {
3000 return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3001 }
3002
3003 /*
3004 * Get one byte from the normalized form of the string described by u8c.
3005 *
3006 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3007 *
3008 * The cursor keeps track of the location in the string in u8c->s.
3009 * When a character is decomposed, the current location is stored in
3010 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3011 * that bytes from a decomposition do not count against u8c->len.
3012 *
3013 * Characters are emitted if they match the current CCC in u8c->ccc.
3014 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3015 * and the function returns 0 in that case.
3016 *
3017 * Sorting by CCC is done by repeatedly scanning the string. The
3018 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3019 * the start of the scan. The first pass finds the lowest CCC to be
3020 * emitted and stores it in u8c->nccc, the second pass emits the
3021 * characters with this CCC and finds the next lowest CCC. This limits
3022 * the number of passes to 1 + the number of different CCCs in the
3023 * sequence being scanned.
3024 *
3025 * Therefore:
3026 * u8c->p != NULL -> a decomposition is being scanned.
3027 * u8c->ss != NULL -> this is a repeating scan.
3028 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3029 */
utf8byte(struct utf8cursor * u8c)3030 int utf8byte(struct utf8cursor *u8c)
3031 {
3032 utf8leaf_t *leaf;
3033 int ccc;
3034
3035 for (;;) {
3036 /* Check for the end of a decomposed character. */
3037 if (u8c->p && *u8c->s == '\0') {
3038 u8c->s = u8c->p;
3039 u8c->p = NULL;
3040 }
3041
3042 /* Check for end-of-string. */
3043 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3044 /* There is no next byte. */
3045 if (u8c->ccc == STOPPER)
3046 return 0;
3047 /* End-of-string during a scan counts as a stopper. */
3048 ccc = STOPPER;
3049 goto ccc_mismatch;
3050 } else if ((*u8c->s & 0xC0) == 0x80) {
3051 /* This is a continuation of the current character. */
3052 if (!u8c->p)
3053 u8c->len--;
3054 return (unsigned char)*u8c->s++;
3055 }
3056
3057 /* Look up the data for the current character. */
3058 if (u8c->p) {
3059 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3060 } else {
3061 leaf = utf8nlookup(u8c->tree, u8c->hangul,
3062 u8c->s, u8c->len);
3063 }
3064
3065 /* No leaf found implies that the input is a binary blob. */
3066 if (!leaf)
3067 return -1;
3068
3069 /* Characters that are too new have CCC 0. */
3070 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3071 ccc = STOPPER;
3072 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3073 u8c->len -= utf8clen(u8c->s);
3074 u8c->p = u8c->s + utf8clen(u8c->s);
3075 u8c->s = LEAF_STR(leaf);
3076 /* Empty decomposition implies CCC 0. */
3077 if (*u8c->s == '\0') {
3078 if (u8c->ccc == STOPPER)
3079 continue;
3080 ccc = STOPPER;
3081 goto ccc_mismatch;
3082 }
3083 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3084 ccc = LEAF_CCC(leaf);
3085 }
3086 u8c->unichar = utf8decode(u8c->s);
3087
3088 /*
3089 * If this is not a stopper, then see if it updates
3090 * the next canonical class to be emitted.
3091 */
3092 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3093 u8c->nccc = ccc;
3094
3095 /*
3096 * Return the current byte if this is the current
3097 * combining class.
3098 */
3099 if (ccc == u8c->ccc) {
3100 if (!u8c->p)
3101 u8c->len--;
3102 return (unsigned char)*u8c->s++;
3103 }
3104
3105 /* Current combining class mismatch. */
3106 ccc_mismatch:
3107 if (u8c->nccc == STOPPER) {
3108 /*
3109 * Scan forward for the first canonical class
3110 * to be emitted. Save the position from
3111 * which to restart.
3112 */
3113 assert(u8c->ccc == STOPPER);
3114 u8c->ccc = MINCCC - 1;
3115 u8c->nccc = ccc;
3116 u8c->sp = u8c->p;
3117 u8c->ss = u8c->s;
3118 u8c->slen = u8c->len;
3119 if (!u8c->p)
3120 u8c->len -= utf8clen(u8c->s);
3121 u8c->s += utf8clen(u8c->s);
3122 } else if (ccc != STOPPER) {
3123 /* Not a stopper, and not the ccc we're emitting. */
3124 if (!u8c->p)
3125 u8c->len -= utf8clen(u8c->s);
3126 u8c->s += utf8clen(u8c->s);
3127 } else if (u8c->nccc != MAXCCC + 1) {
3128 /* At a stopper, restart for next ccc. */
3129 u8c->ccc = u8c->nccc;
3130 u8c->nccc = MAXCCC + 1;
3131 u8c->s = u8c->ss;
3132 u8c->p = u8c->sp;
3133 u8c->len = u8c->slen;
3134 } else {
3135 /* All done, proceed from here. */
3136 u8c->ccc = STOPPER;
3137 u8c->nccc = STOPPER;
3138 u8c->sp = NULL;
3139 u8c->ss = NULL;
3140 u8c->slen = 0;
3141 }
3142 }
3143 }
3144
3145 /* ------------------------------------------------------------------ */
3146
normalize_line(struct tree * tree)3147 static int normalize_line(struct tree *tree)
3148 {
3149 char *s;
3150 char *t;
3151 int c;
3152 struct utf8cursor u8c;
3153
3154 /* First test: null-terminated string. */
3155 s = buf2;
3156 t = buf3;
3157 if (utf8cursor(&u8c, tree, s))
3158 return -1;
3159 while ((c = utf8byte(&u8c)) > 0)
3160 if (c != (unsigned char)*t++)
3161 return -1;
3162 if (c < 0)
3163 return -1;
3164 if (*t != 0)
3165 return -1;
3166
3167 /* Second test: length-limited string. */
3168 s = buf2;
3169 /* Replace NUL with a value that will cause an error if seen. */
3170 s[strlen(s) + 1] = -1;
3171 t = buf3;
3172 if (utf8cursor(&u8c, tree, s))
3173 return -1;
3174 while ((c = utf8byte(&u8c)) > 0)
3175 if (c != (unsigned char)*t++)
3176 return -1;
3177 if (c < 0)
3178 return -1;
3179 if (*t != 0)
3180 return -1;
3181
3182 return 0;
3183 }
3184
normalization_test(void)3185 static void normalization_test(void)
3186 {
3187 FILE *file;
3188 unsigned int unichar;
3189 struct unicode_data *data;
3190 char *s;
3191 char *t;
3192 int ret;
3193 int ignorables;
3194 int tests = 0;
3195 int failures = 0;
3196
3197 if (verbose > 0)
3198 printf("Parsing %s\n", test_name);
3199 /* Step one, read data from file. */
3200 file = fopen(test_name, "r");
3201 if (!file)
3202 open_fail(test_name, errno);
3203
3204 while (fgets(line, LINESIZE, file)) {
3205 ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
3206 buf0, buf1);
3207 if (ret != 2 || *line == '#')
3208 continue;
3209 s = buf0;
3210 t = buf2;
3211 while (*s) {
3212 unichar = strtoul(s, &s, 16);
3213 t += utf8encode(t, unichar);
3214 }
3215 *t = '\0';
3216
3217 ignorables = 0;
3218 s = buf1;
3219 t = buf3;
3220 while (*s) {
3221 unichar = strtoul(s, &s, 16);
3222 data = &unicode_data[unichar];
3223 if (data->utf8nfkdi && !*data->utf8nfkdi)
3224 ignorables = 1;
3225 else
3226 t += utf8encode(t, unichar);
3227 }
3228 *t = '\0';
3229
3230 tests++;
3231 if (normalize_line(nfkdi_tree) < 0) {
3232 printf("Line %s -> %s", buf0, buf1);
3233 if (ignorables)
3234 printf(" (ignorables removed)");
3235 printf(" failure\n");
3236 failures++;
3237 }
3238 }
3239 fclose(file);
3240 if (verbose > 0)
3241 printf("Ran %d tests with %d failures\n", tests, failures);
3242 if (failures)
3243 file_fail(test_name);
3244 }
3245
3246 /* ------------------------------------------------------------------ */
3247
write_file(void)3248 static void write_file(void)
3249 {
3250 FILE *file;
3251 int i;
3252 int j;
3253 int t;
3254 int gen;
3255
3256 if (verbose > 0)
3257 printf("Writing %s\n", utf8_name);
3258 file = fopen(utf8_name, "w");
3259 if (!file)
3260 open_fail(utf8_name, errno);
3261
3262 fprintf(file, "/* This file is generated code, do not edit. */\n");
3263 fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3264 fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3265 fprintf(file, "#endif\n");
3266 fprintf(file, "\n");
3267 fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3268 unicode_maxage);
3269 fprintf(file, "\n");
3270 fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3271 for (i = 0; i != ages_count; i++)
3272 fprintf(file, "\t%#x%s\n", ages[i],
3273 ages[i] == unicode_maxage ? "" : ",");
3274 fprintf(file, "};\n");
3275 fprintf(file, "\n");
3276 fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n");
3277 t = 0;
3278 for (gen = 0; gen < ages_count; gen++) {
3279 fprintf(file, "\t{ %#x, %d }%s\n",
3280 ages[gen], trees[t].index,
3281 ages[gen] == unicode_maxage ? "" : ",");
3282 if (trees[t].maxage == ages[gen])
3283 t += 2;
3284 }
3285 fprintf(file, "};\n");
3286 fprintf(file, "\n");
3287 fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n");
3288 t = 1;
3289 for (gen = 0; gen < ages_count; gen++) {
3290 fprintf(file, "\t{ %#x, %d }%s\n",
3291 ages[gen], trees[t].index,
3292 ages[gen] == unicode_maxage ? "" : ",");
3293 if (trees[t].maxage == ages[gen])
3294 t += 2;
3295 }
3296 fprintf(file, "};\n");
3297 fprintf(file, "\n");
3298 fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3299 utf8data_size);
3300 t = 0;
3301 for (i = 0; i != utf8data_size; i += 16) {
3302 if (i == trees[t].index) {
3303 fprintf(file, "\t/* %s_%x */\n",
3304 trees[t].type, trees[t].maxage);
3305 if (t < trees_count-1)
3306 t++;
3307 }
3308 fprintf(file, "\t");
3309 for (j = i; j != i + 16; j++)
3310 fprintf(file, "0x%.2x%s", utf8data[j],
3311 (j < utf8data_size -1 ? "," : ""));
3312 fprintf(file, "\n");
3313 }
3314 fprintf(file, "};\n");
3315 fclose(file);
3316 }
3317
3318 /* ------------------------------------------------------------------ */
3319
main(int argc,char * argv[])3320 int main(int argc, char *argv[])
3321 {
3322 unsigned int unichar;
3323 int opt;
3324
3325 argv0 = argv[0];
3326
3327 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3328 switch (opt) {
3329 case 'a':
3330 age_name = optarg;
3331 break;
3332 case 'c':
3333 ccc_name = optarg;
3334 break;
3335 case 'd':
3336 data_name = optarg;
3337 break;
3338 case 'f':
3339 fold_name = optarg;
3340 break;
3341 case 'n':
3342 norm_name = optarg;
3343 break;
3344 case 'o':
3345 utf8_name = optarg;
3346 break;
3347 case 'p':
3348 prop_name = optarg;
3349 break;
3350 case 't':
3351 test_name = optarg;
3352 break;
3353 case 'v':
3354 verbose++;
3355 break;
3356 case 'h':
3357 help();
3358 exit(0);
3359 default:
3360 usage();
3361 }
3362 }
3363
3364 if (verbose > 1)
3365 help();
3366 for (unichar = 0; unichar != 0x110000; unichar++)
3367 unicode_data[unichar].code = unichar;
3368 age_init();
3369 ccc_init();
3370 nfkdi_init();
3371 nfkdicf_init();
3372 ignore_init();
3373 corrections_init();
3374 hangul_decompose();
3375 nfkdi_decompose();
3376 nfkdicf_decompose();
3377 utf8_init();
3378 trees_init();
3379 trees_populate();
3380 trees_reduce();
3381 trees_verify();
3382 /* Prevent "unused function" warning. */
3383 (void)lookup(nfkdi_tree, " ");
3384 if (verbose > 2)
3385 tree_walk(nfkdi_tree);
3386 if (verbose > 2)
3387 tree_walk(nfkdicf_tree);
3388 normalization_test();
3389 write_file();
3390
3391 return 0;
3392 }
3393