• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2014 SGI.
3  * All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 
19 /* Generator for a compact trie for unicode normalization */
20 
21 #include <sys/types.h>
22 #include <stddef.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <assert.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <errno.h>
29 
30 /* Default names of the in- and output files. */
31 
32 #define AGE_NAME	"DerivedAge.txt"
33 #define CCC_NAME	"DerivedCombiningClass.txt"
34 #define PROP_NAME	"DerivedCoreProperties.txt"
35 #define DATA_NAME	"UnicodeData.txt"
36 #define FOLD_NAME	"CaseFolding.txt"
37 #define NORM_NAME	"NormalizationCorrections.txt"
38 #define TEST_NAME	"NormalizationTest.txt"
39 #define UTF8_NAME	"utf8data.h"
40 
41 const char	*age_name  = AGE_NAME;
42 const char	*ccc_name  = CCC_NAME;
43 const char	*prop_name = PROP_NAME;
44 const char	*data_name = DATA_NAME;
45 const char	*fold_name = FOLD_NAME;
46 const char	*norm_name = NORM_NAME;
47 const char	*test_name = TEST_NAME;
48 const char	*utf8_name = UTF8_NAME;
49 
50 int verbose = 0;
51 
52 /* An arbitrary line size limit on input lines. */
53 
54 #define LINESIZE	1024
55 char line[LINESIZE];
56 char buf0[LINESIZE];
57 char buf1[LINESIZE];
58 char buf2[LINESIZE];
59 char buf3[LINESIZE];
60 
61 const char *argv0;
62 
63 /* ------------------------------------------------------------------ */
64 
65 /*
66  * Unicode version numbers consist of three parts: major, minor, and a
67  * revision.  These numbers are packed into an unsigned int to obtain
68  * a single version number.
69  *
70  * To save space in the generated trie, the unicode version is not
71  * stored directly, instead we calculate a generation number from the
72  * unicode versions seen in the DerivedAge file, and use that as an
73  * index into a table of unicode versions.
74  */
75 #define UNICODE_MAJ_SHIFT		(16)
76 #define UNICODE_MIN_SHIFT		(8)
77 
78 #define UNICODE_MAJ_MAX			((unsigned short)-1)
79 #define UNICODE_MIN_MAX			((unsigned char)-1)
80 #define UNICODE_REV_MAX			((unsigned char)-1)
81 
82 #define UNICODE_AGE(MAJ,MIN,REV)			\
83 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
84 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
85 	 ((unsigned int)(REV)))
86 
87 unsigned int *ages;
88 int ages_count;
89 
90 unsigned int unicode_maxage;
91 
age_valid(unsigned int major,unsigned int minor,unsigned int revision)92 static int age_valid(unsigned int major, unsigned int minor,
93 		     unsigned int revision)
94 {
95 	if (major > UNICODE_MAJ_MAX)
96 		return 0;
97 	if (minor > UNICODE_MIN_MAX)
98 		return 0;
99 	if (revision > UNICODE_REV_MAX)
100 		return 0;
101 	return 1;
102 }
103 
104 /* ------------------------------------------------------------------ */
105 
106 /*
107  * utf8trie_t
108  *
109  * A compact binary tree, used to decode UTF-8 characters.
110  *
111  * Internal nodes are one byte for the node itself, and up to three
112  * bytes for an offset into the tree.  The first byte contains the
113  * following information:
114  *  NEXTBYTE  - flag        - advance to next byte if set
115  *  BITNUM    - 3 bit field - the bit number to tested
116  *  OFFLEN    - 2 bit field - number of bytes in the offset
117  * if offlen == 0 (non-branching node)
118  *  RIGHTPATH - 1 bit field - set if the following node is for the
119  *                            right-hand path (tested bit is set)
120  *  TRIENODE  - 1 bit field - set if the following node is an internal
121  *                            node, otherwise it is a leaf node
122  * if offlen != 0 (branching node)
123  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
124  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
125  *
126  * Due to the way utf8 works, there cannot be branching nodes with
127  * NEXTBYTE set, and moreover those nodes always have a righthand
128  * descendant.
129  */
130 typedef unsigned char utf8trie_t;
131 #define BITNUM		0x07
132 #define NEXTBYTE	0x08
133 #define OFFLEN		0x30
134 #define OFFLEN_SHIFT	4
135 #define RIGHTPATH	0x40
136 #define TRIENODE	0x80
137 #define RIGHTNODE	0x40
138 #define LEFTNODE	0x80
139 
140 /*
141  * utf8leaf_t
142  *
143  * The leaves of the trie are embedded in the trie, and so the same
144  * underlying datatype, unsigned char.
145  *
146  * leaf[0]: The unicode version, stored as a generation number that is
147  *          an index into utf8agetab[].  With this we can filter code
148  *          points based on the unicode version in which they were
149  *          defined.  The CCC of a non-defined code point is 0.
150  * leaf[1]: Canonical Combining Class. During normalization, we need
151  *          to do a stable sort into ascending order of all characters
152  *          with a non-zero CCC that occur between two characters with
153  *          a CCC of 0, or at the begin or end of a string.
154  *          The unicode standard guarantees that all CCC values are
155  *          between 0 and 254 inclusive, which leaves 255 available as
156  *          a special value.
157  *          Code points with CCC 0 are known as stoppers.
158  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
159  *          start of a NUL-terminated string that is the decomposition
160  *          of the character.
161  *          The CCC of a decomposable character is the same as the CCC
162  *          of the first character of its decomposition.
163  *          Some characters decompose as the empty string: these are
164  *          characters with the Default_Ignorable_Code_Point property.
165  *          These do affect normalization, as they all have CCC 0.
166  *
167  * The decompositions in the trie have been fully expanded.
168  *
169  * Casefolding, if applicable, is also done using decompositions.
170  */
171 typedef unsigned char utf8leaf_t;
172 
173 #define LEAF_GEN(LEAF)	((LEAF)[0])
174 #define LEAF_CCC(LEAF)	((LEAF)[1])
175 #define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
176 
177 #define MAXGEN		(255)
178 
179 #define MINCCC		(0)
180 #define MAXCCC		(254)
181 #define STOPPER		(0)
182 #define DECOMPOSE	(255)
183 #define HANGUL		((char)(255))
184 
185 #define UTF8HANGULLEAF	(12)
186 
187 struct tree;
188 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
189 			       const char *, size_t);
190 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
191 
192 unsigned char *utf8data;
193 size_t utf8data_size;
194 
195 utf8trie_t *nfkdi;
196 utf8trie_t *nfkdicf;
197 
198 /* ------------------------------------------------------------------ */
199 
200 /*
201  * UTF8 valid ranges.
202  *
203  * The UTF-8 encoding spreads the bits of a 32bit word over several
204  * bytes. This table gives the ranges that can be held and how they'd
205  * be represented.
206  *
207  * 0x00000000 0x0000007F: 0xxxxxxx
208  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
209  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
210  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
211  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
212  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
213  *
214  * There is an additional requirement on UTF-8, in that only the
215  * shortest representation of a 32bit value is to be used.  A decoder
216  * must not decode sequences that do not satisfy this requirement.
217  * Thus the allowed ranges have a lower bound.
218  *
219  * 0x00000000 0x0000007F: 0xxxxxxx
220  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
221  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
222  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
223  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
224  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
225  *
226  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
227  * 17 planes of 65536 values.  This limits the sequences actually seen
228  * even more, to just the following.
229  *
230  *          0 -     0x7f: 0                     0x7f
231  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
232  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
233  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
234  *
235  * Even within those ranges not all values are allowed: the surrogates
236  * 0xd800 - 0xdfff should never be seen.
237  *
238  * Note that the longest sequence seen with valid usage is 4 bytes,
239  * the same a single UTF-32 character.  This makes the UTF-8
240  * representation of Unicode strictly smaller than UTF-32.
241  *
242  * The shortest sequence requirement was introduced by:
243  *    Corrigendum #1: UTF-8 Shortest Form
244  * It can be found here:
245  *    http://www.unicode.org/versions/corrigendum1.html
246  *
247  */
248 
249 #define UTF8_2_BITS     0xC0
250 #define UTF8_3_BITS     0xE0
251 #define UTF8_4_BITS     0xF0
252 #define UTF8_N_BITS     0x80
253 #define UTF8_2_MASK     0xE0
254 #define UTF8_3_MASK     0xF0
255 #define UTF8_4_MASK     0xF8
256 #define UTF8_N_MASK     0xC0
257 #define UTF8_V_MASK     0x3F
258 #define UTF8_V_SHIFT    6
259 
utf8encode(char * str,unsigned int val)260 static int utf8encode(char *str, unsigned int val)
261 {
262 	int len;
263 
264 	if (val < 0x80) {
265 		str[0] = val;
266 		len = 1;
267 	} else if (val < 0x800) {
268 		str[1] = val & UTF8_V_MASK;
269 		str[1] |= UTF8_N_BITS;
270 		val >>= UTF8_V_SHIFT;
271 		str[0] = val;
272 		str[0] |= UTF8_2_BITS;
273 		len = 2;
274 	} else if (val < 0x10000) {
275 		str[2] = val & UTF8_V_MASK;
276 		str[2] |= UTF8_N_BITS;
277 		val >>= UTF8_V_SHIFT;
278 		str[1] = val & UTF8_V_MASK;
279 		str[1] |= UTF8_N_BITS;
280 		val >>= UTF8_V_SHIFT;
281 		str[0] = val;
282 		str[0] |= UTF8_3_BITS;
283 		len = 3;
284 	} else if (val < 0x110000) {
285 		str[3] = val & UTF8_V_MASK;
286 		str[3] |= UTF8_N_BITS;
287 		val >>= UTF8_V_SHIFT;
288 		str[2] = val & UTF8_V_MASK;
289 		str[2] |= UTF8_N_BITS;
290 		val >>= UTF8_V_SHIFT;
291 		str[1] = val & UTF8_V_MASK;
292 		str[1] |= UTF8_N_BITS;
293 		val >>= UTF8_V_SHIFT;
294 		str[0] = val;
295 		str[0] |= UTF8_4_BITS;
296 		len = 4;
297 	} else {
298 		printf("%#x: illegal val\n", val);
299 		len = 0;
300 	}
301 	return len;
302 }
303 
utf8decode(const char * str)304 static unsigned int utf8decode(const char *str)
305 {
306 	const unsigned char *s = (const unsigned char*)str;
307 	unsigned int unichar = 0;
308 
309 	if (*s < 0x80) {
310 		unichar = *s;
311 	} else if (*s < UTF8_3_BITS) {
312 		unichar = *s++ & 0x1F;
313 		unichar <<= UTF8_V_SHIFT;
314 		unichar |= *s & 0x3F;
315 	} else if (*s < UTF8_4_BITS) {
316 		unichar = *s++ & 0x0F;
317 		unichar <<= UTF8_V_SHIFT;
318 		unichar |= *s++ & 0x3F;
319 		unichar <<= UTF8_V_SHIFT;
320 		unichar |= *s & 0x3F;
321 	} else {
322 		unichar = *s++ & 0x0F;
323 		unichar <<= UTF8_V_SHIFT;
324 		unichar |= *s++ & 0x3F;
325 		unichar <<= UTF8_V_SHIFT;
326 		unichar |= *s++ & 0x3F;
327 		unichar <<= UTF8_V_SHIFT;
328 		unichar |= *s & 0x3F;
329 	}
330 	return unichar;
331 }
332 
utf32valid(unsigned int unichar)333 static int utf32valid(unsigned int unichar)
334 {
335 	return unichar < 0x110000;
336 }
337 
338 #define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
339 
340 #define NODE 1
341 #define LEAF 0
342 
343 struct tree {
344 	void *root;
345 	int childnode;
346 	const char *type;
347 	unsigned int maxage;
348 	struct tree *next;
349 	int (*leaf_equal)(void *, void *);
350 	void (*leaf_print)(void *, int);
351 	int (*leaf_mark)(void *);
352 	int (*leaf_size)(void *);
353 	int *(*leaf_index)(struct tree *, void *);
354 	unsigned char *(*leaf_emit)(void *, unsigned char *);
355 	int leafindex[0x110000];
356 	int index;
357 };
358 
359 struct node {
360 	int index;
361 	int offset;
362 	int mark;
363 	int size;
364 	struct node *parent;
365 	void *left;
366 	void *right;
367 	unsigned char bitnum;
368 	unsigned char nextbyte;
369 	unsigned char leftnode;
370 	unsigned char rightnode;
371 	unsigned int keybits;
372 	unsigned int keymask;
373 };
374 
375 /*
376  * Example lookup function for a tree.
377  */
lookup(struct tree * tree,const char * key)378 static void *lookup(struct tree *tree, const char *key)
379 {
380 	struct node *node;
381 	void *leaf = NULL;
382 
383 	node = tree->root;
384 	while (!leaf && node) {
385 		if (node->nextbyte)
386 			key++;
387 		if (*key & (1 << (node->bitnum & 7))) {
388 			/* Right leg */
389 			if (node->rightnode == NODE) {
390 				node = node->right;
391 			} else if (node->rightnode == LEAF) {
392 				leaf = node->right;
393 			} else {
394 				node = NULL;
395 			}
396 		} else {
397 			/* Left leg */
398 			if (node->leftnode == NODE) {
399 				node = node->left;
400 			} else if (node->leftnode == LEAF) {
401 				leaf = node->left;
402 			} else {
403 				node = NULL;
404 			}
405 		}
406 	}
407 
408 	return leaf;
409 }
410 
411 /*
412  * A simple non-recursive tree walker: keep track of visits to the
413  * left and right branches in the leftmask and rightmask.
414  */
tree_walk(struct tree * tree)415 static void tree_walk(struct tree *tree)
416 {
417 	struct node *node;
418 	unsigned int leftmask;
419 	unsigned int rightmask;
420 	unsigned int bitmask;
421 	int indent = 1;
422 	int nodes, singletons, leaves;
423 
424 	nodes = singletons = leaves = 0;
425 
426 	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
427 	if (tree->childnode == LEAF) {
428 		assert(tree->root);
429 		tree->leaf_print(tree->root, indent);
430 		leaves = 1;
431 	} else {
432 		assert(tree->childnode == NODE);
433 		node = tree->root;
434 		leftmask = rightmask = 0;
435 		while (node) {
436 			printf("%*snode @ %p bitnum %d nextbyte %d"
437 			       " left %p right %p mask %x bits %x\n",
438 				indent, "", node,
439 				node->bitnum, node->nextbyte,
440 				node->left, node->right,
441 				node->keymask, node->keybits);
442 			nodes += 1;
443 			if (!(node->left && node->right))
444 				singletons += 1;
445 
446 			while (node) {
447 				bitmask = 1 << node->bitnum;
448 				if ((leftmask & bitmask) == 0) {
449 					leftmask |= bitmask;
450 					if (node->leftnode == LEAF) {
451 						assert(node->left);
452 						tree->leaf_print(node->left,
453 								 indent+1);
454 						leaves += 1;
455 					} else if (node->left) {
456 						assert(node->leftnode == NODE);
457 						indent += 1;
458 						node = node->left;
459 						break;
460 					}
461 				}
462 				if ((rightmask & bitmask) == 0) {
463 					rightmask |= bitmask;
464 					if (node->rightnode == LEAF) {
465 						assert(node->right);
466 						tree->leaf_print(node->right,
467 								 indent+1);
468 						leaves += 1;
469 					} else if (node->right) {
470 						assert(node->rightnode == NODE);
471 						indent += 1;
472 						node = node->right;
473 						break;
474 					}
475 				}
476 				leftmask &= ~bitmask;
477 				rightmask &= ~bitmask;
478 				node = node->parent;
479 				indent -= 1;
480 			}
481 		}
482 	}
483 	printf("nodes %d leaves %d singletons %d\n",
484 	       nodes, leaves, singletons);
485 }
486 
487 /*
488  * Allocate an initialize a new internal node.
489  */
alloc_node(struct node * parent)490 static struct node *alloc_node(struct node *parent)
491 {
492 	struct node *node;
493 	int bitnum;
494 
495 	node = malloc(sizeof(*node));
496 	node->left = node->right = NULL;
497 	node->parent = parent;
498 	node->leftnode = NODE;
499 	node->rightnode = NODE;
500 	node->keybits = 0;
501 	node->keymask = 0;
502 	node->mark = 0;
503 	node->index = 0;
504 	node->offset = -1;
505 	node->size = 4;
506 
507 	if (node->parent) {
508 		bitnum = parent->bitnum;
509 		if ((bitnum & 7) == 0) {
510 			node->bitnum = bitnum + 7 + 8;
511 			node->nextbyte = 1;
512 		} else {
513 			node->bitnum = bitnum - 1;
514 			node->nextbyte = 0;
515 		}
516 	} else {
517 		node->bitnum = 7;
518 		node->nextbyte = 0;
519 	}
520 
521 	return node;
522 }
523 
524 /*
525  * Insert a new leaf into the tree, and collapse any subtrees that are
526  * fully populated and end in identical leaves. A nextbyte tagged
527  * internal node will not be removed to preserve the tree's integrity.
528  * Note that due to the structure of utf8, no nextbyte tagged node
529  * will be a candidate for removal.
530  */
insert(struct tree * tree,char * key,int keylen,void * leaf)531 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
532 {
533 	struct node *node;
534 	struct node *parent;
535 	void **cursor;
536 	int keybits;
537 
538 	assert(keylen >= 1 && keylen <= 4);
539 
540 	node = NULL;
541 	cursor = &tree->root;
542 	keybits = 8 * keylen;
543 
544 	/* Insert, creating path along the way. */
545 	while (keybits) {
546 		if (!*cursor)
547 			*cursor = alloc_node(node);
548 		node = *cursor;
549 		if (node->nextbyte)
550 			key++;
551 		if (*key & (1 << (node->bitnum & 7)))
552 			cursor = &node->right;
553 		else
554 			cursor = &node->left;
555 		keybits--;
556 	}
557 	*cursor = leaf;
558 
559 	/* Merge subtrees if possible. */
560 	while (node) {
561 		if (*key & (1 << (node->bitnum & 7)))
562 			node->rightnode = LEAF;
563 		else
564 			node->leftnode = LEAF;
565 		if (node->nextbyte)
566 			break;
567 		if (node->leftnode == NODE || node->rightnode == NODE)
568 			break;
569 		assert(node->left);
570 		assert(node->right);
571 		/* Compare */
572 		if (! tree->leaf_equal(node->left, node->right))
573 			break;
574 		/* Keep left, drop right leaf. */
575 		leaf = node->left;
576 		/* Check in parent */
577 		parent = node->parent;
578 		if (!parent) {
579 			/* root of tree! */
580 			tree->root = leaf;
581 			tree->childnode = LEAF;
582 		} else if (parent->left == node) {
583 			parent->left = leaf;
584 			parent->leftnode = LEAF;
585 			if (parent->right) {
586 				parent->keymask = 0;
587 				parent->keybits = 0;
588 			} else {
589 				parent->keymask |= (1 << node->bitnum);
590 			}
591 		} else if (parent->right == node) {
592 			parent->right = leaf;
593 			parent->rightnode = LEAF;
594 			if (parent->left) {
595 				parent->keymask = 0;
596 				parent->keybits = 0;
597 			} else {
598 				parent->keymask |= (1 << node->bitnum);
599 				parent->keybits |= (1 << node->bitnum);
600 			}
601 		} else {
602 			/* internal tree error */
603 			assert(0);
604 		}
605 		free(node);
606 		node = parent;
607 	}
608 
609 	/* Propagate keymasks up along singleton chains. */
610 	while (node) {
611 		parent = node->parent;
612 		if (!parent)
613 			break;
614 		/* Nix the mask for parents with two children. */
615 		if (node->keymask == 0) {
616 			parent->keymask = 0;
617 			parent->keybits = 0;
618 		} else if (parent->left && parent->right) {
619 			parent->keymask = 0;
620 			parent->keybits = 0;
621 		} else {
622 			assert((parent->keymask & node->keymask) == 0);
623 			parent->keymask |= node->keymask;
624 			parent->keymask |= (1 << parent->bitnum);
625 			parent->keybits |= node->keybits;
626 			if (parent->right)
627 				parent->keybits |= (1 << parent->bitnum);
628 		}
629 		node = parent;
630 	}
631 
632 	return 0;
633 }
634 
635 /*
636  * Prune internal nodes.
637  *
638  * Fully populated subtrees that end at the same leaf have already
639  * been collapsed.  There are still internal nodes that have for both
640  * their left and right branches a sequence of singletons that make
641  * identical choices and end in identical leaves.  The keymask and
642  * keybits collected in the nodes describe the choices made in these
643  * singleton chains.  When they are identical for the left and right
644  * branch of a node, and the two leaves comare identical, the node in
645  * question can be removed.
646  *
647  * Note that nodes with the nextbyte tag set will not be removed by
648  * this to ensure tree integrity.  Note as well that the structure of
649  * utf8 ensures that these nodes would not have been candidates for
650  * removal in any case.
651  */
prune(struct tree * tree)652 static void prune(struct tree *tree)
653 {
654 	struct node *node;
655 	struct node *left;
656 	struct node *right;
657 	struct node *parent;
658 	void *leftleaf;
659 	void *rightleaf;
660 	unsigned int leftmask;
661 	unsigned int rightmask;
662 	unsigned int bitmask;
663 	int count;
664 
665 	if (verbose > 0)
666 		printf("Pruning %s_%x\n", tree->type, tree->maxage);
667 
668 	count = 0;
669 	if (tree->childnode == LEAF)
670 		return;
671 	if (!tree->root)
672 		return;
673 
674 	leftmask = rightmask = 0;
675 	node = tree->root;
676 	while (node) {
677 		if (node->nextbyte)
678 			goto advance;
679 		if (node->leftnode == LEAF)
680 			goto advance;
681 		if (node->rightnode == LEAF)
682 			goto advance;
683 		if (!node->left)
684 			goto advance;
685 		if (!node->right)
686 			goto advance;
687 		left = node->left;
688 		right = node->right;
689 		if (left->keymask == 0)
690 			goto advance;
691 		if (right->keymask == 0)
692 			goto advance;
693 		if (left->keymask != right->keymask)
694 			goto advance;
695 		if (left->keybits != right->keybits)
696 			goto advance;
697 		leftleaf = NULL;
698 		while (!leftleaf) {
699 			assert(left->left || left->right);
700 			if (left->leftnode == LEAF)
701 				leftleaf = left->left;
702 			else if (left->rightnode == LEAF)
703 				leftleaf = left->right;
704 			else if (left->left)
705 				left = left->left;
706 			else if (left->right)
707 				left = left->right;
708 			else
709 				assert(0);
710 		}
711 		rightleaf = NULL;
712 		while (!rightleaf) {
713 			assert(right->left || right->right);
714 			if (right->leftnode == LEAF)
715 				rightleaf = right->left;
716 			else if (right->rightnode == LEAF)
717 				rightleaf = right->right;
718 			else if (right->left)
719 				right = right->left;
720 			else if (right->right)
721 				right = right->right;
722 			else
723 				assert(0);
724 		}
725 		if (! tree->leaf_equal(leftleaf, rightleaf))
726 			goto advance;
727 		/*
728 		 * This node has identical singleton-only subtrees.
729 		 * Remove it.
730 		 */
731 		parent = node->parent;
732 		left = node->left;
733 		right = node->right;
734 		if (parent->left == node)
735 			parent->left = left;
736 		else if (parent->right == node)
737 			parent->right = left;
738 		else
739 			assert(0);
740 		left->parent = parent;
741 		left->keymask |= (1 << node->bitnum);
742 		node->left = NULL;
743 		while (node) {
744 			bitmask = 1 << node->bitnum;
745 			leftmask &= ~bitmask;
746 			rightmask &= ~bitmask;
747 			if (node->leftnode == NODE && node->left) {
748 				left = node->left;
749 				free(node);
750 				count++;
751 				node = left;
752 			} else if (node->rightnode == NODE && node->right) {
753 				right = node->right;
754 				free(node);
755 				count++;
756 				node = right;
757 			} else {
758 				node = NULL;
759 			}
760 		}
761 		/* Propagate keymasks up along singleton chains. */
762 		node = parent;
763 		/* Force re-check */
764 		bitmask = 1 << node->bitnum;
765 		leftmask &= ~bitmask;
766 		rightmask &= ~bitmask;
767 		for (;;) {
768 			if (node->left && node->right)
769 				break;
770 			if (node->left) {
771 				left = node->left;
772 				node->keymask |= left->keymask;
773 				node->keybits |= left->keybits;
774 			}
775 			if (node->right) {
776 				right = node->right;
777 				node->keymask |= right->keymask;
778 				node->keybits |= right->keybits;
779 			}
780 			node->keymask |= (1 << node->bitnum);
781 			node = node->parent;
782 			/* Force re-check */
783 			bitmask = 1 << node->bitnum;
784 			leftmask &= ~bitmask;
785 			rightmask &= ~bitmask;
786 		}
787 	advance:
788 		bitmask = 1 << node->bitnum;
789 		if ((leftmask & bitmask) == 0 &&
790 		    node->leftnode == NODE &&
791 		    node->left) {
792 			leftmask |= bitmask;
793 			node = node->left;
794 		} else if ((rightmask & bitmask) == 0 &&
795 			   node->rightnode == NODE &&
796 			   node->right) {
797 			rightmask |= bitmask;
798 			node = node->right;
799 		} else {
800 			leftmask &= ~bitmask;
801 			rightmask &= ~bitmask;
802 			node = node->parent;
803 		}
804 	}
805 	if (verbose > 0)
806 		printf("Pruned %d nodes\n", count);
807 }
808 
809 /*
810  * Mark the nodes in the tree that lead to leaves that must be
811  * emitted.
812  */
mark_nodes(struct tree * tree)813 static void mark_nodes(struct tree *tree)
814 {
815 	struct node *node;
816 	struct node *n;
817 	unsigned int leftmask;
818 	unsigned int rightmask;
819 	unsigned int bitmask;
820 	int marked;
821 
822 	marked = 0;
823 	if (verbose > 0)
824 		printf("Marking %s_%x\n", tree->type, tree->maxage);
825 	if (tree->childnode == LEAF)
826 		goto done;
827 
828 	assert(tree->childnode == NODE);
829 	node = tree->root;
830 	leftmask = rightmask = 0;
831 	while (node) {
832 		bitmask = 1 << node->bitnum;
833 		if ((leftmask & bitmask) == 0) {
834 			leftmask |= bitmask;
835 			if (node->leftnode == LEAF) {
836 				assert(node->left);
837 				if (tree->leaf_mark(node->left)) {
838 					n = node;
839 					while (n && !n->mark) {
840 						marked++;
841 						n->mark = 1;
842 						n = n->parent;
843 					}
844 				}
845 			} else if (node->left) {
846 				assert(node->leftnode == NODE);
847 				node = node->left;
848 				continue;
849 			}
850 		}
851 		if ((rightmask & bitmask) == 0) {
852 			rightmask |= bitmask;
853 			if (node->rightnode == LEAF) {
854 				assert(node->right);
855 				if (tree->leaf_mark(node->right)) {
856 					n = node;
857 					while (n && !n->mark) {
858 						marked++;
859 						n->mark = 1;
860 						n = n->parent;
861 					}
862 				}
863 			} else if (node->right) {
864 				assert(node->rightnode == NODE);
865 				node = node->right;
866 				continue;
867 			}
868 		}
869 		leftmask &= ~bitmask;
870 		rightmask &= ~bitmask;
871 		node = node->parent;
872 	}
873 
874 	/* second pass: left siblings and singletons */
875 
876 	assert(tree->childnode == NODE);
877 	node = tree->root;
878 	leftmask = rightmask = 0;
879 	while (node) {
880 		bitmask = 1 << node->bitnum;
881 		if ((leftmask & bitmask) == 0) {
882 			leftmask |= bitmask;
883 			if (node->leftnode == LEAF) {
884 				assert(node->left);
885 				if (tree->leaf_mark(node->left)) {
886 					n = node;
887 					while (n && !n->mark) {
888 						marked++;
889 						n->mark = 1;
890 						n = n->parent;
891 					}
892 				}
893 			} else if (node->left) {
894 				assert(node->leftnode == NODE);
895 				node = node->left;
896 				if (!node->mark && node->parent->mark) {
897 					marked++;
898 					node->mark = 1;
899 				}
900 				continue;
901 			}
902 		}
903 		if ((rightmask & bitmask) == 0) {
904 			rightmask |= bitmask;
905 			if (node->rightnode == LEAF) {
906 				assert(node->right);
907 				if (tree->leaf_mark(node->right)) {
908 					n = node;
909 					while (n && !n->mark) {
910 						marked++;
911 						n->mark = 1;
912 						n = n->parent;
913 					}
914 				}
915 			} else if (node->right) {
916 				assert(node->rightnode == NODE);
917 				node = node->right;
918 				if (!node->mark && node->parent->mark &&
919 				    !node->parent->left) {
920 					marked++;
921 					node->mark = 1;
922 				}
923 				continue;
924 			}
925 		}
926 		leftmask &= ~bitmask;
927 		rightmask &= ~bitmask;
928 		node = node->parent;
929 	}
930 done:
931 	if (verbose > 0)
932 		printf("Marked %d nodes\n", marked);
933 }
934 
935 /*
936  * Compute the index of each node and leaf, which is the offset in the
937  * emitted trie.  These values must be pre-computed because relative
938  * offsets between nodes are used to navigate the tree.
939  */
index_nodes(struct tree * tree,int index)940 static int index_nodes(struct tree *tree, int index)
941 {
942 	struct node *node;
943 	unsigned int leftmask;
944 	unsigned int rightmask;
945 	unsigned int bitmask;
946 	int count;
947 	int indent;
948 
949 	/* Align to a cache line (or half a cache line?). */
950 	while (index % 64)
951 		index++;
952 	tree->index = index;
953 	indent = 1;
954 	count = 0;
955 
956 	if (verbose > 0)
957 		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
958 	if (tree->childnode == LEAF) {
959 		index += tree->leaf_size(tree->root);
960 		goto done;
961 	}
962 
963 	assert(tree->childnode == NODE);
964 	node = tree->root;
965 	leftmask = rightmask = 0;
966 	while (node) {
967 		if (!node->mark)
968 			goto skip;
969 		count++;
970 		if (node->index != index)
971 			node->index = index;
972 		index += node->size;
973 skip:
974 		while (node) {
975 			bitmask = 1 << node->bitnum;
976 			if (node->mark && (leftmask & bitmask) == 0) {
977 				leftmask |= bitmask;
978 				if (node->leftnode == LEAF) {
979 					assert(node->left);
980 					*tree->leaf_index(tree, node->left) =
981 									index;
982 					index += tree->leaf_size(node->left);
983 					count++;
984 				} else if (node->left) {
985 					assert(node->leftnode == NODE);
986 					indent += 1;
987 					node = node->left;
988 					break;
989 				}
990 			}
991 			if (node->mark && (rightmask & bitmask) == 0) {
992 				rightmask |= bitmask;
993 				if (node->rightnode == LEAF) {
994 					assert(node->right);
995 					*tree->leaf_index(tree, node->right) = index;
996 					index += tree->leaf_size(node->right);
997 					count++;
998 				} else if (node->right) {
999 					assert(node->rightnode == NODE);
1000 					indent += 1;
1001 					node = node->right;
1002 					break;
1003 				}
1004 			}
1005 			leftmask &= ~bitmask;
1006 			rightmask &= ~bitmask;
1007 			node = node->parent;
1008 			indent -= 1;
1009 		}
1010 	}
1011 done:
1012 	/* Round up to a multiple of 16 */
1013 	while (index % 16)
1014 		index++;
1015 	if (verbose > 0)
1016 		printf("Final index %d\n", index);
1017 	return index;
1018 }
1019 
1020 /*
1021  * Mark the nodes in a subtree, helper for size_nodes().
1022  */
mark_subtree(struct node * node)1023 static int mark_subtree(struct node *node)
1024 {
1025 	int changed;
1026 
1027 	if (!node || node->mark)
1028 		return 0;
1029 	node->mark = 1;
1030 	node->index = node->parent->index;
1031 	changed = 1;
1032 	if (node->leftnode == NODE)
1033 		changed += mark_subtree(node->left);
1034 	if (node->rightnode == NODE)
1035 		changed += mark_subtree(node->right);
1036 	return changed;
1037 }
1038 
1039 /*
1040  * Compute the size of nodes and leaves. We start by assuming that
1041  * each node needs to store a three-byte offset. The indexes of the
1042  * nodes are calculated based on that, and then this function is
1043  * called to see if the sizes of some nodes can be reduced.  This is
1044  * repeated until no more changes are seen.
1045  */
size_nodes(struct tree * tree)1046 static int size_nodes(struct tree *tree)
1047 {
1048 	struct tree *next;
1049 	struct node *node;
1050 	struct node *right;
1051 	struct node *n;
1052 	unsigned int leftmask;
1053 	unsigned int rightmask;
1054 	unsigned int bitmask;
1055 	unsigned int pathbits;
1056 	unsigned int pathmask;
1057 	unsigned int nbit;
1058 	int changed;
1059 	int offset;
1060 	int size;
1061 	int indent;
1062 
1063 	indent = 1;
1064 	changed = 0;
1065 	size = 0;
1066 
1067 	if (verbose > 0)
1068 		printf("Sizing %s_%x\n", tree->type, tree->maxage);
1069 	if (tree->childnode == LEAF)
1070 		goto done;
1071 
1072 	assert(tree->childnode == NODE);
1073 	pathbits = 0;
1074 	pathmask = 0;
1075 	node = tree->root;
1076 	leftmask = rightmask = 0;
1077 	while (node) {
1078 		if (!node->mark)
1079 			goto skip;
1080 		offset = 0;
1081 		if (!node->left || !node->right) {
1082 			size = 1;
1083 		} else {
1084 			if (node->rightnode == NODE) {
1085 				/*
1086 				 * If the right node is not marked,
1087 				 * look for a corresponding node in
1088 				 * the next tree.  Such a node need
1089 				 * not exist.
1090 				 */
1091 				right = node->right;
1092 				next = tree->next;
1093 				while (!right->mark) {
1094 					assert(next);
1095 					n = next->root;
1096 					while (n->bitnum != node->bitnum) {
1097 						nbit = 1 << n->bitnum;
1098 						if (!(pathmask & nbit))
1099 							break;
1100 						if (pathbits & nbit) {
1101 							if (n->rightnode == LEAF)
1102 								break;
1103 							n = n->right;
1104 						} else {
1105 							if (n->leftnode == LEAF)
1106 								break;
1107 							n = n->left;
1108 						}
1109 					}
1110 					if (n->bitnum != node->bitnum)
1111 						break;
1112 					n = n->right;
1113 					right = n;
1114 					next = next->next;
1115 				}
1116 				/* Make sure the right node is marked. */
1117 				if (!right->mark)
1118 					changed += mark_subtree(right);
1119 				offset = right->index - node->index;
1120 			} else {
1121 				offset = *tree->leaf_index(tree, node->right);
1122 				offset -= node->index;
1123 			}
1124 			assert(offset >= 0);
1125 			assert(offset <= 0xffffff);
1126 			if (offset <= 0xff) {
1127 				size = 2;
1128 			} else if (offset <= 0xffff) {
1129 				size = 3;
1130 			} else { /* offset <= 0xffffff */
1131 				size = 4;
1132 			}
1133 		}
1134 		if (node->size != size || node->offset != offset) {
1135 			node->size = size;
1136 			node->offset = offset;
1137 			changed++;
1138 		}
1139 skip:
1140 		while (node) {
1141 			bitmask = 1 << node->bitnum;
1142 			pathmask |= bitmask;
1143 			if (node->mark && (leftmask & bitmask) == 0) {
1144 				leftmask |= bitmask;
1145 				if (node->leftnode == LEAF) {
1146 					assert(node->left);
1147 				} else if (node->left) {
1148 					assert(node->leftnode == NODE);
1149 					indent += 1;
1150 					node = node->left;
1151 					break;
1152 				}
1153 			}
1154 			if (node->mark && (rightmask & bitmask) == 0) {
1155 				rightmask |= bitmask;
1156 				pathbits |= bitmask;
1157 				if (node->rightnode == LEAF) {
1158 					assert(node->right);
1159 				} else if (node->right) {
1160 					assert(node->rightnode == NODE);
1161 					indent += 1;
1162 					node = node->right;
1163 					break;
1164 				}
1165 			}
1166 			leftmask &= ~bitmask;
1167 			rightmask &= ~bitmask;
1168 			pathmask &= ~bitmask;
1169 			pathbits &= ~bitmask;
1170 			node = node->parent;
1171 			indent -= 1;
1172 		}
1173 	}
1174 done:
1175 	if (verbose > 0)
1176 		printf("Found %d changes\n", changed);
1177 	return changed;
1178 }
1179 
1180 /*
1181  * Emit a trie for the given tree into the data array.
1182  */
emit(struct tree * tree,unsigned char * data)1183 static void emit(struct tree *tree, unsigned char *data)
1184 {
1185 	struct node *node;
1186 	unsigned int leftmask;
1187 	unsigned int rightmask;
1188 	unsigned int bitmask;
1189 	int offlen;
1190 	int offset;
1191 	int index;
1192 	int indent;
1193 	int size;
1194 	int bytes;
1195 	int leaves;
1196 	int nodes[4];
1197 	unsigned char byte;
1198 
1199 	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1200 	leaves = 0;
1201 	bytes = 0;
1202 	index = tree->index;
1203 	data += index;
1204 	indent = 1;
1205 	if (verbose > 0)
1206 		printf("Emitting %s_%x\n", tree->type, tree->maxage);
1207 	if (tree->childnode == LEAF) {
1208 		assert(tree->root);
1209 		tree->leaf_emit(tree->root, data);
1210 		size = tree->leaf_size(tree->root);
1211 		index += size;
1212 		leaves++;
1213 		goto done;
1214 	}
1215 
1216 	assert(tree->childnode == NODE);
1217 	node = tree->root;
1218 	leftmask = rightmask = 0;
1219 	while (node) {
1220 		if (!node->mark)
1221 			goto skip;
1222 		assert(node->offset != -1);
1223 		assert(node->index == index);
1224 
1225 		byte = 0;
1226 		if (node->nextbyte)
1227 			byte |= NEXTBYTE;
1228 		byte |= (node->bitnum & BITNUM);
1229 		if (node->left && node->right) {
1230 			if (node->leftnode == NODE)
1231 				byte |= LEFTNODE;
1232 			if (node->rightnode == NODE)
1233 				byte |= RIGHTNODE;
1234 			if (node->offset <= 0xff)
1235 				offlen = 1;
1236 			else if (node->offset <= 0xffff)
1237 				offlen = 2;
1238 			else
1239 				offlen = 3;
1240 			nodes[offlen]++;
1241 			offset = node->offset;
1242 			byte |= offlen << OFFLEN_SHIFT;
1243 			*data++ = byte;
1244 			index++;
1245 			while (offlen--) {
1246 				*data++ = offset & 0xff;
1247 				index++;
1248 				offset >>= 8;
1249 			}
1250 		} else if (node->left) {
1251 			if (node->leftnode == NODE)
1252 				byte |= TRIENODE;
1253 			nodes[0]++;
1254 			*data++ = byte;
1255 			index++;
1256 		} else if (node->right) {
1257 			byte |= RIGHTNODE;
1258 			if (node->rightnode == NODE)
1259 				byte |= TRIENODE;
1260 			nodes[0]++;
1261 			*data++ = byte;
1262 			index++;
1263 		} else {
1264 			assert(0);
1265 		}
1266 skip:
1267 		while (node) {
1268 			bitmask = 1 << node->bitnum;
1269 			if (node->mark && (leftmask & bitmask) == 0) {
1270 				leftmask |= bitmask;
1271 				if (node->leftnode == LEAF) {
1272 					assert(node->left);
1273 					data = tree->leaf_emit(node->left,
1274 							       data);
1275 					size = tree->leaf_size(node->left);
1276 					index += size;
1277 					bytes += size;
1278 					leaves++;
1279 				} else if (node->left) {
1280 					assert(node->leftnode == NODE);
1281 					indent += 1;
1282 					node = node->left;
1283 					break;
1284 				}
1285 			}
1286 			if (node->mark && (rightmask & bitmask) == 0) {
1287 				rightmask |= bitmask;
1288 				if (node->rightnode == LEAF) {
1289 					assert(node->right);
1290 					data = tree->leaf_emit(node->right,
1291 							       data);
1292 					size = tree->leaf_size(node->right);
1293 					index += size;
1294 					bytes += size;
1295 					leaves++;
1296 				} else if (node->right) {
1297 					assert(node->rightnode == NODE);
1298 					indent += 1;
1299 					node = node->right;
1300 					break;
1301 				}
1302 			}
1303 			leftmask &= ~bitmask;
1304 			rightmask &= ~bitmask;
1305 			node = node->parent;
1306 			indent -= 1;
1307 		}
1308 	}
1309 done:
1310 	if (verbose > 0) {
1311 		printf("Emitted %d (%d) leaves",
1312 			leaves, bytes);
1313 		printf(" %d (%d+%d+%d+%d) nodes",
1314 			nodes[0] + nodes[1] + nodes[2] + nodes[3],
1315 			nodes[0], nodes[1], nodes[2], nodes[3]);
1316 		printf(" %d total\n", index - tree->index);
1317 	}
1318 }
1319 
1320 /* ------------------------------------------------------------------ */
1321 
1322 /*
1323  * Unicode data.
1324  *
1325  * We need to keep track of the Canonical Combining Class, the Age,
1326  * and decompositions for a code point.
1327  *
1328  * For the Age, we store the index into the ages table.  Effectively
1329  * this is a generation number that the table maps to a unicode
1330  * version.
1331  *
1332  * The correction field is used to indicate that this entry is in the
1333  * corrections array, which contains decompositions that were
1334  * corrected in later revisions.  The value of the correction field is
1335  * the Unicode version in which the mapping was corrected.
1336  */
1337 struct unicode_data {
1338 	unsigned int code;
1339 	int ccc;
1340 	int gen;
1341 	int correction;
1342 	unsigned int *utf32nfkdi;
1343 	unsigned int *utf32nfkdicf;
1344 	char *utf8nfkdi;
1345 	char *utf8nfkdicf;
1346 };
1347 
1348 struct unicode_data unicode_data[0x110000];
1349 struct unicode_data *corrections;
1350 int    corrections_count;
1351 
1352 struct tree *nfkdi_tree;
1353 struct tree *nfkdicf_tree;
1354 
1355 struct tree *trees;
1356 int          trees_count;
1357 
1358 /*
1359  * Check the corrections array to see if this entry was corrected at
1360  * some point.
1361  */
corrections_lookup(struct unicode_data * u)1362 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1363 {
1364 	int i;
1365 
1366 	for (i = 0; i != corrections_count; i++)
1367 		if (u->code == corrections[i].code)
1368 			return &corrections[i];
1369 	return u;
1370 }
1371 
nfkdi_equal(void * l,void * r)1372 static int nfkdi_equal(void *l, void *r)
1373 {
1374 	struct unicode_data *left = l;
1375 	struct unicode_data *right = r;
1376 
1377 	if (left->gen != right->gen)
1378 		return 0;
1379 	if (left->ccc != right->ccc)
1380 		return 0;
1381 	if (left->utf8nfkdi && right->utf8nfkdi &&
1382 	    strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1383 		return 1;
1384 	if (left->utf8nfkdi || right->utf8nfkdi)
1385 		return 0;
1386 	return 1;
1387 }
1388 
nfkdicf_equal(void * l,void * r)1389 static int nfkdicf_equal(void *l, void *r)
1390 {
1391 	struct unicode_data *left = l;
1392 	struct unicode_data *right = r;
1393 
1394 	if (left->gen != right->gen)
1395 		return 0;
1396 	if (left->ccc != right->ccc)
1397 		return 0;
1398 	if (left->utf8nfkdicf && right->utf8nfkdicf &&
1399 	    strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0)
1400 		return 1;
1401 	if (left->utf8nfkdicf && right->utf8nfkdicf)
1402 		return 0;
1403 	if (left->utf8nfkdicf || right->utf8nfkdicf)
1404 		return 0;
1405 	if (left->utf8nfkdi && right->utf8nfkdi &&
1406 	    strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1407 		return 1;
1408 	if (left->utf8nfkdi || right->utf8nfkdi)
1409 		return 0;
1410 	return 1;
1411 }
1412 
nfkdi_print(void * l,int indent)1413 static void nfkdi_print(void *l, int indent)
1414 {
1415 	struct unicode_data *leaf = l;
1416 
1417 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1418 		leaf->code, leaf->ccc, leaf->gen);
1419 	if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1420 		printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1421 	else if (leaf->utf8nfkdi)
1422 		printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1423 	printf("\n");
1424 }
1425 
nfkdicf_print(void * l,int indent)1426 static void nfkdicf_print(void *l, int indent)
1427 {
1428 	struct unicode_data *leaf = l;
1429 
1430 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1431 		leaf->code, leaf->ccc, leaf->gen);
1432 	if (leaf->utf8nfkdicf)
1433 		printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
1434 	else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1435 		printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1436 	else if (leaf->utf8nfkdi)
1437 		printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1438 	printf("\n");
1439 }
1440 
nfkdi_mark(void * l)1441 static int nfkdi_mark(void *l)
1442 {
1443 	return 1;
1444 }
1445 
nfkdicf_mark(void * l)1446 static int nfkdicf_mark(void *l)
1447 {
1448 	struct unicode_data *leaf = l;
1449 
1450 	if (leaf->utf8nfkdicf)
1451 		return 1;
1452 	return 0;
1453 }
1454 
correction_mark(void * l)1455 static int correction_mark(void *l)
1456 {
1457 	struct unicode_data *leaf = l;
1458 
1459 	return leaf->correction;
1460 }
1461 
nfkdi_size(void * l)1462 static int nfkdi_size(void *l)
1463 {
1464 	struct unicode_data *leaf = l;
1465 
1466 	int size = 2;
1467 	if (HANGUL_SYLLABLE(leaf->code))
1468 		size += 1;
1469 	else if (leaf->utf8nfkdi)
1470 		size += strlen(leaf->utf8nfkdi) + 1;
1471 	return size;
1472 }
1473 
nfkdicf_size(void * l)1474 static int nfkdicf_size(void *l)
1475 {
1476 	struct unicode_data *leaf = l;
1477 
1478 	int size = 2;
1479 	if (HANGUL_SYLLABLE(leaf->code))
1480 		size += 1;
1481 	else if (leaf->utf8nfkdicf)
1482 		size += strlen(leaf->utf8nfkdicf) + 1;
1483 	else if (leaf->utf8nfkdi)
1484 		size += strlen(leaf->utf8nfkdi) + 1;
1485 	return size;
1486 }
1487 
nfkdi_index(struct tree * tree,void * l)1488 static int *nfkdi_index(struct tree *tree, void *l)
1489 {
1490 	struct unicode_data *leaf = l;
1491 
1492 	return &tree->leafindex[leaf->code];
1493 }
1494 
nfkdicf_index(struct tree * tree,void * l)1495 static int *nfkdicf_index(struct tree *tree, void *l)
1496 {
1497 	struct unicode_data *leaf = l;
1498 
1499 	return &tree->leafindex[leaf->code];
1500 }
1501 
nfkdi_emit(void * l,unsigned char * data)1502 static unsigned char *nfkdi_emit(void *l, unsigned char *data)
1503 {
1504 	struct unicode_data *leaf = l;
1505 	unsigned char *s;
1506 
1507 	*data++ = leaf->gen;
1508 	if (HANGUL_SYLLABLE(leaf->code)) {
1509 		*data++ = DECOMPOSE;
1510 		*data++ = HANGUL;
1511 	} else if (leaf->utf8nfkdi) {
1512 		*data++ = DECOMPOSE;
1513 		s = (unsigned char*)leaf->utf8nfkdi;
1514 		while ((*data++ = *s++) != 0)
1515 			;
1516 	} else {
1517 		*data++ = leaf->ccc;
1518 	}
1519 	return data;
1520 }
1521 
nfkdicf_emit(void * l,unsigned char * data)1522 static unsigned char *nfkdicf_emit(void *l, unsigned char *data)
1523 {
1524 	struct unicode_data *leaf = l;
1525 	unsigned char *s;
1526 
1527 	*data++ = leaf->gen;
1528 	if (HANGUL_SYLLABLE(leaf->code)) {
1529 		*data++ = DECOMPOSE;
1530 		*data++ = HANGUL;
1531 	} else if (leaf->utf8nfkdicf) {
1532 		*data++ = DECOMPOSE;
1533 		s = (unsigned char*)leaf->utf8nfkdicf;
1534 		while ((*data++ = *s++) != 0)
1535 			;
1536 	} else if (leaf->utf8nfkdi) {
1537 		*data++ = DECOMPOSE;
1538 		s = (unsigned char*)leaf->utf8nfkdi;
1539 		while ((*data++ = *s++) != 0)
1540 			;
1541 	} else {
1542 		*data++ = leaf->ccc;
1543 	}
1544 	return data;
1545 }
1546 
utf8_create(struct unicode_data * data)1547 static void utf8_create(struct unicode_data *data)
1548 {
1549 	char utf[18*4+1];
1550 	char *u;
1551 	unsigned int *um;
1552 	int i;
1553 
1554 	if (data->utf8nfkdi) {
1555 		assert(data->utf8nfkdi[0] == HANGUL);
1556 		return;
1557 	}
1558 
1559 	u = utf;
1560 	um = data->utf32nfkdi;
1561 	if (um) {
1562 		for (i = 0; um[i]; i++)
1563 			u += utf8encode(u, um[i]);
1564 		*u = '\0';
1565 		data->utf8nfkdi = strdup(utf);
1566 	}
1567 	u = utf;
1568 	um = data->utf32nfkdicf;
1569 	if (um) {
1570 		for (i = 0; um[i]; i++)
1571 			u += utf8encode(u, um[i]);
1572 		*u = '\0';
1573 		if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf))
1574 			data->utf8nfkdicf = strdup(utf);
1575 	}
1576 }
1577 
utf8_init(void)1578 static void utf8_init(void)
1579 {
1580 	unsigned int unichar;
1581 	int i;
1582 
1583 	for (unichar = 0; unichar != 0x110000; unichar++)
1584 		utf8_create(&unicode_data[unichar]);
1585 
1586 	for (i = 0; i != corrections_count; i++)
1587 		utf8_create(&corrections[i]);
1588 }
1589 
trees_init(void)1590 static void trees_init(void)
1591 {
1592 	struct unicode_data *data;
1593 	unsigned int maxage;
1594 	unsigned int nextage;
1595 	int count;
1596 	int i;
1597 	int j;
1598 
1599 	/* Count the number of different ages. */
1600 	count = 0;
1601 	nextage = (unsigned int)-1;
1602 	do {
1603 		maxage = nextage;
1604 		nextage = 0;
1605 		for (i = 0; i <= corrections_count; i++) {
1606 			data = &corrections[i];
1607 			if (nextage < data->correction &&
1608 			    data->correction < maxage)
1609 				nextage = data->correction;
1610 		}
1611 		count++;
1612 	} while (nextage);
1613 
1614 	/* Two trees per age: nfkdi and nfkdicf */
1615 	trees_count = count * 2;
1616 	trees = calloc(trees_count, sizeof(struct tree));
1617 
1618 	/* Assign ages to the trees. */
1619 	count = trees_count;
1620 	nextage = (unsigned int)-1;
1621 	do {
1622 		maxage = nextage;
1623 		trees[--count].maxage = maxage;
1624 		trees[--count].maxage = maxage;
1625 		nextage = 0;
1626 		for (i = 0; i <= corrections_count; i++) {
1627 			data = &corrections[i];
1628 			if (nextage < data->correction &&
1629 			    data->correction < maxage)
1630 				nextage = data->correction;
1631 		}
1632 	} while (nextage);
1633 
1634 	/* The ages assigned above are off by one. */
1635 	for (i = 0; i != trees_count; i++) {
1636 		j = 0;
1637 		while (ages[j] < trees[i].maxage)
1638 			j++;
1639 		trees[i].maxage = ages[j-1];
1640 	}
1641 
1642 	/* Set up the forwarding between trees. */
1643 	trees[trees_count-2].next = &trees[trees_count-1];
1644 	trees[trees_count-1].leaf_mark = nfkdi_mark;
1645 	trees[trees_count-2].leaf_mark = nfkdicf_mark;
1646 	for (i = 0; i != trees_count-2; i += 2) {
1647 		trees[i].next = &trees[trees_count-2];
1648 		trees[i].leaf_mark = correction_mark;
1649 		trees[i+1].next = &trees[trees_count-1];
1650 		trees[i+1].leaf_mark = correction_mark;
1651 	}
1652 
1653 	/* Assign the callouts. */
1654 	for (i = 0; i != trees_count; i += 2) {
1655 		trees[i].type = "nfkdicf";
1656 		trees[i].leaf_equal = nfkdicf_equal;
1657 		trees[i].leaf_print = nfkdicf_print;
1658 		trees[i].leaf_size = nfkdicf_size;
1659 		trees[i].leaf_index = nfkdicf_index;
1660 		trees[i].leaf_emit = nfkdicf_emit;
1661 
1662 		trees[i+1].type = "nfkdi";
1663 		trees[i+1].leaf_equal = nfkdi_equal;
1664 		trees[i+1].leaf_print = nfkdi_print;
1665 		trees[i+1].leaf_size = nfkdi_size;
1666 		trees[i+1].leaf_index = nfkdi_index;
1667 		trees[i+1].leaf_emit = nfkdi_emit;
1668 	}
1669 
1670 	/* Finish init. */
1671 	for (i = 0; i != trees_count; i++)
1672 		trees[i].childnode = NODE;
1673 }
1674 
trees_populate(void)1675 static void trees_populate(void)
1676 {
1677 	struct unicode_data *data;
1678 	unsigned int unichar;
1679 	char keyval[4];
1680 	int keylen;
1681 	int i;
1682 
1683 	for (i = 0; i != trees_count; i++) {
1684 		if (verbose > 0) {
1685 			printf("Populating %s_%x\n",
1686 				trees[i].type, trees[i].maxage);
1687 		}
1688 		for (unichar = 0; unichar != 0x110000; unichar++) {
1689 			if (unicode_data[unichar].gen < 0)
1690 				continue;
1691 			keylen = utf8encode(keyval, unichar);
1692 			data = corrections_lookup(&unicode_data[unichar]);
1693 			if (data->correction <= trees[i].maxage)
1694 				data = &unicode_data[unichar];
1695 			insert(&trees[i], keyval, keylen, data);
1696 		}
1697 	}
1698 }
1699 
trees_reduce(void)1700 static void trees_reduce(void)
1701 {
1702 	int i;
1703 	int size;
1704 	int changed;
1705 
1706 	for (i = 0; i != trees_count; i++)
1707 		prune(&trees[i]);
1708 	for (i = 0; i != trees_count; i++)
1709 		mark_nodes(&trees[i]);
1710 	do {
1711 		size = 0;
1712 		for (i = 0; i != trees_count; i++)
1713 			size = index_nodes(&trees[i], size);
1714 		changed = 0;
1715 		for (i = 0; i != trees_count; i++)
1716 			changed += size_nodes(&trees[i]);
1717 	} while (changed);
1718 
1719 	utf8data = calloc(size, 1);
1720 	utf8data_size = size;
1721 	for (i = 0; i != trees_count; i++)
1722 		emit(&trees[i], utf8data);
1723 
1724 	if (verbose > 0) {
1725 		for (i = 0; i != trees_count; i++) {
1726 			printf("%s_%x idx %d\n",
1727 				trees[i].type, trees[i].maxage, trees[i].index);
1728 		}
1729 	}
1730 
1731 	nfkdi = utf8data + trees[trees_count-1].index;
1732 	nfkdicf = utf8data + trees[trees_count-2].index;
1733 
1734 	nfkdi_tree = &trees[trees_count-1];
1735 	nfkdicf_tree = &trees[trees_count-2];
1736 }
1737 
verify(struct tree * tree)1738 static void verify(struct tree *tree)
1739 {
1740 	struct unicode_data *data;
1741 	utf8leaf_t	*leaf;
1742 	unsigned int	unichar;
1743 	char		key[4];
1744 	unsigned char	hangul[UTF8HANGULLEAF];
1745 	int		report;
1746 	int		nocf;
1747 
1748 	if (verbose > 0)
1749 		printf("Verifying %s_%x\n", tree->type, tree->maxage);
1750 	nocf = strcmp(tree->type, "nfkdicf");
1751 
1752 	for (unichar = 0; unichar != 0x110000; unichar++) {
1753 		report = 0;
1754 		data = corrections_lookup(&unicode_data[unichar]);
1755 		if (data->correction <= tree->maxage)
1756 			data = &unicode_data[unichar];
1757 		utf8encode(key,unichar);
1758 		leaf = utf8lookup(tree, hangul, key);
1759 
1760 		if (!leaf) {
1761 			if (data->gen != -1)
1762 				report++;
1763 			if (unichar < 0xd800 || unichar > 0xdfff)
1764 				report++;
1765 		} else {
1766 			if (unichar >= 0xd800 && unichar <= 0xdfff)
1767 				report++;
1768 			if (data->gen == -1)
1769 				report++;
1770 			if (data->gen != LEAF_GEN(leaf))
1771 				report++;
1772 			if (LEAF_CCC(leaf) == DECOMPOSE) {
1773 				if (HANGUL_SYLLABLE(data->code)) {
1774 					if (data->utf8nfkdi[0] != HANGUL)
1775 						report++;
1776 				} else if (nocf) {
1777 					if (!data->utf8nfkdi) {
1778 						report++;
1779 					} else if (strcmp(data->utf8nfkdi,
1780 							  LEAF_STR(leaf))) {
1781 						report++;
1782 					}
1783 				} else {
1784 					if (!data->utf8nfkdicf &&
1785 					    !data->utf8nfkdi) {
1786 						report++;
1787 					} else if (data->utf8nfkdicf) {
1788 						if (strcmp(data->utf8nfkdicf,
1789 							   LEAF_STR(leaf)))
1790 							report++;
1791 					} else if (strcmp(data->utf8nfkdi,
1792 							  LEAF_STR(leaf))) {
1793 						report++;
1794 					}
1795 				}
1796 			} else if (data->ccc != LEAF_CCC(leaf)) {
1797 				report++;
1798 			}
1799 		}
1800 		if (report) {
1801 			printf("%X code %X gen %d ccc %d"
1802 				" nfkdi -> \"%s\"",
1803 				unichar, data->code, data->gen,
1804 				data->ccc,
1805 				data->utf8nfkdi);
1806 			if (leaf) {
1807 				printf(" gen %d ccc %d"
1808 					" nfkdi -> \"%s\"",
1809 					LEAF_GEN(leaf),
1810 					LEAF_CCC(leaf),
1811 					LEAF_CCC(leaf) == DECOMPOSE ?
1812 						LEAF_STR(leaf) : "");
1813 			}
1814 			printf("\n");
1815 		}
1816 	}
1817 }
1818 
trees_verify(void)1819 static void trees_verify(void)
1820 {
1821 	int i;
1822 
1823 	for (i = 0; i != trees_count; i++)
1824 		verify(&trees[i]);
1825 }
1826 
1827 /* ------------------------------------------------------------------ */
1828 
help(void)1829 static void help(void)
1830 {
1831 	printf("Usage: %s [options]\n", argv0);
1832 	printf("\n");
1833 	printf("This program creates an a data trie used for parsing and\n");
1834 	printf("normalization of UTF-8 strings. The trie is derived from\n");
1835 	printf("a set of input files from the Unicode character database\n");
1836 	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1837 	printf("\n");
1838 	printf("The generated tree supports two normalization forms:\n");
1839 	printf("\n");
1840 	printf("\tnfkdi:\n");
1841 	printf("\t- Apply unicode normalization form NFKD.\n");
1842 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1843 	printf("\n");
1844 	printf("\tnfkdicf:\n");
1845 	printf("\t- Apply unicode normalization form NFKD.\n");
1846 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1847 	printf("\t- Apply a full casefold (C + F).\n");
1848 	printf("\n");
1849 	printf("These forms were chosen as being most useful when dealing\n");
1850 	printf("with file names: NFKD catches most cases where characters\n");
1851 	printf("should be considered equivalent. The ignorables are mostly\n");
1852 	printf("invisible, making names hard to type.\n");
1853 	printf("\n");
1854 	printf("The options to specify the files to be used are listed\n");
1855 	printf("below with their default values, which are the names used\n");
1856 	printf("by version 11.0.0 of the Unicode Character Database.\n");
1857 	printf("\n");
1858 	printf("The input files:\n");
1859 	printf("\t-a %s\n", AGE_NAME);
1860 	printf("\t-c %s\n", CCC_NAME);
1861 	printf("\t-p %s\n", PROP_NAME);
1862 	printf("\t-d %s\n", DATA_NAME);
1863 	printf("\t-f %s\n", FOLD_NAME);
1864 	printf("\t-n %s\n", NORM_NAME);
1865 	printf("\n");
1866 	printf("Additionally, the generated tables are tested using:\n");
1867 	printf("\t-t %s\n", TEST_NAME);
1868 	printf("\n");
1869 	printf("Finally, the output file:\n");
1870 	printf("\t-o %s\n", UTF8_NAME);
1871 	printf("\n");
1872 }
1873 
usage(void)1874 static void usage(void)
1875 {
1876 	help();
1877 	exit(1);
1878 }
1879 
open_fail(const char * name,int error)1880 static void open_fail(const char *name, int error)
1881 {
1882 	printf("Error %d opening %s: %s\n", error, name, strerror(error));
1883 	exit(1);
1884 }
1885 
file_fail(const char * filename)1886 static void file_fail(const char *filename)
1887 {
1888 	printf("Error parsing %s\n", filename);
1889 	exit(1);
1890 }
1891 
line_fail(const char * filename,const char * line)1892 static void line_fail(const char *filename, const char *line)
1893 {
1894 	printf("Error parsing %s:%s\n", filename, line);
1895 	exit(1);
1896 }
1897 
1898 /* ------------------------------------------------------------------ */
1899 
print_utf32(unsigned int * utf32str)1900 static void print_utf32(unsigned int *utf32str)
1901 {
1902 	int	i;
1903 
1904 	for (i = 0; utf32str[i]; i++)
1905 		printf(" %X", utf32str[i]);
1906 }
1907 
print_utf32nfkdi(unsigned int unichar)1908 static void print_utf32nfkdi(unsigned int unichar)
1909 {
1910 	printf(" %X ->", unichar);
1911 	print_utf32(unicode_data[unichar].utf32nfkdi);
1912 	printf("\n");
1913 }
1914 
print_utf32nfkdicf(unsigned int unichar)1915 static void print_utf32nfkdicf(unsigned int unichar)
1916 {
1917 	printf(" %X ->", unichar);
1918 	print_utf32(unicode_data[unichar].utf32nfkdicf);
1919 	printf("\n");
1920 }
1921 
1922 /* ------------------------------------------------------------------ */
1923 
age_init(void)1924 static void age_init(void)
1925 {
1926 	FILE *file;
1927 	unsigned int first;
1928 	unsigned int last;
1929 	unsigned int unichar;
1930 	unsigned int major;
1931 	unsigned int minor;
1932 	unsigned int revision;
1933 	int gen;
1934 	int count;
1935 	int ret;
1936 
1937 	if (verbose > 0)
1938 		printf("Parsing %s\n", age_name);
1939 
1940 	file = fopen(age_name, "r");
1941 	if (!file)
1942 		open_fail(age_name, errno);
1943 	count = 0;
1944 
1945 	gen = 0;
1946 	while (fgets(line, LINESIZE, file)) {
1947 		ret = sscanf(line, "# Age=V%d_%d_%d",
1948 				&major, &minor, &revision);
1949 		if (ret == 3) {
1950 			ages_count++;
1951 			if (verbose > 1)
1952 				printf(" Age V%d_%d_%d\n",
1953 					major, minor, revision);
1954 			if (!age_valid(major, minor, revision))
1955 				line_fail(age_name, line);
1956 			continue;
1957 		}
1958 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1959 		if (ret == 2) {
1960 			ages_count++;
1961 			if (verbose > 1)
1962 				printf(" Age V%d_%d\n", major, minor);
1963 			if (!age_valid(major, minor, 0))
1964 				line_fail(age_name, line);
1965 			continue;
1966 		}
1967 	}
1968 
1969 	/* We must have found something above. */
1970 	if (verbose > 1)
1971 		printf("%d age entries\n", ages_count);
1972 	if (ages_count == 0 || ages_count > MAXGEN)
1973 		file_fail(age_name);
1974 
1975 	/* There is a 0 entry. */
1976 	ages_count++;
1977 	ages = calloc(ages_count + 1, sizeof(*ages));
1978 	/* And a guard entry. */
1979 	ages[ages_count] = (unsigned int)-1;
1980 
1981 	rewind(file);
1982 	count = 0;
1983 	gen = 0;
1984 	while (fgets(line, LINESIZE, file)) {
1985 		ret = sscanf(line, "# Age=V%d_%d_%d",
1986 				&major, &minor, &revision);
1987 		if (ret == 3) {
1988 			ages[++gen] =
1989 				UNICODE_AGE(major, minor, revision);
1990 			if (verbose > 1)
1991 				printf(" Age V%d_%d_%d = gen %d\n",
1992 					major, minor, revision, gen);
1993 			if (!age_valid(major, minor, revision))
1994 				line_fail(age_name, line);
1995 			continue;
1996 		}
1997 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1998 		if (ret == 2) {
1999 			ages[++gen] = UNICODE_AGE(major, minor, 0);
2000 			if (verbose > 1)
2001 				printf(" Age V%d_%d = %d\n",
2002 					major, minor, gen);
2003 			if (!age_valid(major, minor, 0))
2004 				line_fail(age_name, line);
2005 			continue;
2006 		}
2007 		ret = sscanf(line, "%X..%X ; %d.%d #",
2008 			     &first, &last, &major, &minor);
2009 		if (ret == 4) {
2010 			for (unichar = first; unichar <= last; unichar++)
2011 				unicode_data[unichar].gen = gen;
2012 			count += 1 + last - first;
2013 			if (verbose > 1)
2014 				printf("  %X..%X gen %d\n", first, last, gen);
2015 			if (!utf32valid(first) || !utf32valid(last))
2016 				line_fail(age_name, line);
2017 			continue;
2018 		}
2019 		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2020 		if (ret == 3) {
2021 			unicode_data[unichar].gen = gen;
2022 			count++;
2023 			if (verbose > 1)
2024 				printf("  %X gen %d\n", unichar, gen);
2025 			if (!utf32valid(unichar))
2026 				line_fail(age_name, line);
2027 			continue;
2028 		}
2029 	}
2030 	unicode_maxage = ages[gen];
2031 	fclose(file);
2032 
2033 	/* Nix surrogate block */
2034 	if (verbose > 1)
2035 		printf(" Removing surrogate block D800..DFFF\n");
2036 	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2037 		unicode_data[unichar].gen = -1;
2038 
2039 	if (verbose > 0)
2040 	        printf("Found %d entries\n", count);
2041 	if (count == 0)
2042 		file_fail(age_name);
2043 }
2044 
ccc_init(void)2045 static void ccc_init(void)
2046 {
2047 	FILE *file;
2048 	unsigned int first;
2049 	unsigned int last;
2050 	unsigned int unichar;
2051 	unsigned int value;
2052 	int count;
2053 	int ret;
2054 
2055 	if (verbose > 0)
2056 		printf("Parsing %s\n", ccc_name);
2057 
2058 	file = fopen(ccc_name, "r");
2059 	if (!file)
2060 		open_fail(ccc_name, errno);
2061 
2062 	count = 0;
2063 	while (fgets(line, LINESIZE, file)) {
2064 		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2065 		if (ret == 3) {
2066 			for (unichar = first; unichar <= last; unichar++) {
2067 				unicode_data[unichar].ccc = value;
2068                                 count++;
2069 			}
2070 			if (verbose > 1)
2071 				printf(" %X..%X ccc %d\n", first, last, value);
2072 			if (!utf32valid(first) || !utf32valid(last))
2073 				line_fail(ccc_name, line);
2074 			continue;
2075 		}
2076 		ret = sscanf(line, "%X ; %d #", &unichar, &value);
2077 		if (ret == 2) {
2078 			unicode_data[unichar].ccc = value;
2079                         count++;
2080 			if (verbose > 1)
2081 				printf(" %X ccc %d\n", unichar, value);
2082 			if (!utf32valid(unichar))
2083 				line_fail(ccc_name, line);
2084 			continue;
2085 		}
2086 	}
2087 	fclose(file);
2088 
2089 	if (verbose > 0)
2090 		printf("Found %d entries\n", count);
2091 	if (count == 0)
2092 		file_fail(ccc_name);
2093 }
2094 
nfkdi_init(void)2095 static void nfkdi_init(void)
2096 {
2097 	FILE *file;
2098 	unsigned int unichar;
2099 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2100 	char *s;
2101 	unsigned int *um;
2102 	int count;
2103 	int i;
2104 	int ret;
2105 
2106 	if (verbose > 0)
2107 		printf("Parsing %s\n", data_name);
2108 	file = fopen(data_name, "r");
2109 	if (!file)
2110 		open_fail(data_name, errno);
2111 
2112 	count = 0;
2113 	while (fgets(line, LINESIZE, file)) {
2114 		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2115 			     &unichar, buf0);
2116 		if (ret != 2)
2117 			continue;
2118 		if (!utf32valid(unichar))
2119 			line_fail(data_name, line);
2120 
2121 		s = buf0;
2122 		/* skip over <tag> */
2123 		if (*s == '<')
2124 			while (*s++ != ' ')
2125 				;
2126 		/* decode the decomposition into UTF-32 */
2127 		i = 0;
2128 		while (*s) {
2129 			mapping[i] = strtoul(s, &s, 16);
2130 			if (!utf32valid(mapping[i]))
2131 				line_fail(data_name, line);
2132 			i++;
2133 		}
2134 		mapping[i++] = 0;
2135 
2136 		um = malloc(i * sizeof(unsigned int));
2137 		memcpy(um, mapping, i * sizeof(unsigned int));
2138 		unicode_data[unichar].utf32nfkdi = um;
2139 
2140 		if (verbose > 1)
2141 			print_utf32nfkdi(unichar);
2142 		count++;
2143 	}
2144 	fclose(file);
2145 	if (verbose > 0)
2146 		printf("Found %d entries\n", count);
2147 	if (count == 0)
2148 		file_fail(data_name);
2149 }
2150 
nfkdicf_init(void)2151 static void nfkdicf_init(void)
2152 {
2153 	FILE *file;
2154 	unsigned int unichar;
2155 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2156 	char status;
2157 	char *s;
2158 	unsigned int *um;
2159 	int i;
2160 	int count;
2161 	int ret;
2162 
2163 	if (verbose > 0)
2164 		printf("Parsing %s\n", fold_name);
2165 	file = fopen(fold_name, "r");
2166 	if (!file)
2167 		open_fail(fold_name, errno);
2168 
2169 	count = 0;
2170 	while (fgets(line, LINESIZE, file)) {
2171 		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2172 		if (ret != 3)
2173 			continue;
2174 		if (!utf32valid(unichar))
2175 			line_fail(fold_name, line);
2176 		/* Use the C+F casefold. */
2177 		if (status != 'C' && status != 'F')
2178 			continue;
2179 		s = buf0;
2180 		if (*s == '<')
2181 			while (*s++ != ' ')
2182 				;
2183 		i = 0;
2184 		while (*s) {
2185 			mapping[i] = strtoul(s, &s, 16);
2186 			if (!utf32valid(mapping[i]))
2187 				line_fail(fold_name, line);
2188 			i++;
2189 		}
2190 		mapping[i++] = 0;
2191 
2192 		um = malloc(i * sizeof(unsigned int));
2193 		memcpy(um, mapping, i * sizeof(unsigned int));
2194 		unicode_data[unichar].utf32nfkdicf = um;
2195 
2196 		if (verbose > 1)
2197 			print_utf32nfkdicf(unichar);
2198 		count++;
2199 	}
2200 	fclose(file);
2201 	if (verbose > 0)
2202 		printf("Found %d entries\n", count);
2203 	if (count == 0)
2204 		file_fail(fold_name);
2205 }
2206 
ignore_init(void)2207 static void ignore_init(void)
2208 {
2209 	FILE *file;
2210 	unsigned int unichar;
2211 	unsigned int first;
2212 	unsigned int last;
2213 	unsigned int *um;
2214 	int count;
2215 	int ret;
2216 
2217 	if (verbose > 0)
2218 		printf("Parsing %s\n", prop_name);
2219 	file = fopen(prop_name, "r");
2220 	if (!file)
2221 		open_fail(prop_name, errno);
2222 	assert(file);
2223 	count = 0;
2224 	while (fgets(line, LINESIZE, file)) {
2225 		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2226 		if (ret == 3) {
2227 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2228 				continue;
2229 			if (!utf32valid(first) || !utf32valid(last))
2230 				line_fail(prop_name, line);
2231 			for (unichar = first; unichar <= last; unichar++) {
2232 				free(unicode_data[unichar].utf32nfkdi);
2233 				um = malloc(sizeof(unsigned int));
2234 				*um = 0;
2235 				unicode_data[unichar].utf32nfkdi = um;
2236 				free(unicode_data[unichar].utf32nfkdicf);
2237 				um = malloc(sizeof(unsigned int));
2238 				*um = 0;
2239 				unicode_data[unichar].utf32nfkdicf = um;
2240 				count++;
2241 			}
2242 			if (verbose > 1)
2243 				printf(" %X..%X Default_Ignorable_Code_Point\n",
2244 					first, last);
2245 			continue;
2246 		}
2247 		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2248 		if (ret == 2) {
2249 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2250 				continue;
2251 			if (!utf32valid(unichar))
2252 				line_fail(prop_name, line);
2253 			free(unicode_data[unichar].utf32nfkdi);
2254 			um = malloc(sizeof(unsigned int));
2255 			*um = 0;
2256 			unicode_data[unichar].utf32nfkdi = um;
2257 			free(unicode_data[unichar].utf32nfkdicf);
2258 			um = malloc(sizeof(unsigned int));
2259 			*um = 0;
2260 			unicode_data[unichar].utf32nfkdicf = um;
2261 			if (verbose > 1)
2262 				printf(" %X Default_Ignorable_Code_Point\n",
2263 					unichar);
2264 			count++;
2265 			continue;
2266 		}
2267 	}
2268 	fclose(file);
2269 
2270 	if (verbose > 0)
2271 		printf("Found %d entries\n", count);
2272 	if (count == 0)
2273 		file_fail(prop_name);
2274 }
2275 
corrections_init(void)2276 static void corrections_init(void)
2277 {
2278 	FILE *file;
2279 	unsigned int unichar;
2280 	unsigned int major;
2281 	unsigned int minor;
2282 	unsigned int revision;
2283 	unsigned int age;
2284 	unsigned int *um;
2285 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2286 	char *s;
2287 	int i;
2288 	int count;
2289 	int ret;
2290 
2291 	if (verbose > 0)
2292 		printf("Parsing %s\n", norm_name);
2293 	file = fopen(norm_name, "r");
2294 	if (!file)
2295 		open_fail(norm_name, errno);
2296 
2297 	count = 0;
2298 	while (fgets(line, LINESIZE, file)) {
2299 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2300 				&unichar, buf0, buf1,
2301 				&major, &minor, &revision);
2302 		if (ret != 6)
2303 			continue;
2304 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2305 			line_fail(norm_name, line);
2306 		count++;
2307 	}
2308 	corrections = calloc(count, sizeof(struct unicode_data));
2309 	corrections_count = count;
2310 	rewind(file);
2311 
2312 	count = 0;
2313 	while (fgets(line, LINESIZE, file)) {
2314 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2315 				&unichar, buf0, buf1,
2316 				&major, &minor, &revision);
2317 		if (ret != 6)
2318 			continue;
2319 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2320 			line_fail(norm_name, line);
2321 		corrections[count] = unicode_data[unichar];
2322 		assert(corrections[count].code == unichar);
2323 		age = UNICODE_AGE(major, minor, revision);
2324 		corrections[count].correction = age;
2325 
2326 		i = 0;
2327 		s = buf0;
2328 		while (*s) {
2329 			mapping[i] = strtoul(s, &s, 16);
2330 			if (!utf32valid(mapping[i]))
2331 				line_fail(norm_name, line);
2332 			i++;
2333 		}
2334 		mapping[i++] = 0;
2335 
2336 		um = malloc(i * sizeof(unsigned int));
2337 		memcpy(um, mapping, i * sizeof(unsigned int));
2338 		corrections[count].utf32nfkdi = um;
2339 
2340 		if (verbose > 1)
2341 			printf(" %X -> %s -> %s V%d_%d_%d\n",
2342 				unichar, buf0, buf1, major, minor, revision);
2343 		count++;
2344 	}
2345 	fclose(file);
2346 
2347 	if (verbose > 0)
2348 	        printf("Found %d entries\n", count);
2349 	if (count == 0)
2350 		file_fail(norm_name);
2351 }
2352 
2353 /* ------------------------------------------------------------------ */
2354 
2355 /*
2356  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2357  *
2358  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2359  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2360  *
2361  * SBase = 0xAC00
2362  * LBase = 0x1100
2363  * VBase = 0x1161
2364  * TBase = 0x11A7
2365  * LCount = 19
2366  * VCount = 21
2367  * TCount = 28
2368  * NCount = 588 (VCount * TCount)
2369  * SCount = 11172 (LCount * NCount)
2370  *
2371  * Decomposition:
2372  *   SIndex = s - SBase
2373  *
2374  * LV (Canonical/Full)
2375  *   LIndex = SIndex / NCount
2376  *   VIndex = (Sindex % NCount) / TCount
2377  *   LPart = LBase + LIndex
2378  *   VPart = VBase + VIndex
2379  *
2380  * LVT (Canonical)
2381  *   LVIndex = (SIndex / TCount) * TCount
2382  *   TIndex = (Sindex % TCount)
2383  *   LVPart = SBase + LVIndex
2384  *   TPart = TBase + TIndex
2385  *
2386  * LVT (Full)
2387  *   LIndex = SIndex / NCount
2388  *   VIndex = (Sindex % NCount) / TCount
2389  *   TIndex = (Sindex % TCount)
2390  *   LPart = LBase + LIndex
2391  *   VPart = VBase + VIndex
2392  *   if (TIndex == 0) {
2393  *          d = <LPart, VPart>
2394  *   } else {
2395  *          TPart = TBase + TIndex
2396  *          d = <LPart, VPart, TPart>
2397  *   }
2398  *
2399  */
2400 
hangul_decompose(void)2401 static void hangul_decompose(void)
2402 {
2403 	unsigned int sb = 0xAC00;
2404 	unsigned int lb = 0x1100;
2405 	unsigned int vb = 0x1161;
2406 	unsigned int tb = 0x11a7;
2407 	/* unsigned int lc = 19; */
2408 	unsigned int vc = 21;
2409 	unsigned int tc = 28;
2410 	unsigned int nc = (vc * tc);
2411 	/* unsigned int sc = (lc * nc); */
2412 	unsigned int unichar;
2413 	unsigned int mapping[4];
2414 	unsigned int *um;
2415         int count;
2416 	int i;
2417 
2418 	if (verbose > 0)
2419 		printf("Decomposing hangul\n");
2420 	/* Hangul */
2421 	count = 0;
2422 	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2423 		unsigned int si = unichar - sb;
2424 		unsigned int li = si / nc;
2425 		unsigned int vi = (si % nc) / tc;
2426 		unsigned int ti = si % tc;
2427 
2428 		i = 0;
2429 		mapping[i++] = lb + li;
2430 		mapping[i++] = vb + vi;
2431 		if (ti)
2432 			mapping[i++] = tb + ti;
2433 		mapping[i++] = 0;
2434 
2435 		assert(!unicode_data[unichar].utf32nfkdi);
2436 		um = malloc(i * sizeof(unsigned int));
2437 		memcpy(um, mapping, i * sizeof(unsigned int));
2438 		unicode_data[unichar].utf32nfkdi = um;
2439 
2440 		assert(!unicode_data[unichar].utf32nfkdicf);
2441 		um = malloc(i * sizeof(unsigned int));
2442 		memcpy(um, mapping, i * sizeof(unsigned int));
2443 		unicode_data[unichar].utf32nfkdicf = um;
2444 
2445 		/*
2446 		 * Add a cookie as a reminder that the hangul syllable
2447 		 * decompositions must not be stored in the generated
2448 		 * trie.
2449 		 */
2450 		unicode_data[unichar].utf8nfkdi = malloc(2);
2451 		unicode_data[unichar].utf8nfkdi[0] = HANGUL;
2452 		unicode_data[unichar].utf8nfkdi[1] = '\0';
2453 
2454 		if (verbose > 1)
2455 			print_utf32nfkdi(unichar);
2456 
2457 		count++;
2458 	}
2459 	if (verbose > 0)
2460 		printf("Created %d entries\n", count);
2461 }
2462 
nfkdi_decompose(void)2463 static void nfkdi_decompose(void)
2464 {
2465 	unsigned int unichar;
2466 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2467 	unsigned int *um;
2468 	unsigned int *dc;
2469 	int count;
2470 	int i;
2471 	int j;
2472 	int ret;
2473 
2474 	if (verbose > 0)
2475 		printf("Decomposing nfkdi\n");
2476 
2477 	count = 0;
2478 	for (unichar = 0; unichar != 0x110000; unichar++) {
2479 		if (!unicode_data[unichar].utf32nfkdi)
2480 			continue;
2481 		for (;;) {
2482 			ret = 1;
2483 			i = 0;
2484 			um = unicode_data[unichar].utf32nfkdi;
2485 			while (*um) {
2486 				dc = unicode_data[*um].utf32nfkdi;
2487 				if (dc) {
2488 					for (j = 0; dc[j]; j++)
2489 						mapping[i++] = dc[j];
2490 					ret = 0;
2491 				} else {
2492 					mapping[i++] = *um;
2493 				}
2494 				um++;
2495 			}
2496 			mapping[i++] = 0;
2497 			if (ret)
2498 				break;
2499 			free(unicode_data[unichar].utf32nfkdi);
2500 			um = malloc(i * sizeof(unsigned int));
2501 			memcpy(um, mapping, i * sizeof(unsigned int));
2502 			unicode_data[unichar].utf32nfkdi = um;
2503 		}
2504 		/* Add this decomposition to nfkdicf if there is no entry. */
2505 		if (!unicode_data[unichar].utf32nfkdicf) {
2506 			um = malloc(i * sizeof(unsigned int));
2507 			memcpy(um, mapping, i * sizeof(unsigned int));
2508 			unicode_data[unichar].utf32nfkdicf = um;
2509 		}
2510 		if (verbose > 1)
2511 			print_utf32nfkdi(unichar);
2512 		count++;
2513 	}
2514 	if (verbose > 0)
2515 		printf("Processed %d entries\n", count);
2516 }
2517 
nfkdicf_decompose(void)2518 static void nfkdicf_decompose(void)
2519 {
2520 	unsigned int unichar;
2521 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2522 	unsigned int *um;
2523 	unsigned int *dc;
2524 	int count;
2525 	int i;
2526 	int j;
2527 	int ret;
2528 
2529 	if (verbose > 0)
2530 		printf("Decomposing nfkdicf\n");
2531 	count = 0;
2532 	for (unichar = 0; unichar != 0x110000; unichar++) {
2533 		if (!unicode_data[unichar].utf32nfkdicf)
2534 			continue;
2535 		for (;;) {
2536 			ret = 1;
2537 			i = 0;
2538 			um = unicode_data[unichar].utf32nfkdicf;
2539 			while (*um) {
2540 				dc = unicode_data[*um].utf32nfkdicf;
2541 				if (dc) {
2542 					for (j = 0; dc[j]; j++)
2543 						mapping[i++] = dc[j];
2544 					ret = 0;
2545 				} else {
2546 					mapping[i++] = *um;
2547 				}
2548 				um++;
2549 			}
2550 			mapping[i++] = 0;
2551 			if (ret)
2552 				break;
2553 			free(unicode_data[unichar].utf32nfkdicf);
2554 			um = malloc(i * sizeof(unsigned int));
2555 			memcpy(um, mapping, i * sizeof(unsigned int));
2556 			unicode_data[unichar].utf32nfkdicf = um;
2557 		}
2558 		if (verbose > 1)
2559 			print_utf32nfkdicf(unichar);
2560 		count++;
2561 	}
2562 	if (verbose > 0)
2563 		printf("Processed %d entries\n", count);
2564 }
2565 
2566 /* ------------------------------------------------------------------ */
2567 
2568 int utf8agemax(struct tree *, const char *);
2569 int utf8nagemax(struct tree *, const char *, size_t);
2570 int utf8agemin(struct tree *, const char *);
2571 int utf8nagemin(struct tree *, const char *, size_t);
2572 ssize_t utf8len(struct tree *, const char *);
2573 ssize_t utf8nlen(struct tree *, const char *, size_t);
2574 struct utf8cursor;
2575 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2576 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2577 int utf8byte(struct utf8cursor *);
2578 
2579 /*
2580  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2581  *
2582  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2583  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2584  *
2585  * SBase = 0xAC00
2586  * LBase = 0x1100
2587  * VBase = 0x1161
2588  * TBase = 0x11A7
2589  * LCount = 19
2590  * VCount = 21
2591  * TCount = 28
2592  * NCount = 588 (VCount * TCount)
2593  * SCount = 11172 (LCount * NCount)
2594  *
2595  * Decomposition:
2596  *   SIndex = s - SBase
2597  *
2598  * LV (Canonical/Full)
2599  *   LIndex = SIndex / NCount
2600  *   VIndex = (Sindex % NCount) / TCount
2601  *   LPart = LBase + LIndex
2602  *   VPart = VBase + VIndex
2603  *
2604  * LVT (Canonical)
2605  *   LVIndex = (SIndex / TCount) * TCount
2606  *   TIndex = (Sindex % TCount)
2607  *   LVPart = SBase + LVIndex
2608  *   TPart = TBase + TIndex
2609  *
2610  * LVT (Full)
2611  *   LIndex = SIndex / NCount
2612  *   VIndex = (Sindex % NCount) / TCount
2613  *   TIndex = (Sindex % TCount)
2614  *   LPart = LBase + LIndex
2615  *   VPart = VBase + VIndex
2616  *   if (TIndex == 0) {
2617  *          d = <LPart, VPart>
2618  *   } else {
2619  *          TPart = TBase + TIndex
2620  *          d = <LPart, VPart, TPart>
2621  *   }
2622  */
2623 
2624 /* Constants */
2625 #define SB	(0xAC00)
2626 #define LB	(0x1100)
2627 #define VB	(0x1161)
2628 #define TB	(0x11A7)
2629 #define LC	(19)
2630 #define VC	(21)
2631 #define TC	(28)
2632 #define NC	(VC * TC)
2633 #define SC	(LC * NC)
2634 
2635 /* Algorithmic decomposition of hangul syllable. */
utf8hangul(const char * str,unsigned char * hangul)2636 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2637 {
2638 	unsigned int	si;
2639 	unsigned int	li;
2640 	unsigned int	vi;
2641 	unsigned int	ti;
2642 	unsigned char	*h;
2643 
2644 	/* Calculate the SI, LI, VI, and TI values. */
2645 	si = utf8decode(str) - SB;
2646 	li = si / NC;
2647 	vi = (si % NC) / TC;
2648 	ti = si % TC;
2649 
2650 	/* Fill in base of leaf. */
2651 	h = hangul;
2652 	LEAF_GEN(h) = 2;
2653 	LEAF_CCC(h) = DECOMPOSE;
2654 	h += 2;
2655 
2656 	/* Add LPart, a 3-byte UTF-8 sequence. */
2657 	h += utf8encode((char *)h, li + LB);
2658 
2659 	/* Add VPart, a 3-byte UTF-8 sequence. */
2660 	h += utf8encode((char *)h, vi + VB);
2661 
2662 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
2663 	if (ti)
2664 		h += utf8encode((char *)h, ti + TB);
2665 
2666 	/* Terminate string. */
2667 	h[0] = '\0';
2668 
2669 	return hangul;
2670 }
2671 
2672 /*
2673  * Use trie to scan s, touching at most len bytes.
2674  * Returns the leaf if one exists, NULL otherwise.
2675  *
2676  * A non-NULL return guarantees that the UTF-8 sequence starting at s
2677  * is well-formed and corresponds to a known unicode code point.  The
2678  * shorthand for this will be "is valid UTF-8 unicode".
2679  */
utf8nlookup(struct tree * tree,unsigned char * hangul,const char * s,size_t len)2680 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2681 			       const char *s, size_t len)
2682 {
2683 	utf8trie_t	*trie = utf8data + tree->index;
2684 	int		offlen;
2685 	int		offset;
2686 	int		mask;
2687 	int		node;
2688 
2689 	if (!tree)
2690 		return NULL;
2691 	if (len == 0)
2692 		return NULL;
2693 	node = 1;
2694 	while (node) {
2695 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2696 		if (*trie & NEXTBYTE) {
2697 			if (--len == 0)
2698 				return NULL;
2699 			s++;
2700 		}
2701 		mask = 1 << (*trie & BITNUM);
2702 		if (*s & mask) {
2703 			/* Right leg */
2704 			if (offlen) {
2705 				/* Right node at offset of trie */
2706 				node = (*trie & RIGHTNODE);
2707 				offset = trie[offlen];
2708 				while (--offlen) {
2709 					offset <<= 8;
2710 					offset |= trie[offlen];
2711 				}
2712 				trie += offset;
2713 			} else if (*trie & RIGHTPATH) {
2714 				/* Right node after this node */
2715 				node = (*trie & TRIENODE);
2716 				trie++;
2717 			} else {
2718 				/* No right node. */
2719 				return NULL;
2720 			}
2721 		} else {
2722 			/* Left leg */
2723 			if (offlen) {
2724 				/* Left node after this node. */
2725 				node = (*trie & LEFTNODE);
2726 				trie += offlen + 1;
2727 			} else if (*trie & RIGHTPATH) {
2728 				/* No left node. */
2729 				return NULL;
2730 			} else {
2731 				/* Left node after this node */
2732 				node = (*trie & TRIENODE);
2733 				trie++;
2734 			}
2735 		}
2736 	}
2737 	/*
2738 	 * Hangul decomposition is done algorithmically. These are the
2739 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2740 	 * always 3 bytes long, so s has been advanced twice, and the
2741 	 * start of the sequence is at s-2.
2742 	 */
2743 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2744 		trie = utf8hangul(s - 2, hangul);
2745 	return trie;
2746 }
2747 
2748 /*
2749  * Use trie to scan s.
2750  * Returns the leaf if one exists, NULL otherwise.
2751  *
2752  * Forwards to trie_nlookup().
2753  */
utf8lookup(struct tree * tree,unsigned char * hangul,const char * s)2754 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2755 			      const char *s)
2756 {
2757 	return utf8nlookup(tree, hangul, s, (size_t)-1);
2758 }
2759 
2760 /*
2761  * Return the number of bytes used by the current UTF-8 sequence.
2762  * Assumes the input points to the first byte of a valid UTF-8
2763  * sequence.
2764  */
utf8clen(const char * s)2765 static inline int utf8clen(const char *s)
2766 {
2767 	unsigned char c = *s;
2768 	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2769 }
2770 
2771 /*
2772  * Maximum age of any character in s.
2773  * Return -1 if s is not valid UTF-8 unicode.
2774  * Return 0 if only non-assigned code points are used.
2775  */
utf8agemax(struct tree * tree,const char * s)2776 int utf8agemax(struct tree *tree, const char *s)
2777 {
2778 	utf8leaf_t	*leaf;
2779 	int		age = 0;
2780 	int		leaf_age;
2781 	unsigned char	hangul[UTF8HANGULLEAF];
2782 
2783 	if (!tree)
2784 		return -1;
2785 
2786 	while (*s) {
2787 		leaf = utf8lookup(tree, hangul, s);
2788 		if (!leaf)
2789 			return -1;
2790 		leaf_age = ages[LEAF_GEN(leaf)];
2791 		if (leaf_age <= tree->maxage && leaf_age > age)
2792 			age = leaf_age;
2793 		s += utf8clen(s);
2794 	}
2795 	return age;
2796 }
2797 
2798 /*
2799  * Minimum age of any character in s.
2800  * Return -1 if s is not valid UTF-8 unicode.
2801  * Return 0 if non-assigned code points are used.
2802  */
utf8agemin(struct tree * tree,const char * s)2803 int utf8agemin(struct tree *tree, const char *s)
2804 {
2805 	utf8leaf_t	*leaf;
2806 	int		age;
2807 	int		leaf_age;
2808 	unsigned char	hangul[UTF8HANGULLEAF];
2809 
2810 	if (!tree)
2811 		return -1;
2812 	age = tree->maxage;
2813 	while (*s) {
2814 		leaf = utf8lookup(tree, hangul, s);
2815 		if (!leaf)
2816 			return -1;
2817 		leaf_age = ages[LEAF_GEN(leaf)];
2818 		if (leaf_age <= tree->maxage && leaf_age < age)
2819 			age = leaf_age;
2820 		s += utf8clen(s);
2821 	}
2822 	return age;
2823 }
2824 
2825 /*
2826  * Maximum age of any character in s, touch at most len bytes.
2827  * Return -1 if s is not valid UTF-8 unicode.
2828  */
utf8nagemax(struct tree * tree,const char * s,size_t len)2829 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2830 {
2831 	utf8leaf_t	*leaf;
2832 	int		age = 0;
2833 	int		leaf_age;
2834 	unsigned char	hangul[UTF8HANGULLEAF];
2835 
2836 	if (!tree)
2837 		return -1;
2838 
2839         while (len && *s) {
2840 		leaf = utf8nlookup(tree, hangul, s, len);
2841 		if (!leaf)
2842 			return -1;
2843 		leaf_age = ages[LEAF_GEN(leaf)];
2844 		if (leaf_age <= tree->maxage && leaf_age > age)
2845 			age = leaf_age;
2846 		len -= utf8clen(s);
2847 		s += utf8clen(s);
2848 	}
2849 	return age;
2850 }
2851 
2852 /*
2853  * Maximum age of any character in s, touch at most len bytes.
2854  * Return -1 if s is not valid UTF-8 unicode.
2855  */
utf8nagemin(struct tree * tree,const char * s,size_t len)2856 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2857 {
2858 	utf8leaf_t	*leaf;
2859 	int		leaf_age;
2860 	int		age;
2861 	unsigned char	hangul[UTF8HANGULLEAF];
2862 
2863 	if (!tree)
2864 		return -1;
2865 	age = tree->maxage;
2866         while (len && *s) {
2867 		leaf = utf8nlookup(tree, hangul, s, len);
2868 		if (!leaf)
2869 			return -1;
2870 		leaf_age = ages[LEAF_GEN(leaf)];
2871 		if (leaf_age <= tree->maxage && leaf_age < age)
2872 			age = leaf_age;
2873 		len -= utf8clen(s);
2874 		s += utf8clen(s);
2875 	}
2876 	return age;
2877 }
2878 
2879 /*
2880  * Length of the normalization of s.
2881  * Return -1 if s is not valid UTF-8 unicode.
2882  *
2883  * A string of Default_Ignorable_Code_Point has length 0.
2884  */
utf8len(struct tree * tree,const char * s)2885 ssize_t utf8len(struct tree *tree, const char *s)
2886 {
2887 	utf8leaf_t	*leaf;
2888 	size_t		ret = 0;
2889 	unsigned char	hangul[UTF8HANGULLEAF];
2890 
2891 	if (!tree)
2892 		return -1;
2893 	while (*s) {
2894 		leaf = utf8lookup(tree, hangul, s);
2895 		if (!leaf)
2896 			return -1;
2897 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
2898 			ret += utf8clen(s);
2899 		else if (LEAF_CCC(leaf) == DECOMPOSE)
2900 			ret += strlen(LEAF_STR(leaf));
2901 		else
2902 			ret += utf8clen(s);
2903 		s += utf8clen(s);
2904 	}
2905 	return ret;
2906 }
2907 
2908 /*
2909  * Length of the normalization of s, touch at most len bytes.
2910  * Return -1 if s is not valid UTF-8 unicode.
2911  */
utf8nlen(struct tree * tree,const char * s,size_t len)2912 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2913 {
2914 	utf8leaf_t	*leaf;
2915 	size_t		ret = 0;
2916 	unsigned char	hangul[UTF8HANGULLEAF];
2917 
2918 	if (!tree)
2919 		return -1;
2920 	while (len && *s) {
2921 		leaf = utf8nlookup(tree, hangul, s, len);
2922 		if (!leaf)
2923 			return -1;
2924 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925 			ret += utf8clen(s);
2926 		else if (LEAF_CCC(leaf) == DECOMPOSE)
2927 			ret += strlen(LEAF_STR(leaf));
2928 		else
2929 			ret += utf8clen(s);
2930 		len -= utf8clen(s);
2931 		s += utf8clen(s);
2932 	}
2933 	return ret;
2934 }
2935 
2936 /*
2937  * Cursor structure used by the normalizer.
2938  */
2939 struct utf8cursor {
2940 	struct tree	*tree;
2941 	const char	*s;
2942 	const char	*p;
2943 	const char	*ss;
2944 	const char	*sp;
2945 	unsigned int	len;
2946 	unsigned int	slen;
2947 	short int	ccc;
2948 	short int	nccc;
2949 	unsigned int	unichar;
2950 	unsigned char	hangul[UTF8HANGULLEAF];
2951 };
2952 
2953 /*
2954  * Set up an utf8cursor for use by utf8byte().
2955  *
2956  *   s      : string.
2957  *   len    : length of s.
2958  *   u8c    : pointer to cursor.
2959  *   trie   : utf8trie_t to use for normalization.
2960  *
2961  * Returns -1 on error, 0 on success.
2962  */
utf8ncursor(struct utf8cursor * u8c,struct tree * tree,const char * s,size_t len)2963 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2964 		size_t len)
2965 {
2966 	if (!tree)
2967 		return -1;
2968 	if (!s)
2969 		return -1;
2970 	u8c->tree = tree;
2971 	u8c->s = s;
2972 	u8c->p = NULL;
2973 	u8c->ss = NULL;
2974 	u8c->sp = NULL;
2975 	u8c->len = len;
2976 	u8c->slen = 0;
2977 	u8c->ccc = STOPPER;
2978 	u8c->nccc = STOPPER;
2979 	u8c->unichar = 0;
2980 	/* Check we didn't clobber the maximum length. */
2981 	if (u8c->len != len)
2982 		return -1;
2983 	/* The first byte of s may not be an utf8 continuation. */
2984 	if (len > 0 && (*s & 0xC0) == 0x80)
2985 		return -1;
2986 	return 0;
2987 }
2988 
2989 /*
2990  * Set up an utf8cursor for use by utf8byte().
2991  *
2992  *   s      : NUL-terminated string.
2993  *   u8c    : pointer to cursor.
2994  *   trie   : utf8trie_t to use for normalization.
2995  *
2996  * Returns -1 on error, 0 on success.
2997  */
utf8cursor(struct utf8cursor * u8c,struct tree * tree,const char * s)2998 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
2999 {
3000 	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3001 }
3002 
3003 /*
3004  * Get one byte from the normalized form of the string described by u8c.
3005  *
3006  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3007  *
3008  * The cursor keeps track of the location in the string in u8c->s.
3009  * When a character is decomposed, the current location is stored in
3010  * u8c->p, and u8c->s is set to the start of the decomposition. Note
3011  * that bytes from a decomposition do not count against u8c->len.
3012  *
3013  * Characters are emitted if they match the current CCC in u8c->ccc.
3014  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3015  * and the function returns 0 in that case.
3016  *
3017  * Sorting by CCC is done by repeatedly scanning the string.  The
3018  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3019  * the start of the scan.  The first pass finds the lowest CCC to be
3020  * emitted and stores it in u8c->nccc, the second pass emits the
3021  * characters with this CCC and finds the next lowest CCC. This limits
3022  * the number of passes to 1 + the number of different CCCs in the
3023  * sequence being scanned.
3024  *
3025  * Therefore:
3026  *  u8c->p  != NULL -> a decomposition is being scanned.
3027  *  u8c->ss != NULL -> this is a repeating scan.
3028  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
3029  */
utf8byte(struct utf8cursor * u8c)3030 int utf8byte(struct utf8cursor *u8c)
3031 {
3032 	utf8leaf_t *leaf;
3033 	int ccc;
3034 
3035 	for (;;) {
3036 		/* Check for the end of a decomposed character. */
3037 		if (u8c->p && *u8c->s == '\0') {
3038 			u8c->s = u8c->p;
3039 			u8c->p = NULL;
3040 		}
3041 
3042 		/* Check for end-of-string. */
3043 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3044 			/* There is no next byte. */
3045 			if (u8c->ccc == STOPPER)
3046 				return 0;
3047 			/* End-of-string during a scan counts as a stopper. */
3048 			ccc = STOPPER;
3049 			goto ccc_mismatch;
3050 		} else if ((*u8c->s & 0xC0) == 0x80) {
3051 			/* This is a continuation of the current character. */
3052 			if (!u8c->p)
3053 				u8c->len--;
3054 			return (unsigned char)*u8c->s++;
3055 		}
3056 
3057 		/* Look up the data for the current character. */
3058 		if (u8c->p) {
3059 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3060 		} else {
3061 			leaf = utf8nlookup(u8c->tree, u8c->hangul,
3062 					   u8c->s, u8c->len);
3063 		}
3064 
3065 		/* No leaf found implies that the input is a binary blob. */
3066 		if (!leaf)
3067 			return -1;
3068 
3069 		/* Characters that are too new have CCC 0. */
3070 		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3071 			ccc = STOPPER;
3072 		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3073 			u8c->len -= utf8clen(u8c->s);
3074 			u8c->p = u8c->s + utf8clen(u8c->s);
3075 			u8c->s = LEAF_STR(leaf);
3076 			/* Empty decomposition implies CCC 0. */
3077 			if (*u8c->s == '\0') {
3078 				if (u8c->ccc == STOPPER)
3079 					continue;
3080 				ccc = STOPPER;
3081 				goto ccc_mismatch;
3082 			}
3083 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3084 			ccc = LEAF_CCC(leaf);
3085 		}
3086 		u8c->unichar = utf8decode(u8c->s);
3087 
3088 		/*
3089 		 * If this is not a stopper, then see if it updates
3090 		 * the next canonical class to be emitted.
3091 		 */
3092 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3093 			u8c->nccc = ccc;
3094 
3095 		/*
3096 		 * Return the current byte if this is the current
3097 		 * combining class.
3098 		 */
3099 		if (ccc == u8c->ccc) {
3100 			if (!u8c->p)
3101 				u8c->len--;
3102 			return (unsigned char)*u8c->s++;
3103 		}
3104 
3105 		/* Current combining class mismatch. */
3106 	ccc_mismatch:
3107 		if (u8c->nccc == STOPPER) {
3108 			/*
3109 			 * Scan forward for the first canonical class
3110 			 * to be emitted.  Save the position from
3111 			 * which to restart.
3112 			 */
3113 			assert(u8c->ccc == STOPPER);
3114 			u8c->ccc = MINCCC - 1;
3115 			u8c->nccc = ccc;
3116 			u8c->sp = u8c->p;
3117 			u8c->ss = u8c->s;
3118 			u8c->slen = u8c->len;
3119 			if (!u8c->p)
3120 				u8c->len -= utf8clen(u8c->s);
3121 			u8c->s += utf8clen(u8c->s);
3122 		} else if (ccc != STOPPER) {
3123 			/* Not a stopper, and not the ccc we're emitting. */
3124 			if (!u8c->p)
3125 				u8c->len -= utf8clen(u8c->s);
3126 			u8c->s += utf8clen(u8c->s);
3127 		} else if (u8c->nccc != MAXCCC + 1) {
3128 			/* At a stopper, restart for next ccc. */
3129 			u8c->ccc = u8c->nccc;
3130 			u8c->nccc = MAXCCC + 1;
3131 			u8c->s = u8c->ss;
3132 			u8c->p = u8c->sp;
3133 			u8c->len = u8c->slen;
3134 		} else {
3135 			/* All done, proceed from here. */
3136 			u8c->ccc = STOPPER;
3137 			u8c->nccc = STOPPER;
3138 			u8c->sp = NULL;
3139 			u8c->ss = NULL;
3140 			u8c->slen = 0;
3141 		}
3142 	}
3143 }
3144 
3145 /* ------------------------------------------------------------------ */
3146 
normalize_line(struct tree * tree)3147 static int normalize_line(struct tree *tree)
3148 {
3149 	char *s;
3150 	char *t;
3151 	int c;
3152 	struct utf8cursor u8c;
3153 
3154 	/* First test: null-terminated string. */
3155 	s = buf2;
3156 	t = buf3;
3157 	if (utf8cursor(&u8c, tree, s))
3158 		return -1;
3159 	while ((c = utf8byte(&u8c)) > 0)
3160 		if (c != (unsigned char)*t++)
3161 			return -1;
3162 	if (c < 0)
3163 		return -1;
3164 	if (*t != 0)
3165 		return -1;
3166 
3167 	/* Second test: length-limited string. */
3168 	s = buf2;
3169 	/* Replace NUL with a value that will cause an error if seen. */
3170 	s[strlen(s) + 1] = -1;
3171 	t = buf3;
3172 	if (utf8cursor(&u8c, tree, s))
3173 		return -1;
3174 	while ((c = utf8byte(&u8c)) > 0)
3175 		if (c != (unsigned char)*t++)
3176 			return -1;
3177 	if (c < 0)
3178 		return -1;
3179 	if (*t != 0)
3180 		return -1;
3181 
3182 	return 0;
3183 }
3184 
normalization_test(void)3185 static void normalization_test(void)
3186 {
3187 	FILE *file;
3188 	unsigned int unichar;
3189 	struct unicode_data *data;
3190 	char *s;
3191 	char *t;
3192 	int ret;
3193 	int ignorables;
3194 	int tests = 0;
3195 	int failures = 0;
3196 
3197 	if (verbose > 0)
3198 		printf("Parsing %s\n", test_name);
3199 	/* Step one, read data from file. */
3200 	file = fopen(test_name, "r");
3201 	if (!file)
3202 		open_fail(test_name, errno);
3203 
3204 	while (fgets(line, LINESIZE, file)) {
3205 		ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
3206 			     buf0, buf1);
3207 		if (ret != 2 || *line == '#')
3208 			continue;
3209 		s = buf0;
3210 		t = buf2;
3211 		while (*s) {
3212 			unichar = strtoul(s, &s, 16);
3213 			t += utf8encode(t, unichar);
3214 		}
3215 		*t = '\0';
3216 
3217 		ignorables = 0;
3218 		s = buf1;
3219 		t = buf3;
3220 		while (*s) {
3221 			unichar = strtoul(s, &s, 16);
3222 			data = &unicode_data[unichar];
3223 			if (data->utf8nfkdi && !*data->utf8nfkdi)
3224 				ignorables = 1;
3225 			else
3226 				t += utf8encode(t, unichar);
3227 		}
3228 		*t = '\0';
3229 
3230 		tests++;
3231 		if (normalize_line(nfkdi_tree) < 0) {
3232 			printf("Line %s -> %s", buf0, buf1);
3233 			if (ignorables)
3234 				printf(" (ignorables removed)");
3235 			printf(" failure\n");
3236 			failures++;
3237 		}
3238 	}
3239 	fclose(file);
3240 	if (verbose > 0)
3241 		printf("Ran %d tests with %d failures\n", tests, failures);
3242 	if (failures)
3243 		file_fail(test_name);
3244 }
3245 
3246 /* ------------------------------------------------------------------ */
3247 
write_file(void)3248 static void write_file(void)
3249 {
3250 	FILE *file;
3251 	int i;
3252 	int j;
3253 	int t;
3254 	int gen;
3255 
3256 	if (verbose > 0)
3257 		printf("Writing %s\n", utf8_name);
3258 	file = fopen(utf8_name, "w");
3259 	if (!file)
3260 		open_fail(utf8_name, errno);
3261 
3262 	fprintf(file, "/* This file is generated code, do not edit. */\n");
3263 	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3264 	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3265 	fprintf(file, "#endif\n");
3266 	fprintf(file, "\n");
3267 	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3268 		unicode_maxage);
3269 	fprintf(file, "\n");
3270 	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3271 	for (i = 0; i != ages_count; i++)
3272 		fprintf(file, "\t%#x%s\n", ages[i],
3273 			ages[i] == unicode_maxage ? "" : ",");
3274 	fprintf(file, "};\n");
3275 	fprintf(file, "\n");
3276 	fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n");
3277 	t = 0;
3278 	for (gen = 0; gen < ages_count; gen++) {
3279 		fprintf(file, "\t{ %#x, %d }%s\n",
3280 			ages[gen], trees[t].index,
3281 			ages[gen] == unicode_maxage ? "" : ",");
3282 		if (trees[t].maxage == ages[gen])
3283 			t += 2;
3284 	}
3285 	fprintf(file, "};\n");
3286 	fprintf(file, "\n");
3287 	fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n");
3288 	t = 1;
3289 	for (gen = 0; gen < ages_count; gen++) {
3290 		fprintf(file, "\t{ %#x, %d }%s\n",
3291 			ages[gen], trees[t].index,
3292 			ages[gen] == unicode_maxage ? "" : ",");
3293 		if (trees[t].maxage == ages[gen])
3294 			t += 2;
3295 	}
3296 	fprintf(file, "};\n");
3297 	fprintf(file, "\n");
3298 	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3299 		utf8data_size);
3300 	t = 0;
3301 	for (i = 0; i != utf8data_size; i += 16) {
3302 		if (i == trees[t].index) {
3303 			fprintf(file, "\t/* %s_%x */\n",
3304 				trees[t].type, trees[t].maxage);
3305 			if (t < trees_count-1)
3306 				t++;
3307 		}
3308 		fprintf(file, "\t");
3309 		for (j = i; j != i + 16; j++)
3310 			fprintf(file, "0x%.2x%s", utf8data[j],
3311 				(j < utf8data_size -1 ? "," : ""));
3312 		fprintf(file, "\n");
3313 	}
3314 	fprintf(file, "};\n");
3315 	fclose(file);
3316 }
3317 
3318 /* ------------------------------------------------------------------ */
3319 
main(int argc,char * argv[])3320 int main(int argc, char *argv[])
3321 {
3322 	unsigned int unichar;
3323 	int opt;
3324 
3325 	argv0 = argv[0];
3326 
3327 	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3328 		switch (opt) {
3329 		case 'a':
3330 			age_name = optarg;
3331 			break;
3332 		case 'c':
3333 			ccc_name = optarg;
3334 			break;
3335 		case 'd':
3336 			data_name = optarg;
3337 			break;
3338 		case 'f':
3339 			fold_name = optarg;
3340 			break;
3341 		case 'n':
3342 			norm_name = optarg;
3343 			break;
3344 		case 'o':
3345 			utf8_name = optarg;
3346 			break;
3347 		case 'p':
3348 			prop_name = optarg;
3349 			break;
3350 		case 't':
3351 			test_name = optarg;
3352 			break;
3353 		case 'v':
3354 			verbose++;
3355 			break;
3356 		case 'h':
3357 			help();
3358 			exit(0);
3359 		default:
3360 			usage();
3361 		}
3362 	}
3363 
3364 	if (verbose > 1)
3365 		help();
3366 	for (unichar = 0; unichar != 0x110000; unichar++)
3367 		unicode_data[unichar].code = unichar;
3368 	age_init();
3369 	ccc_init();
3370 	nfkdi_init();
3371 	nfkdicf_init();
3372 	ignore_init();
3373 	corrections_init();
3374 	hangul_decompose();
3375 	nfkdi_decompose();
3376 	nfkdicf_decompose();
3377 	utf8_init();
3378 	trees_init();
3379 	trees_populate();
3380 	trees_reduce();
3381 	trees_verify();
3382 	/* Prevent "unused function" warning. */
3383 	(void)lookup(nfkdi_tree, " ");
3384 	if (verbose > 2)
3385 		tree_walk(nfkdi_tree);
3386 	if (verbose > 2)
3387 		tree_walk(nfkdicf_tree);
3388 	normalization_test();
3389 	write_file();
3390 
3391 	return 0;
3392 }
3393