• Home
  • Raw
  • Download

Lines Matching +full:gen +full:- +full:mapping

16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
30 /* Default names of the in- and output files. */
63 /* ------------------------------------------------------------------ */
78 #define UNICODE_MAJ_MAX ((unsigned short)-1)
79 #define UNICODE_MIN_MAX ((unsigned char)-1)
80 #define UNICODE_REV_MAX ((unsigned char)-1)
104 /* ------------------------------------------------------------------ */
109 * A compact binary tree, used to decode UTF-8 characters.
114 * NEXTBYTE - flag - advance to next byte if set
115 * BITNUM - 3 bit field - the bit number to be tested
116 * OFFLEN - 2 bit field - number of bytes in the offset
117 * if offlen == 0 (non-branching node)
118 * RIGHTPATH - 1 bit field - set if the following node is for the
119 * right-hand path (tested bit is set)
120 * TRIENODE - 1 bit field - set if the following node is an internal
123 * LEFTNODE - 1 bit field - set if the left-hand node is internal
124 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
149 * defined. The CCC of a non-defined code point is 0.
152 * with a non-zero CCC that occur between two characters with
159 * start of a NUL-terminated string that is the decomposition
198 /* ------------------------------------------------------------------ */
203 * The UTF-8 encoding spreads the bits of a 32bit word over several
214 * There is an additional requirement on UTF-8, in that only the
226 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
230 * 0 - 0x7f: 0 0x7f
231 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
232 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
233 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
236 * 0xd800 - 0xdfff should never be seen.
239 * the same as a single UTF-32 character. This makes the UTF-8
240 * representation of Unicode strictly smaller than UTF-32.
243 * Corrigendum #1: UTF-8 Shortest Form
383 node = tree->root; in lookup()
385 if (node->nextbyte) in lookup()
387 if (*key & (1 << (node->bitnum & 7))) { in lookup()
389 if (node->rightnode == NODE) { in lookup()
390 node = node->right; in lookup()
391 } else if (node->rightnode == LEAF) { in lookup()
392 leaf = node->right; in lookup()
398 if (node->leftnode == NODE) { in lookup()
399 node = node->left; in lookup()
400 } else if (node->leftnode == LEAF) { in lookup()
401 leaf = node->left; in lookup()
412 * A simple non-recursive tree walker: keep track of visits to the
426 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); in tree_walk()
427 if (tree->childnode == LEAF) { in tree_walk()
428 assert(tree->root); in tree_walk()
429 tree->leaf_print(tree->root, indent); in tree_walk()
432 assert(tree->childnode == NODE); in tree_walk()
433 node = tree->root; in tree_walk()
439 node->bitnum, node->nextbyte, in tree_walk()
440 node->left, node->right, in tree_walk()
441 node->keymask, node->keybits); in tree_walk()
443 if (!(node->left && node->right)) in tree_walk()
447 bitmask = 1 << node->bitnum; in tree_walk()
450 if (node->leftnode == LEAF) { in tree_walk()
451 assert(node->left); in tree_walk()
452 tree->leaf_print(node->left, in tree_walk()
455 } else if (node->left) { in tree_walk()
456 assert(node->leftnode == NODE); in tree_walk()
458 node = node->left; in tree_walk()
464 if (node->rightnode == LEAF) { in tree_walk()
465 assert(node->right); in tree_walk()
466 tree->leaf_print(node->right, in tree_walk()
469 } else if (node->right) { in tree_walk()
470 assert(node->rightnode == NODE); in tree_walk()
472 node = node->right; in tree_walk()
478 node = node->parent; in tree_walk()
479 indent -= 1; in tree_walk()
496 node->left = node->right = NULL; in alloc_node()
497 node->parent = parent; in alloc_node()
498 node->leftnode = NODE; in alloc_node()
499 node->rightnode = NODE; in alloc_node()
500 node->keybits = 0; in alloc_node()
501 node->keymask = 0; in alloc_node()
502 node->mark = 0; in alloc_node()
503 node->index = 0; in alloc_node()
504 node->offset = -1; in alloc_node()
505 node->size = 4; in alloc_node()
507 if (node->parent) { in alloc_node()
508 bitnum = parent->bitnum; in alloc_node()
510 node->bitnum = bitnum + 7 + 8; in alloc_node()
511 node->nextbyte = 1; in alloc_node()
513 node->bitnum = bitnum - 1; in alloc_node()
514 node->nextbyte = 0; in alloc_node()
517 node->bitnum = 7; in alloc_node()
518 node->nextbyte = 0; in alloc_node()
541 cursor = &tree->root; in insert()
549 if (node->nextbyte) in insert()
551 if (*key & (1 << (node->bitnum & 7))) in insert()
552 cursor = &node->right; in insert()
554 cursor = &node->left; in insert()
555 keybits--; in insert()
561 if (*key & (1 << (node->bitnum & 7))) in insert()
562 node->rightnode = LEAF; in insert()
564 node->leftnode = LEAF; in insert()
565 if (node->nextbyte) in insert()
567 if (node->leftnode == NODE || node->rightnode == NODE) in insert()
569 assert(node->left); in insert()
570 assert(node->right); in insert()
572 if (! tree->leaf_equal(node->left, node->right)) in insert()
575 leaf = node->left; in insert()
577 parent = node->parent; in insert()
580 tree->root = leaf; in insert()
581 tree->childnode = LEAF; in insert()
582 } else if (parent->left == node) { in insert()
583 parent->left = leaf; in insert()
584 parent->leftnode = LEAF; in insert()
585 if (parent->right) { in insert()
586 parent->keymask = 0; in insert()
587 parent->keybits = 0; in insert()
589 parent->keymask |= (1 << node->bitnum); in insert()
591 } else if (parent->right == node) { in insert()
592 parent->right = leaf; in insert()
593 parent->rightnode = LEAF; in insert()
594 if (parent->left) { in insert()
595 parent->keymask = 0; in insert()
596 parent->keybits = 0; in insert()
598 parent->keymask |= (1 << node->bitnum); in insert()
599 parent->keybits |= (1 << node->bitnum); in insert()
611 parent = node->parent; in insert()
615 if (node->keymask == 0) { in insert()
616 parent->keymask = 0; in insert()
617 parent->keybits = 0; in insert()
618 } else if (parent->left && parent->right) { in insert()
619 parent->keymask = 0; in insert()
620 parent->keybits = 0; in insert()
622 assert((parent->keymask & node->keymask) == 0); in insert()
623 parent->keymask |= node->keymask; in insert()
624 parent->keymask |= (1 << parent->bitnum); in insert()
625 parent->keybits |= node->keybits; in insert()
626 if (parent->right) in insert()
627 parent->keybits |= (1 << parent->bitnum); in insert()
666 printf("Pruning %s_%x\n", tree->type, tree->maxage); in prune()
669 if (tree->childnode == LEAF) in prune()
671 if (!tree->root) in prune()
675 node = tree->root; in prune()
677 if (node->nextbyte) in prune()
679 if (node->leftnode == LEAF) in prune()
681 if (node->rightnode == LEAF) in prune()
683 if (!node->left) in prune()
685 if (!node->right) in prune()
687 left = node->left; in prune()
688 right = node->right; in prune()
689 if (left->keymask == 0) in prune()
691 if (right->keymask == 0) in prune()
693 if (left->keymask != right->keymask) in prune()
695 if (left->keybits != right->keybits) in prune()
699 assert(left->left || left->right); in prune()
700 if (left->leftnode == LEAF) in prune()
701 leftleaf = left->left; in prune()
702 else if (left->rightnode == LEAF) in prune()
703 leftleaf = left->right; in prune()
704 else if (left->left) in prune()
705 left = left->left; in prune()
706 else if (left->right) in prune()
707 left = left->right; in prune()
713 assert(right->left || right->right); in prune()
714 if (right->leftnode == LEAF) in prune()
715 rightleaf = right->left; in prune()
716 else if (right->rightnode == LEAF) in prune()
717 rightleaf = right->right; in prune()
718 else if (right->left) in prune()
719 right = right->left; in prune()
720 else if (right->right) in prune()
721 right = right->right; in prune()
725 if (! tree->leaf_equal(leftleaf, rightleaf)) in prune()
728 * This node has identical singleton-only subtrees. in prune()
731 parent = node->parent; in prune()
732 left = node->left; in prune()
733 right = node->right; in prune()
734 if (parent->left == node) in prune()
735 parent->left = left; in prune()
736 else if (parent->right == node) in prune()
737 parent->right = left; in prune()
740 left->parent = parent; in prune()
741 left->keymask |= (1 << node->bitnum); in prune()
742 node->left = NULL; in prune()
744 bitmask = 1 << node->bitnum; in prune()
747 if (node->leftnode == NODE && node->left) { in prune()
748 left = node->left; in prune()
752 } else if (node->rightnode == NODE && node->right) { in prune()
753 right = node->right; in prune()
763 /* Force re-check */ in prune()
764 bitmask = 1 << node->bitnum; in prune()
768 if (node->left && node->right) in prune()
770 if (node->left) { in prune()
771 left = node->left; in prune()
772 node->keymask |= left->keymask; in prune()
773 node->keybits |= left->keybits; in prune()
775 if (node->right) { in prune()
776 right = node->right; in prune()
777 node->keymask |= right->keymask; in prune()
778 node->keybits |= right->keybits; in prune()
780 node->keymask |= (1 << node->bitnum); in prune()
781 node = node->parent; in prune()
782 /* Force re-check */ in prune()
783 bitmask = 1 << node->bitnum; in prune()
788 bitmask = 1 << node->bitnum; in prune()
790 node->leftnode == NODE && in prune()
791 node->left) { in prune()
793 node = node->left; in prune()
795 node->rightnode == NODE && in prune()
796 node->right) { in prune()
798 node = node->right; in prune()
802 node = node->parent; in prune()
824 printf("Marking %s_%x\n", tree->type, tree->maxage); in mark_nodes()
825 if (tree->childnode == LEAF) in mark_nodes()
828 assert(tree->childnode == NODE); in mark_nodes()
829 node = tree->root; in mark_nodes()
832 bitmask = 1 << node->bitnum; in mark_nodes()
835 if (node->leftnode == LEAF) { in mark_nodes()
836 assert(node->left); in mark_nodes()
837 if (tree->leaf_mark(node->left)) { in mark_nodes()
839 while (n && !n->mark) { in mark_nodes()
841 n->mark = 1; in mark_nodes()
842 n = n->parent; in mark_nodes()
845 } else if (node->left) { in mark_nodes()
846 assert(node->leftnode == NODE); in mark_nodes()
847 node = node->left; in mark_nodes()
853 if (node->rightnode == LEAF) { in mark_nodes()
854 assert(node->right); in mark_nodes()
855 if (tree->leaf_mark(node->right)) { in mark_nodes()
857 while (n && !n->mark) { in mark_nodes()
859 n->mark = 1; in mark_nodes()
860 n = n->parent; in mark_nodes()
863 } else if (node->right) { in mark_nodes()
864 assert(node->rightnode == NODE); in mark_nodes()
865 node = node->right; in mark_nodes()
871 node = node->parent; in mark_nodes()
876 assert(tree->childnode == NODE); in mark_nodes()
877 node = tree->root; in mark_nodes()
880 bitmask = 1 << node->bitnum; in mark_nodes()
883 if (node->leftnode == LEAF) { in mark_nodes()
884 assert(node->left); in mark_nodes()
885 if (tree->leaf_mark(node->left)) { in mark_nodes()
887 while (n && !n->mark) { in mark_nodes()
889 n->mark = 1; in mark_nodes()
890 n = n->parent; in mark_nodes()
893 } else if (node->left) { in mark_nodes()
894 assert(node->leftnode == NODE); in mark_nodes()
895 node = node->left; in mark_nodes()
896 if (!node->mark && node->parent->mark) { in mark_nodes()
898 node->mark = 1; in mark_nodes()
905 if (node->rightnode == LEAF) { in mark_nodes()
906 assert(node->right); in mark_nodes()
907 if (tree->leaf_mark(node->right)) { in mark_nodes()
909 while (n && !n->mark) { in mark_nodes()
911 n->mark = 1; in mark_nodes()
912 n = n->parent; in mark_nodes()
915 } else if (node->right) { in mark_nodes()
916 assert(node->rightnode == NODE); in mark_nodes()
917 node = node->right; in mark_nodes()
918 if (!node->mark && node->parent->mark && in mark_nodes()
919 !node->parent->left) { in mark_nodes()
921 node->mark = 1; in mark_nodes()
928 node = node->parent; in mark_nodes()
937 * emitted trie. These values must be pre-computed because relative
952 tree->index = index; in index_nodes()
957 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); in index_nodes()
958 if (tree->childnode == LEAF) { in index_nodes()
959 index += tree->leaf_size(tree->root); in index_nodes()
963 assert(tree->childnode == NODE); in index_nodes()
964 node = tree->root; in index_nodes()
967 if (!node->mark) in index_nodes()
970 if (node->index != index) in index_nodes()
971 node->index = index; in index_nodes()
972 index += node->size; in index_nodes()
975 bitmask = 1 << node->bitnum; in index_nodes()
976 if (node->mark && (leftmask & bitmask) == 0) { in index_nodes()
978 if (node->leftnode == LEAF) { in index_nodes()
979 assert(node->left); in index_nodes()
980 *tree->leaf_index(tree, node->left) = in index_nodes()
982 index += tree->leaf_size(node->left); in index_nodes()
984 } else if (node->left) { in index_nodes()
985 assert(node->leftnode == NODE); in index_nodes()
987 node = node->left; in index_nodes()
991 if (node->mark && (rightmask & bitmask) == 0) { in index_nodes()
993 if (node->rightnode == LEAF) { in index_nodes()
994 assert(node->right); in index_nodes()
995 *tree->leaf_index(tree, node->right) = index; in index_nodes()
996 index += tree->leaf_size(node->right); in index_nodes()
998 } else if (node->right) { in index_nodes()
999 assert(node->rightnode == NODE); in index_nodes()
1001 node = node->right; in index_nodes()
1007 node = node->parent; in index_nodes()
1008 indent -= 1; in index_nodes()
1027 if (!node || node->mark) in mark_subtree()
1029 node->mark = 1; in mark_subtree()
1030 node->index = node->parent->index; in mark_subtree()
1032 if (node->leftnode == NODE) in mark_subtree()
1033 changed += mark_subtree(node->left); in mark_subtree()
1034 if (node->rightnode == NODE) in mark_subtree()
1035 changed += mark_subtree(node->right); in mark_subtree()
1041 * each node needs to store a three-byte offset. The indexes of the
1068 printf("Sizing %s_%x\n", tree->type, tree->maxage); in size_nodes()
1069 if (tree->childnode == LEAF) in size_nodes()
1072 assert(tree->childnode == NODE); in size_nodes()
1075 node = tree->root; in size_nodes()
1078 if (!node->mark) in size_nodes()
1081 if (!node->left || !node->right) { in size_nodes()
1084 if (node->rightnode == NODE) { in size_nodes()
1091 right = node->right; in size_nodes()
1092 next = tree->next; in size_nodes()
1093 while (!right->mark) { in size_nodes()
1095 n = next->root; in size_nodes()
1096 while (n->bitnum != node->bitnum) { in size_nodes()
1097 nbit = 1 << n->bitnum; in size_nodes()
1101 if (n->rightnode == LEAF) in size_nodes()
1103 n = n->right; in size_nodes()
1105 if (n->leftnode == LEAF) in size_nodes()
1107 n = n->left; in size_nodes()
1110 if (n->bitnum != node->bitnum) in size_nodes()
1112 n = n->right; in size_nodes()
1114 next = next->next; in size_nodes()
1117 if (!right->mark) in size_nodes()
1119 offset = right->index - node->index; in size_nodes()
1121 offset = *tree->leaf_index(tree, node->right); in size_nodes()
1122 offset -= node->index; in size_nodes()
1134 if (node->size != size || node->offset != offset) { in size_nodes()
1135 node->size = size; in size_nodes()
1136 node->offset = offset; in size_nodes()
1141 bitmask = 1 << node->bitnum; in size_nodes()
1143 if (node->mark && (leftmask & bitmask) == 0) { in size_nodes()
1145 if (node->leftnode == LEAF) { in size_nodes()
1146 assert(node->left); in size_nodes()
1147 } else if (node->left) { in size_nodes()
1148 assert(node->leftnode == NODE); in size_nodes()
1150 node = node->left; in size_nodes()
1154 if (node->mark && (rightmask & bitmask) == 0) { in size_nodes()
1157 if (node->rightnode == LEAF) { in size_nodes()
1158 assert(node->right); in size_nodes()
1159 } else if (node->right) { in size_nodes()
1160 assert(node->rightnode == NODE); in size_nodes()
1162 node = node->right; in size_nodes()
1170 node = node->parent; in size_nodes()
1171 indent -= 1; in size_nodes()
1202 index = tree->index; in emit()
1206 printf("Emitting %s_%x\n", tree->type, tree->maxage); in emit()
1207 if (tree->childnode == LEAF) { in emit()
1208 assert(tree->root); in emit()
1209 tree->leaf_emit(tree->root, data); in emit()
1210 size = tree->leaf_size(tree->root); in emit()
1216 assert(tree->childnode == NODE); in emit()
1217 node = tree->root; in emit()
1220 if (!node->mark) in emit()
1222 assert(node->offset != -1); in emit()
1223 assert(node->index == index); in emit()
1226 if (node->nextbyte) in emit()
1228 byte |= (node->bitnum & BITNUM); in emit()
1229 if (node->left && node->right) { in emit()
1230 if (node->leftnode == NODE) in emit()
1232 if (node->rightnode == NODE) in emit()
1234 if (node->offset <= 0xff) in emit()
1236 else if (node->offset <= 0xffff) in emit()
1241 offset = node->offset; in emit()
1245 while (offlen--) { in emit()
1250 } else if (node->left) { in emit()
1251 if (node->leftnode == NODE) in emit()
1256 } else if (node->right) { in emit()
1258 if (node->rightnode == NODE) in emit()
1268 bitmask = 1 << node->bitnum; in emit()
1269 if (node->mark && (leftmask & bitmask) == 0) { in emit()
1271 if (node->leftnode == LEAF) { in emit()
1272 assert(node->left); in emit()
1273 data = tree->leaf_emit(node->left, in emit()
1275 size = tree->leaf_size(node->left); in emit()
1279 } else if (node->left) { in emit()
1280 assert(node->leftnode == NODE); in emit()
1282 node = node->left; in emit()
1286 if (node->mark && (rightmask & bitmask) == 0) { in emit()
1288 if (node->rightnode == LEAF) { in emit()
1289 assert(node->right); in emit()
1290 data = tree->leaf_emit(node->right, in emit()
1292 size = tree->leaf_size(node->right); in emit()
1296 } else if (node->right) { in emit()
1297 assert(node->rightnode == NODE); in emit()
1299 node = node->right; in emit()
1305 node = node->parent; in emit()
1306 indent -= 1; in emit()
1316 printf(" %d total\n", index - tree->index); in emit()
1320 /* ------------------------------------------------------------------ */
1335 * the Unicode version in which the mapping was corrected.
1340 int gen; member
1367 if (u->code == corrections[i].code) in corrections_lookup()
1377 if (left->gen != right->gen) in nfkdi_equal()
1379 if (left->ccc != right->ccc) in nfkdi_equal()
1381 if (left->utf8nfkdi && right->utf8nfkdi && in nfkdi_equal()
1382 strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0) in nfkdi_equal()
1384 if (left->utf8nfkdi || right->utf8nfkdi) in nfkdi_equal()
1394 if (left->gen != right->gen) in nfkdicf_equal()
1396 if (left->ccc != right->ccc) in nfkdicf_equal()
1398 if (left->utf8nfkdicf && right->utf8nfkdicf && in nfkdicf_equal()
1399 strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0) in nfkdicf_equal()
1401 if (left->utf8nfkdicf && right->utf8nfkdicf) in nfkdicf_equal()
1403 if (left->utf8nfkdicf || right->utf8nfkdicf) in nfkdicf_equal()
1405 if (left->utf8nfkdi && right->utf8nfkdi && in nfkdicf_equal()
1406 strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0) in nfkdicf_equal()
1408 if (left->utf8nfkdi || right->utf8nfkdi) in nfkdicf_equal()
1417 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, in nfkdi_print()
1418 leaf->code, leaf->ccc, leaf->gen); in nfkdi_print()
1419 if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL) in nfkdi_print()
1421 else if (leaf->utf8nfkdi) in nfkdi_print()
1422 printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi); in nfkdi_print()
1430 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, in nfkdicf_print()
1431 leaf->code, leaf->ccc, leaf->gen); in nfkdicf_print()
1432 if (leaf->utf8nfkdicf) in nfkdicf_print()
1433 printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf); in nfkdicf_print()
1434 else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL) in nfkdicf_print()
1436 else if (leaf->utf8nfkdi) in nfkdicf_print()
1437 printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi); in nfkdicf_print()
1450 if (leaf->utf8nfkdicf) in nfkdicf_mark()
1459 return leaf->correction; in correction_mark()
1467 if (HANGUL_SYLLABLE(leaf->code)) in nfkdi_size()
1469 else if (leaf->utf8nfkdi) in nfkdi_size()
1470 size += strlen(leaf->utf8nfkdi) + 1; in nfkdi_size()
1479 if (HANGUL_SYLLABLE(leaf->code)) in nfkdicf_size()
1481 else if (leaf->utf8nfkdicf) in nfkdicf_size()
1482 size += strlen(leaf->utf8nfkdicf) + 1; in nfkdicf_size()
1483 else if (leaf->utf8nfkdi) in nfkdicf_size()
1484 size += strlen(leaf->utf8nfkdi) + 1; in nfkdicf_size()
1492 return &tree->leafindex[leaf->code]; in nfkdi_index()
1499 return &tree->leafindex[leaf->code]; in nfkdicf_index()
1507 *data++ = leaf->gen; in nfkdi_emit()
1508 if (HANGUL_SYLLABLE(leaf->code)) { in nfkdi_emit()
1511 } else if (leaf->utf8nfkdi) { in nfkdi_emit()
1513 s = (unsigned char*)leaf->utf8nfkdi; in nfkdi_emit()
1517 *data++ = leaf->ccc; in nfkdi_emit()
1527 *data++ = leaf->gen; in nfkdicf_emit()
1528 if (HANGUL_SYLLABLE(leaf->code)) { in nfkdicf_emit()
1531 } else if (leaf->utf8nfkdicf) { in nfkdicf_emit()
1533 s = (unsigned char*)leaf->utf8nfkdicf; in nfkdicf_emit()
1536 } else if (leaf->utf8nfkdi) { in nfkdicf_emit()
1538 s = (unsigned char*)leaf->utf8nfkdi; in nfkdicf_emit()
1542 *data++ = leaf->ccc; in nfkdicf_emit()
1554 if (data->utf8nfkdi) { in utf8_create()
1555 assert(data->utf8nfkdi[0] == HANGUL); in utf8_create()
1560 um = data->utf32nfkdi; in utf8_create()
1565 data->utf8nfkdi = strdup(utf); in utf8_create()
1568 um = data->utf32nfkdicf; in utf8_create()
1573 if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf)) in utf8_create()
1574 data->utf8nfkdicf = strdup(utf); in utf8_create()
1601 nextage = (unsigned int)-1; in trees_init()
1607 if (nextage < data->correction && in trees_init()
1608 data->correction < maxage) in trees_init()
1609 nextage = data->correction; in trees_init()
1620 nextage = (unsigned int)-1; in trees_init()
1623 trees[--count].maxage = maxage; in trees_init()
1624 trees[--count].maxage = maxage; in trees_init()
1628 if (nextage < data->correction && in trees_init()
1629 data->correction < maxage) in trees_init()
1630 nextage = data->correction; in trees_init()
1639 trees[i].maxage = ages[j-1]; in trees_init()
1643 trees[trees_count-2].next = &trees[trees_count-1]; in trees_init()
1644 trees[trees_count-1].leaf_mark = nfkdi_mark; in trees_init()
1645 trees[trees_count-2].leaf_mark = nfkdicf_mark; in trees_init()
1646 for (i = 0; i != trees_count-2; i += 2) { in trees_init()
1647 trees[i].next = &trees[trees_count-2]; in trees_init()
1649 trees[i+1].next = &trees[trees_count-1]; in trees_init()
1689 if (unicode_data[unichar].gen < 0) in trees_populate()
1693 if (data->correction <= trees[i].maxage) in trees_populate()
1731 nfkdi = utf8data + trees[trees_count-1].index; in trees_reduce()
1732 nfkdicf = utf8data + trees[trees_count-2].index; in trees_reduce()
1734 nfkdi_tree = &trees[trees_count-1]; in trees_reduce()
1735 nfkdicf_tree = &trees[trees_count-2]; in trees_reduce()
1749 printf("Verifying %s_%x\n", tree->type, tree->maxage); in verify()
1750 nocf = strcmp(tree->type, "nfkdicf"); in verify()
1755 if (data->correction <= tree->maxage) in verify()
1761 if (data->gen != -1) in verify()
1768 if (data->gen == -1) in verify()
1770 if (data->gen != LEAF_GEN(leaf)) in verify()
1773 if (HANGUL_SYLLABLE(data->code)) { in verify()
1774 if (data->utf8nfkdi[0] != HANGUL) in verify()
1777 if (!data->utf8nfkdi) { in verify()
1779 } else if (strcmp(data->utf8nfkdi, in verify()
1784 if (!data->utf8nfkdicf && in verify()
1785 !data->utf8nfkdi) { in verify()
1787 } else if (data->utf8nfkdicf) { in verify()
1788 if (strcmp(data->utf8nfkdicf, in verify()
1791 } else if (strcmp(data->utf8nfkdi, in verify()
1796 } else if (data->ccc != LEAF_CCC(leaf)) { in verify()
1801 printf("%X code %X gen %d ccc %d" in verify()
1802 " nfkdi -> \"%s\"", in verify()
1803 unichar, data->code, data->gen, in verify()
1804 data->ccc, in verify()
1805 data->utf8nfkdi); in verify()
1807 printf(" gen %d ccc %d" in verify()
1808 " nfkdi -> \"%s\"", in verify()
1827 /* ------------------------------------------------------------------ */
1834 printf("normalization of UTF-8 strings. The trie is derived from\n"); in help()
1841 printf("\t- Apply unicode normalization form NFKD.\n"); in help()
1842 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); in help()
1845 printf("\t- Apply unicode normalization form NFKD.\n"); in help()
1846 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); in help()
1847 printf("\t- Apply a full casefold (C + F).\n"); in help()
1859 printf("\t-a %s\n", AGE_NAME); in help()
1860 printf("\t-c %s\n", CCC_NAME); in help()
1861 printf("\t-p %s\n", PROP_NAME); in help()
1862 printf("\t-d %s\n", DATA_NAME); in help()
1863 printf("\t-f %s\n", FOLD_NAME); in help()
1864 printf("\t-n %s\n", NORM_NAME); in help()
1867 printf("\t-t %s\n", TEST_NAME); in help()
1870 printf("\t-o %s\n", UTF8_NAME); in help()
1898 /* ------------------------------------------------------------------ */
1910 printf(" %X ->", unichar); in print_utf32nfkdi()
1917 printf(" %X ->", unichar); in print_utf32nfkdicf()
1922 /* ------------------------------------------------------------------ */
1933 int gen; in age_init() local
1945 gen = 0; in age_init()
1979 ages[ages_count] = (unsigned int)-1; in age_init()
1983 gen = 0; in age_init()
1988 ages[++gen] = in age_init()
1991 printf(" Age V%d_%d_%d = gen %d\n", in age_init()
1992 major, minor, revision, gen); in age_init()
1999 ages[++gen] = UNICODE_AGE(major, minor, 0); in age_init()
2002 major, minor, gen); in age_init()
2011 unicode_data[unichar].gen = gen; in age_init()
2012 count += 1 + last - first; in age_init()
2014 printf(" %X..%X gen %d\n", first, last, gen); in age_init()
2021 unicode_data[unichar].gen = gen; in age_init()
2024 printf(" %X gen %d\n", unichar, gen); in age_init()
2030 unicode_maxage = ages[gen]; in age_init()
2037 unicode_data[unichar].gen = -1; in age_init()
2099 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfkdi_init() local
2126 /* decode the decomposition into UTF-32 */ in nfkdi_init()
2129 mapping[i] = strtoul(s, &s, 16); in nfkdi_init()
2130 if (!utf32valid(mapping[i])) in nfkdi_init()
2134 mapping[i++] = 0; in nfkdi_init()
2137 memcpy(um, mapping, i * sizeof(unsigned int)); in nfkdi_init()
2155 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfkdicf_init() local
2185 mapping[i] = strtoul(s, &s, 16); in nfkdicf_init()
2186 if (!utf32valid(mapping[i])) in nfkdicf_init()
2190 mapping[i++] = 0; in nfkdicf_init()
2193 memcpy(um, mapping, i * sizeof(unsigned int)); in nfkdicf_init()
2285 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in corrections_init() local
2329 mapping[i] = strtoul(s, &s, 16); in corrections_init()
2330 if (!utf32valid(mapping[i])) in corrections_init()
2334 mapping[i++] = 0; in corrections_init()
2337 memcpy(um, mapping, i * sizeof(unsigned int)); in corrections_init()
2341 printf(" %X -> %s -> %s V%d_%d_%d\n", in corrections_init()
2353 /* ------------------------------------------------------------------ */
2372 * SIndex = s - SBase
2413 unsigned int mapping[4]; in hangul_decompose() local
2423 unsigned int si = unichar - sb; in hangul_decompose()
2429 mapping[i++] = lb + li; in hangul_decompose()
2430 mapping[i++] = vb + vi; in hangul_decompose()
2432 mapping[i++] = tb + ti; in hangul_decompose()
2433 mapping[i++] = 0; in hangul_decompose()
2437 memcpy(um, mapping, i * sizeof(unsigned int)); in hangul_decompose()
2442 memcpy(um, mapping, i * sizeof(unsigned int)); in hangul_decompose()
2466 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfkdi_decompose() local
2489 mapping[i++] = dc[j]; in nfkdi_decompose()
2492 mapping[i++] = *um; in nfkdi_decompose()
2496 mapping[i++] = 0; in nfkdi_decompose()
2501 memcpy(um, mapping, i * sizeof(unsigned int)); in nfkdi_decompose()
2507 memcpy(um, mapping, i * sizeof(unsigned int)); in nfkdi_decompose()
2521 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfkdicf_decompose() local
2543 mapping[i++] = dc[j]; in nfkdicf_decompose()
2546 mapping[i++] = *um; in nfkdicf_decompose()
2550 mapping[i++] = 0; in nfkdicf_decompose()
2555 memcpy(um, mapping, i * sizeof(unsigned int)); in nfkdicf_decompose()
2566 /* ------------------------------------------------------------------ */
2596 * SIndex = s - SBase
2645 si = utf8decode(str) - SB; in utf8hangul()
2656 /* Add LPart, a 3-byte UTF-8 sequence. */ in utf8hangul()
2659 /* Add VPart, a 3-byte UTF-8 sequence. */ in utf8hangul()
2662 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ in utf8hangul()
2676 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2677 * is well-formed and corresponds to a known unicode code point. The
2678 * shorthand for this will be "is valid UTF-8 unicode".
2683 utf8trie_t *trie = utf8data + tree->index; in utf8nlookup()
2697 if (--len == 0) in utf8nlookup()
2708 while (--offlen) { in utf8nlookup()
2739 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is in utf8nlookup()
2741 * start of the sequence is at s-2. in utf8nlookup()
2744 trie = utf8hangul(s - 2, hangul); in utf8nlookup()
2757 return utf8nlookup(tree, hangul, s, (size_t)-1); in utf8lookup()
2761 * Return the number of bytes used by the current UTF-8 sequence.
2762 * Assumes the input points to the first byte of a valid UTF-8
2773 * Return -1 if s is not valid UTF-8 unicode.
2774 * Return 0 if only non-assigned code points are used.
2784 return -1; in utf8agemax()
2789 return -1; in utf8agemax()
2791 if (leaf_age <= tree->maxage && leaf_age > age) in utf8agemax()
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if non-assigned code points are used.
2811 return -1; in utf8agemin()
2812 age = tree->maxage; in utf8agemin()
2816 return -1; in utf8agemin()
2818 if (leaf_age <= tree->maxage && leaf_age < age) in utf8agemin()
2827 * Return -1 if s is not valid UTF-8 unicode.
2837 return -1; in utf8nagemax()
2842 return -1; in utf8nagemax()
2844 if (leaf_age <= tree->maxage && leaf_age > age) in utf8nagemax()
2846 len -= utf8clen(s); in utf8nagemax()
2854 * Return -1 if s is not valid UTF-8 unicode.
2864 return -1; in utf8nagemin()
2865 age = tree->maxage; in utf8nagemin()
2869 return -1; in utf8nagemin()
2871 if (leaf_age <= tree->maxage && leaf_age < age) in utf8nagemin()
2873 len -= utf8clen(s); in utf8nagemin()
2881 * Return -1 if s is not valid UTF-8 unicode.
2892 return -1; in utf8len()
2896 return -1; in utf8len()
2897 if (ages[LEAF_GEN(leaf)] > tree->maxage) in utf8len()
2910 * Return -1 if s is not valid UTF-8 unicode.
2919 return -1; in utf8nlen()
2923 return -1; in utf8nlen()
2924 if (ages[LEAF_GEN(leaf)] > tree->maxage) in utf8nlen()
2930 len -= utf8clen(s); in utf8nlen()
2961 * Returns -1 on error, 0 on success.
2967 return -1; in utf8ncursor()
2969 return -1; in utf8ncursor()
2970 u8c->tree = tree; in utf8ncursor()
2971 u8c->s = s; in utf8ncursor()
2972 u8c->p = NULL; in utf8ncursor()
2973 u8c->ss = NULL; in utf8ncursor()
2974 u8c->sp = NULL; in utf8ncursor()
2975 u8c->len = len; in utf8ncursor()
2976 u8c->slen = 0; in utf8ncursor()
2977 u8c->ccc = STOPPER; in utf8ncursor()
2978 u8c->nccc = STOPPER; in utf8ncursor()
2979 u8c->unichar = 0; in utf8ncursor()
2981 if (u8c->len != len) in utf8ncursor()
2982 return -1; in utf8ncursor()
2985 return -1; in utf8ncursor()
2992 * s : NUL-terminated string.
2996 * Returns -1 on error, 0 on success.
3000 return utf8ncursor(u8c, tree, s, (unsigned int)-1); in utf8cursor()
3006 * Returns the byte cast to an unsigned char on success, and -1 on failure.
3008 * The cursor keeps track of the location in the string in u8c->s.
3010 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3011 * that bytes from a decomposition do not count against u8c->len.
3013 * Characters are emitted if they match the current CCC in u8c->ccc.
3014 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3018 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3020 * emitted and stores it in u8c->nccc, the second pass emits the
3026 * u8c->p != NULL -> a decomposition is being scanned.
3027 * u8c->ss != NULL -> this is a repeating scan.
3028 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3037 if (u8c->p && *u8c->s == '\0') { in utf8byte()
3038 u8c->s = u8c->p; in utf8byte()
3039 u8c->p = NULL; in utf8byte()
3042 /* Check for end-of-string. */ in utf8byte()
3043 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { in utf8byte()
3045 if (u8c->ccc == STOPPER) in utf8byte()
3047 /* End-of-string during a scan counts as a stopper. */ in utf8byte()
3050 } else if ((*u8c->s & 0xC0) == 0x80) { in utf8byte()
3052 if (!u8c->p) in utf8byte()
3053 u8c->len--; in utf8byte()
3054 return (unsigned char)*u8c->s++; in utf8byte()
3058 if (u8c->p) { in utf8byte()
3059 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); in utf8byte()
3061 leaf = utf8nlookup(u8c->tree, u8c->hangul, in utf8byte()
3062 u8c->s, u8c->len); in utf8byte()
3067 return -1; in utf8byte()
3070 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { in utf8byte()
3073 u8c->len -= utf8clen(u8c->s); in utf8byte()
3074 u8c->p = u8c->s + utf8clen(u8c->s); in utf8byte()
3075 u8c->s = LEAF_STR(leaf); in utf8byte()
3077 if (*u8c->s == '\0') { in utf8byte()
3078 if (u8c->ccc == STOPPER) in utf8byte()
3083 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); in utf8byte()
3086 u8c->unichar = utf8decode(u8c->s); in utf8byte()
3092 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) in utf8byte()
3093 u8c->nccc = ccc; in utf8byte()
3099 if (ccc == u8c->ccc) { in utf8byte()
3100 if (!u8c->p) in utf8byte()
3101 u8c->len--; in utf8byte()
3102 return (unsigned char)*u8c->s++; in utf8byte()
3107 if (u8c->nccc == STOPPER) { in utf8byte()
3113 assert(u8c->ccc == STOPPER); in utf8byte()
3114 u8c->ccc = MINCCC - 1; in utf8byte()
3115 u8c->nccc = ccc; in utf8byte()
3116 u8c->sp = u8c->p; in utf8byte()
3117 u8c->ss = u8c->s; in utf8byte()
3118 u8c->slen = u8c->len; in utf8byte()
3119 if (!u8c->p) in utf8byte()
3120 u8c->len -= utf8clen(u8c->s); in utf8byte()
3121 u8c->s += utf8clen(u8c->s); in utf8byte()
3124 if (!u8c->p) in utf8byte()
3125 u8c->len -= utf8clen(u8c->s); in utf8byte()
3126 u8c->s += utf8clen(u8c->s); in utf8byte()
3127 } else if (u8c->nccc != MAXCCC + 1) { in utf8byte()
3129 u8c->ccc = u8c->nccc; in utf8byte()
3130 u8c->nccc = MAXCCC + 1; in utf8byte()
3131 u8c->s = u8c->ss; in utf8byte()
3132 u8c->p = u8c->sp; in utf8byte()
3133 u8c->len = u8c->slen; in utf8byte()
3136 u8c->ccc = STOPPER; in utf8byte()
3137 u8c->nccc = STOPPER; in utf8byte()
3138 u8c->sp = NULL; in utf8byte()
3139 u8c->ss = NULL; in utf8byte()
3140 u8c->slen = 0; in utf8byte()
3145 /* ------------------------------------------------------------------ */
3154 /* First test: null-terminated string. */ in normalize_line()
3158 return -1; in normalize_line()
3161 return -1; in normalize_line()
3163 return -1; in normalize_line()
3165 return -1; in normalize_line()
3167 /* Second test: length-limited string. */ in normalize_line()
3170 s[strlen(s) + 1] = -1; in normalize_line()
3173 return -1; in normalize_line()
3176 return -1; in normalize_line()
3178 return -1; in normalize_line()
3180 return -1; in normalize_line()
3223 if (data->utf8nfkdi && !*data->utf8nfkdi) in normalization_test()
3232 printf("Line %s -> %s", buf0, buf1); in normalization_test()
3246 /* ------------------------------------------------------------------ */
3254 int gen; in write_file() local
3264 fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); in write_file()
3278 for (gen = 0; gen < ages_count; gen++) { in write_file()
3280 ages[gen], trees[t].index, in write_file()
3281 ages[gen] == unicode_maxage ? "" : ","); in write_file()
3282 if (trees[t].maxage == ages[gen]) in write_file()
3289 for (gen = 0; gen < ages_count; gen++) { in write_file()
3291 ages[gen], trees[t].index, in write_file()
3292 ages[gen] == unicode_maxage ? "" : ","); in write_file()
3293 if (trees[t].maxage == ages[gen]) in write_file()
3305 if (t < trees_count-1) in write_file()
3311 (j < utf8data_size -1 ? "," : "")); in write_file()
3318 /* ------------------------------------------------------------------ */
3327 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { in main()