• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  *******************************************************************************
3  * Copyright (C) 2006-2008, International Business Machines Corporation        *
4  * and others. All Rights Reserved.                                            *
5  *******************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_BREAK_ITERATION
11 
12 #include "triedict.h"
13 #include "unicode/chariter.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/strenum.h"
16 #include "unicode/uenum.h"
17 #include "unicode/udata.h"
18 #include "cmemory.h"
19 #include "udataswp.h"
20 #include "uvector.h"
21 #include "uvectr32.h"
22 #include "uarrsort.h"
23 
24 //#define DEBUG_TRIE_DICT 1
25 
26 #ifdef DEBUG_TRIE_DICT
27 #include <sys/times.h>
28 #include <limits.h>
29 #include <stdio.h>
30 #endif
31 
32 U_NAMESPACE_BEGIN
33 
34 /*******************************************************************
35  * TrieWordDictionary
36  */
37 
TrieWordDictionary()38 TrieWordDictionary::TrieWordDictionary() {
39 }
40 
~TrieWordDictionary()41 TrieWordDictionary::~TrieWordDictionary() {
42 }
43 
44 /*******************************************************************
45  * MutableTrieDictionary
46  */
47 
48 // Node structure for the ternary, uncompressed trie
49 struct TernaryNode : public UMemory {
50     UChar       ch;         // UTF-16 code unit
51     uint16_t    flags;      // Flag word
52     TernaryNode *low;       // Less-than link
53     TernaryNode *equal;     // Equal link
54     TernaryNode *high;      // Greater-than link
55 
56     TernaryNode(UChar uc);
57     ~TernaryNode();
58 };
59 
60 enum MutableTrieNodeFlags {
61     kEndsWord = 0x0001      // This node marks the end of a valid word
62 };
63 
64 inline
TernaryNode(UChar uc)65 TernaryNode::TernaryNode(UChar uc) {
66     ch = uc;
67     flags = 0;
68     low = NULL;
69     equal = NULL;
70     high = NULL;
71 }
72 
73 // Not inline since it's recursive
~TernaryNode()74 TernaryNode::~TernaryNode() {
75     delete low;
76     delete equal;
77     delete high;
78 }
79 
MutableTrieDictionary(UChar median,UErrorCode & status)80 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
81     // Start the trie off with something. Having the root node already present
82     // cuts a special case out of the search/insertion functions.
83     // Making it a median character cuts the worse case for searches from
84     // 4x a balanced trie to 2x a balanced trie. It's best to choose something
85     // that starts a word that is midway in the list.
86     fTrie = new TernaryNode(median);
87     if (fTrie == NULL) {
88         status = U_MEMORY_ALLOCATION_ERROR;
89     }
90     fIter = utext_openUChars(NULL, NULL, 0, &status);
91     if (U_SUCCESS(status) && fIter == NULL) {
92         status = U_MEMORY_ALLOCATION_ERROR;
93     }
94 }
95 
MutableTrieDictionary(UErrorCode & status)96 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
97     fTrie = NULL;
98     fIter = utext_openUChars(NULL, NULL, 0, &status);
99     if (U_SUCCESS(status) && fIter == NULL) {
100         status = U_MEMORY_ALLOCATION_ERROR;
101     }
102 }
103 
~MutableTrieDictionary()104 MutableTrieDictionary::~MutableTrieDictionary() {
105     delete fTrie;
106     utext_close(fIter);
107 }
108 
109 int32_t
search(UText * text,int32_t maxLength,int32_t * lengths,int & count,int limit,TernaryNode * & parent,UBool & pMatched) const110 MutableTrieDictionary::search( UText *text,
111                                    int32_t maxLength,
112                                    int32_t *lengths,
113                                    int &count,
114                                    int limit,
115                                    TernaryNode *&parent,
116                                    UBool &pMatched ) const {
117     // TODO: current implementation works in UTF-16 space
118     const TernaryNode *up = NULL;
119     const TernaryNode *p = fTrie;
120     int mycount = 0;
121     pMatched = TRUE;
122     int i;
123 
124     UChar uc = utext_current32(text);
125     for (i = 0; i < maxLength && p != NULL; ++i) {
126         while (p != NULL) {
127             if (uc < p->ch) {
128                 up = p;
129                 p = p->low;
130             }
131             else if (uc == p->ch) {
132                 break;
133             }
134             else {
135                 up = p;
136                 p = p->high;
137             }
138         }
139         if (p == NULL) {
140             pMatched = FALSE;
141             break;
142         }
143         // Must be equal to get here
144         if (limit > 0 && (p->flags & kEndsWord)) {
145             lengths[mycount++] = i+1;
146             --limit;
147         }
148         up = p;
149         p = p->equal;
150         uc = utext_next32(text);
151         uc = utext_current32(text);
152     }
153 
154     // Note that there is no way to reach here with up == 0 unless
155     // maxLength is 0 coming in.
156     parent = (TernaryNode *)up;
157     count = mycount;
158     return i;
159 }
160 
161 void
addWord(const UChar * word,int32_t length,UErrorCode & status)162 MutableTrieDictionary::addWord( const UChar *word,
163                                 int32_t length,
164                                 UErrorCode &status ) {
165 #if 0
166     if (length <= 0) {
167         status = U_ILLEGAL_ARGUMENT_ERROR;
168         return;
169     }
170 #endif
171     TernaryNode *parent;
172     UBool pMatched;
173     int count;
174     fIter = utext_openUChars(fIter, word, length, &status);
175 
176     int matched;
177     matched = search(fIter, length, NULL, count, 0, parent, pMatched);
178 
179     while (matched++ < length) {
180         UChar32 uc = utext_next32(fIter);  // TODO:  supplemetary support?
181         U_ASSERT(uc != U_SENTINEL);
182         TernaryNode *newNode = new TernaryNode(uc);
183         if (newNode == NULL) {
184             status = U_MEMORY_ALLOCATION_ERROR;
185             return;
186         }
187         if (pMatched) {
188             parent->equal = newNode;
189         }
190         else {
191             pMatched = TRUE;
192             if (uc < parent->ch) {
193                 parent->low = newNode;
194             }
195             else {
196                 parent->high = newNode;
197             }
198         }
199         parent = newNode;
200     }
201 
202     parent->flags |= kEndsWord;
203 }
204 
205 #if 0
206 void
207 MutableTrieDictionary::addWords( UEnumeration *words,
208                                   UErrorCode &status ) {
209     int32_t length;
210     const UChar *word;
211     while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
212         addWord(word, length, status);
213     }
214 }
215 #endif
216 
217 int32_t
matches(UText * text,int32_t maxLength,int32_t * lengths,int & count,int limit) const218 MutableTrieDictionary::matches( UText *text,
219                                 int32_t maxLength,
220                                 int32_t *lengths,
221                                 int &count,
222                                 int limit ) const {
223     TernaryNode *parent;
224     UBool pMatched;
225     return search(text, maxLength, lengths, count, limit, parent, pMatched);
226 }
227 
228 // Implementation of iteration for MutableTrieDictionary
229 class MutableTrieEnumeration : public StringEnumeration {
230 private:
231     UStack      fNodeStack;     // Stack of nodes to process
232     UVector32   fBranchStack;   // Stack of which branch we are working on
233     TernaryNode *fRoot;         // Root node
234     enum StackBranch {
235         kLessThan,
236         kEqual,
237         kGreaterThan,
238         kDone
239     };
240 
241 public:
242     static UClassID U_EXPORT2 getStaticClassID(void);
243     virtual UClassID getDynamicClassID(void) const;
244 public:
MutableTrieEnumeration(TernaryNode * root,UErrorCode & status)245     MutableTrieEnumeration(TernaryNode *root, UErrorCode &status)
246         : fNodeStack(status), fBranchStack(status) {
247         fRoot = root;
248         fNodeStack.push(root, status);
249         fBranchStack.push(kLessThan, status);
250         unistr.remove();
251     }
252 
~MutableTrieEnumeration()253     virtual ~MutableTrieEnumeration() {
254     }
255 
clone() const256     virtual StringEnumeration *clone() const {
257         UErrorCode status = U_ZERO_ERROR;
258         return new MutableTrieEnumeration(fRoot, status);
259     }
260 
snext(UErrorCode & status)261     virtual const UnicodeString *snext(UErrorCode &status) {
262         if (fNodeStack.empty() || U_FAILURE(status)) {
263             return NULL;
264         }
265         TernaryNode *node = (TernaryNode *) fNodeStack.peek();
266         StackBranch where = (StackBranch) fBranchStack.peeki();
267         while (!fNodeStack.empty() && U_SUCCESS(status)) {
268             UBool emit;
269             UBool equal;
270 
271             switch (where) {
272             case kLessThan:
273                 if (node->low != NULL) {
274                     fBranchStack.setElementAt(kEqual, fBranchStack.size()-1);
275                     node = (TernaryNode *) fNodeStack.push(node->low, status);
276                     where = (StackBranch) fBranchStack.push(kLessThan, status);
277                     break;
278                 }
279             case kEqual:
280                 emit = (node->flags & kEndsWord) != 0;
281                 equal = (node->equal != NULL);
282                 // If this node should be part of the next emitted string, append
283                 // the UChar to the string, and make sure we pop it when we come
284                 // back to this node. The character should only be in the string
285                 // for as long as we're traversing the equal subtree of this node
286                 if (equal || emit) {
287                     unistr.append(node->ch);
288                     fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1);
289                 }
290                 if (equal) {
291                     node = (TernaryNode *) fNodeStack.push(node->equal, status);
292                     where = (StackBranch) fBranchStack.push(kLessThan, status);
293                 }
294                 if (emit) {
295                     return &unistr;
296                 }
297                 if (equal) {
298                     break;
299                 }
300             case kGreaterThan:
301                 // If this node's character is in the string, remove it.
302                 if (node->equal != NULL || (node->flags & kEndsWord)) {
303                     unistr.truncate(unistr.length()-1);
304                 }
305                 if (node->high != NULL) {
306                     fBranchStack.setElementAt(kDone, fBranchStack.size()-1);
307                     node = (TernaryNode *) fNodeStack.push(node->high, status);
308                     where = (StackBranch) fBranchStack.push(kLessThan, status);
309                     break;
310                 }
311             case kDone:
312                 fNodeStack.pop();
313                 fBranchStack.popi();
314                 node = (TernaryNode *) fNodeStack.peek();
315                 where = (StackBranch) fBranchStack.peeki();
316                 break;
317             default:
318                 return NULL;
319             }
320         }
321         return NULL;
322     }
323 
324     // Very expensive, but this should never be used.
count(UErrorCode & status) const325     virtual int32_t count(UErrorCode &status) const {
326         MutableTrieEnumeration counter(fRoot, status);
327         int32_t result = 0;
328         while (counter.snext(status) != NULL && U_SUCCESS(status)) {
329             ++result;
330         }
331         return result;
332     }
333 
reset(UErrorCode & status)334     virtual void reset(UErrorCode &status) {
335         fNodeStack.removeAllElements();
336         fBranchStack.removeAllElements();
337         fNodeStack.push(fRoot, status);
338         fBranchStack.push(kLessThan, status);
339         unistr.remove();
340     }
341 };
342 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)343 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)
344 
345 StringEnumeration *
346 MutableTrieDictionary::openWords( UErrorCode &status ) const {
347     if (U_FAILURE(status)) {
348         return NULL;
349     }
350     return new MutableTrieEnumeration(fTrie, status);
351 }
352 
353 /*******************************************************************
354  * CompactTrieDictionary
355  */
356 
357 struct CompactTrieHeader {
358     uint32_t        size;           // Size of the data in bytes
359     uint32_t        magic;          // Magic number (including version)
360     uint16_t        nodeCount;      // Number of entries in offsets[]
361     uint16_t        root;           // Node number of the root node
362     uint32_t        offsets[1];      // Offsets to nodes from start of data
363 };
364 
365 // Note that to avoid platform-specific alignment issues, all members of the node
366 // structures should be the same size, or should contain explicit padding to
367 // natural alignment boundaries.
368 
369 // We can't use a bitfield for the flags+count field, because the layout of those
370 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
371 struct CompactTrieNode {
372     uint16_t        flagscount;     // Count of sub-entries, plus flags
373 };
374 
375 enum CompactTrieNodeFlags {
376     kVerticalNode   = 0x1000,       // This is a vertical node
377     kParentEndsWord = 0x2000,       // The node whose equal link points to this ends a word
378     kReservedFlag1  = 0x4000,
379     kReservedFlag2  = 0x8000,
380     kCountMask      = 0x0FFF,       // The count portion of flagscount
381     kFlagMask       = 0xF000        // The flags portion of flagscount
382 };
383 
384 // The two node types are distinguished by the kVerticalNode flag.
385 
386 struct CompactTrieHorizontalEntry {
387     uint16_t        ch;             // UChar
388     uint16_t        equal;          // Equal link node index
389 };
390 
391 // We don't use inheritance here because C++ does not guarantee that the
392 // base class comes first in memory!!
393 
394 struct CompactTrieHorizontalNode {
395     uint16_t        flagscount;     // Count of sub-entries, plus flags
396     CompactTrieHorizontalEntry      entries[1];
397 };
398 
399 struct CompactTrieVerticalNode {
400     uint16_t        flagscount;     // Count of sub-entries, plus flags
401     uint16_t        equal;          // Equal link node index
402     uint16_t        chars[1];       // Code units
403 };
404 
405 // {'Dic', 1}, version 1
406 #define COMPACT_TRIE_MAGIC_1 0x44696301
407 
CompactTrieDictionary(UDataMemory * dataObj,UErrorCode & status)408 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
409                                                 UErrorCode &status )
410 : fUData(dataObj)
411 {
412     fData = (const CompactTrieHeader *) udata_getMemory(dataObj);
413     fOwnData = FALSE;
414     if (fData->magic != COMPACT_TRIE_MAGIC_1) {
415         status = U_ILLEGAL_ARGUMENT_ERROR;
416         fData = NULL;
417     }
418 }
CompactTrieDictionary(const void * data,UErrorCode & status)419 CompactTrieDictionary::CompactTrieDictionary( const void *data,
420                                                 UErrorCode &status )
421 : fUData(NULL)
422 {
423     fData = (const CompactTrieHeader *) data;
424     fOwnData = FALSE;
425     if (fData->magic != COMPACT_TRIE_MAGIC_1) {
426         status = U_ILLEGAL_ARGUMENT_ERROR;
427         fData = NULL;
428     }
429 }
430 
CompactTrieDictionary(const MutableTrieDictionary & dict,UErrorCode & status)431 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
432                                                 UErrorCode &status )
433 : fUData(NULL)
434 {
435     fData = compactMutableTrieDictionary(dict, status);
436     fOwnData = !U_FAILURE(status);
437 }
438 
~CompactTrieDictionary()439 CompactTrieDictionary::~CompactTrieDictionary() {
440     if (fOwnData) {
441         uprv_free((void *)fData);
442     }
443     if (fUData) {
444         udata_close(fUData);
445     }
446 }
447 
448 uint32_t
dataSize() const449 CompactTrieDictionary::dataSize() const {
450     return fData->size;
451 }
452 
453 const void *
data() const454 CompactTrieDictionary::data() const {
455     return fData;
456 }
457 
458 // This function finds the address of a node for us, given its node ID
459 static inline const CompactTrieNode *
getCompactNode(const CompactTrieHeader * header,uint16_t node)460 getCompactNode(const CompactTrieHeader *header, uint16_t node) {
461     return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
462 }
463 
464 int32_t
matches(UText * text,int32_t maxLength,int32_t * lengths,int & count,int limit) const465 CompactTrieDictionary::matches( UText *text,
466                                 int32_t maxLength,
467                                 int32_t *lengths,
468                                 int &count,
469                                 int limit ) const {
470     // TODO: current implementation works in UTF-16 space
471     const CompactTrieNode *node = getCompactNode(fData, fData->root);
472     int mycount = 0;
473 
474     UChar uc = utext_current32(text);
475     int i = 0;
476 
477     while (node != NULL) {
478         // Check if the node we just exited ends a word
479         if (limit > 0 && (node->flagscount & kParentEndsWord)) {
480             lengths[mycount++] = i;
481             --limit;
482         }
483         // Check that we haven't exceeded the maximum number of input characters.
484         // We have to do that here rather than in the while condition so that
485         // we can check for ending a word, above.
486         if (i >= maxLength) {
487             break;
488         }
489 
490         int nodeCount = (node->flagscount & kCountMask);
491         if (nodeCount == 0) {
492             // Special terminal node; return now
493             break;
494         }
495         if (node->flagscount & kVerticalNode) {
496             // Vertical node; check all the characters in it
497             const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
498             for (int j = 0; j < nodeCount && i < maxLength; ++j) {
499                 if (uc != vnode->chars[j]) {
500                     // We hit a non-equal character; return
501                     goto exit;
502                 }
503                 utext_next32(text);
504                 uc = utext_current32(text);
505                 ++i;
506             }
507             // To get here we must have come through the whole list successfully;
508             // go on to the next node. Note that a word cannot end in the middle
509             // of a vertical node.
510             node = getCompactNode(fData, vnode->equal);
511         }
512         else {
513             // Horizontal node; do binary search
514             const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
515             int low = 0;
516             int high = nodeCount-1;
517             int middle;
518             node = NULL;    // If we don't find a match, we'll fall out of the loop
519             while (high >= low) {
520                 middle = (high+low)/2;
521                 if (uc == hnode->entries[middle].ch) {
522                     // We hit a match; get the next node and next character
523                     node = getCompactNode(fData, hnode->entries[middle].equal);
524                     utext_next32(text);
525                     uc = utext_current32(text);
526                     ++i;
527                     break;
528                 }
529                 else if (uc < hnode->entries[middle].ch) {
530                     high = middle-1;
531                 }
532                 else {
533                     low = middle+1;
534                 }
535             }
536         }
537     }
538 exit:
539     count = mycount;
540     return i;
541 }
542 
543 // Implementation of iteration for CompactTrieDictionary
544 class CompactTrieEnumeration : public StringEnumeration {
545 private:
546     UVector32               fNodeStack;     // Stack of nodes to process
547     UVector32               fIndexStack;    // Stack of where in node we are
548     const CompactTrieHeader *fHeader;       // Trie data
549 
550 public:
551     static UClassID U_EXPORT2 getStaticClassID(void);
552     virtual UClassID getDynamicClassID(void) const;
553 public:
CompactTrieEnumeration(const CompactTrieHeader * header,UErrorCode & status)554     CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
555         : fNodeStack(status), fIndexStack(status) {
556         fHeader = header;
557         fNodeStack.push(header->root, status);
558         fIndexStack.push(0, status);
559         unistr.remove();
560     }
561 
~CompactTrieEnumeration()562     virtual ~CompactTrieEnumeration() {
563     }
564 
clone() const565     virtual StringEnumeration *clone() const {
566         UErrorCode status = U_ZERO_ERROR;
567         return new CompactTrieEnumeration(fHeader, status);
568     }
569 
570     virtual const UnicodeString * snext(UErrorCode &status);
571 
572     // Very expensive, but this should never be used.
count(UErrorCode & status) const573     virtual int32_t count(UErrorCode &status) const {
574         CompactTrieEnumeration counter(fHeader, status);
575         int32_t result = 0;
576         while (counter.snext(status) != NULL && U_SUCCESS(status)) {
577             ++result;
578         }
579         return result;
580     }
581 
reset(UErrorCode & status)582     virtual void reset(UErrorCode &status) {
583         fNodeStack.removeAllElements();
584         fIndexStack.removeAllElements();
585         fNodeStack.push(fHeader->root, status);
586         fIndexStack.push(0, status);
587         unistr.remove();
588     }
589 };
590 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) const591 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration)
592 
593 const UnicodeString *
594 CompactTrieEnumeration::snext(UErrorCode &status) {
595     if (fNodeStack.empty() || U_FAILURE(status)) {
596         return NULL;
597     }
598     const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
599     int where = fIndexStack.peeki();
600     while (!fNodeStack.empty() && U_SUCCESS(status)) {
601         int nodeCount = (node->flagscount & kCountMask);
602         UBool goingDown = FALSE;
603         if (nodeCount == 0) {
604             // Terminal node; go up immediately
605             fNodeStack.popi();
606             fIndexStack.popi();
607             node = getCompactNode(fHeader, fNodeStack.peeki());
608             where = fIndexStack.peeki();
609         }
610         else if (node->flagscount & kVerticalNode) {
611             // Vertical node
612             const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
613             if (where == 0) {
614                 // Going down
615                 unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount);
616                 fIndexStack.setElementAt(1, fIndexStack.size()-1);
617                 node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status));
618                 where = fIndexStack.push(0, status);
619                 goingDown = TRUE;
620             }
621             else {
622                 // Going up
623                 unistr.truncate(unistr.length()-nodeCount);
624                 fNodeStack.popi();
625                 fIndexStack.popi();
626                 node = getCompactNode(fHeader, fNodeStack.peeki());
627                 where = fIndexStack.peeki();
628             }
629         }
630         else {
631             // Horizontal node
632             const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
633             if (where > 0) {
634                 // Pop previous char
635                 unistr.truncate(unistr.length()-1);
636             }
637             if (where < nodeCount) {
638                 // Push on next node
639                 unistr.append((UChar)hnode->entries[where].ch);
640                 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
641                 node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status));
642                 where = fIndexStack.push(0, status);
643                 goingDown = TRUE;
644             }
645             else {
646                 // Going up
647                 fNodeStack.popi();
648                 fIndexStack.popi();
649                 node = getCompactNode(fHeader, fNodeStack.peeki());
650                 where = fIndexStack.peeki();
651             }
652         }
653         // Check if the parent of the node we've just gone down to ends a
654         // word. If so, return it.
655         if (goingDown && (node->flagscount & kParentEndsWord)) {
656             return &unistr;
657         }
658     }
659     return NULL;
660 }
661 
662 StringEnumeration *
openWords(UErrorCode & status) const663 CompactTrieDictionary::openWords( UErrorCode &status ) const {
664     if (U_FAILURE(status)) {
665         return NULL;
666     }
667     return new CompactTrieEnumeration(fData, status);
668 }
669 
670 //
671 // Below here is all code related to converting a ternary trie to a compact trie
672 // and back again
673 //
674 
675 // Helper classes to construct the compact trie
676 class BuildCompactTrieNode: public UMemory {
677  public:
678     UBool           fParentEndsWord;
679     UBool           fVertical;
680     UBool           fHasDuplicate;
681     int32_t         fNodeID;
682     UnicodeString   fChars;
683 
684  public:
BuildCompactTrieNode(UBool parentEndsWord,UBool vertical,UStack & nodes,UErrorCode & status)685     BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) {
686         fParentEndsWord = parentEndsWord;
687         fHasDuplicate = FALSE;
688         fVertical = vertical;
689         fNodeID = nodes.size();
690         nodes.push(this, status);
691     }
692 
~BuildCompactTrieNode()693     virtual ~BuildCompactTrieNode() {
694     }
695 
size()696     virtual uint32_t size() {
697         return sizeof(uint16_t);
698     }
699 
write(uint8_t * bytes,uint32_t & offset,const UVector32 &)700     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
701         // Write flag/count
702         *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)
703             | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 );
704         offset += sizeof(uint16_t);
705     }
706 };
707 
708 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
709  public:
710     UStack          fLinks;
711 
712  public:
BuildCompactTrieHorizontalNode(UBool parentEndsWord,UStack & nodes,UErrorCode & status)713     BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
714         : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) {
715     }
716 
~BuildCompactTrieHorizontalNode()717     virtual ~BuildCompactTrieHorizontalNode() {
718     }
719 
size()720     virtual uint32_t size() {
721         return offsetof(CompactTrieHorizontalNode,entries) +
722                 (fChars.length()*sizeof(CompactTrieHorizontalEntry));
723     }
724 
write(uint8_t * bytes,uint32_t & offset,const UVector32 & translate)725     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
726         BuildCompactTrieNode::write(bytes, offset, translate);
727         int32_t count = fChars.length();
728         for (int32_t i = 0; i < count; ++i) {
729             CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
730             entry->ch = fChars[i];
731             entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
732 #ifdef DEBUG_TRIE_DICT
733             if (entry->equal == 0) {
734                 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
735                         i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
736             }
737 #endif
738             offset += sizeof(CompactTrieHorizontalEntry);
739         }
740     }
741 
addNode(UChar ch,BuildCompactTrieNode * link,UErrorCode & status)742     void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
743         fChars.append(ch);
744         fLinks.push(link, status);
745     }
746 };
747 
748 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
749  public:
750     BuildCompactTrieNode    *fEqual;
751 
752  public:
BuildCompactTrieVerticalNode(UBool parentEndsWord,UStack & nodes,UErrorCode & status)753     BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
754         : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {
755         fEqual = NULL;
756     }
757 
~BuildCompactTrieVerticalNode()758     virtual ~BuildCompactTrieVerticalNode() {
759     }
760 
size()761     virtual uint32_t size() {
762         return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
763     }
764 
write(uint8_t * bytes,uint32_t & offset,const UVector32 & translate)765     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
766         CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
767         BuildCompactTrieNode::write(bytes, offset, translate);
768         node->equal = translate.elementAti(fEqual->fNodeID);
769         offset += sizeof(node->equal);
770 #ifdef DEBUG_TRIE_DICT
771         if (node->equal == 0) {
772             fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
773                     fEqual->fNodeID);
774         }
775 #endif
776         fChars.extract(0, fChars.length(), (UChar *)node->chars);
777         offset += sizeof(uint16_t)*fChars.length();
778     }
779 
addChar(UChar ch)780     void addChar(UChar ch) {
781         fChars.append(ch);
782     }
783 
setLink(BuildCompactTrieNode * node)784     void setLink(BuildCompactTrieNode *node) {
785         fEqual = node;
786     }
787 };
788 
789 // Forward declaration
790 static void walkHorizontal(const TernaryNode *node,
791                             BuildCompactTrieHorizontalNode *building,
792                             UStack &nodes,
793                             UErrorCode &status);
794 
795 // Convert one node. Uses recursion.
796 
797 static BuildCompactTrieNode *
compactOneNode(const TernaryNode * node,UBool parentEndsWord,UStack & nodes,UErrorCode & status)798 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) {
799     if (U_FAILURE(status)) {
800         return NULL;
801     }
802     BuildCompactTrieNode *result = NULL;
803     UBool horizontal = (node->low != NULL || node->high != NULL);
804     if (horizontal) {
805         BuildCompactTrieHorizontalNode *hResult =
806                 new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
807         if (hResult == NULL) {
808             status = U_MEMORY_ALLOCATION_ERROR;
809             return NULL;
810         }
811         if (U_SUCCESS(status)) {
812             walkHorizontal(node, hResult, nodes, status);
813             result = hResult;
814         }
815     }
816     else {
817         BuildCompactTrieVerticalNode *vResult =
818                 new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
819         if (vResult == NULL) {
820             status = U_MEMORY_ALLOCATION_ERROR;
821         }
822         else if (U_SUCCESS(status)) {
823             UBool   endsWord = FALSE;
824             // Take up nodes until we end a word, or hit a node with < or > links
825             do {
826                 vResult->addChar(node->ch);
827                 endsWord = (node->flags & kEndsWord) != 0;
828                 node = node->equal;
829             }
830             while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
831             if (node == NULL) {
832                 if (!endsWord) {
833                     status = U_ILLEGAL_ARGUMENT_ERROR;  // Corrupt input trie
834                 }
835                 else {
836                     vResult->setLink((BuildCompactTrieNode *)nodes[1]);
837                 }
838             }
839             else {
840                 vResult->setLink(compactOneNode(node, endsWord, nodes, status));
841             }
842             result = vResult;
843         }
844     }
845     return result;
846 }
847 
848 // Walk the set of peers at the same level, to build a horizontal node.
849 // Uses recursion.
850 
walkHorizontal(const TernaryNode * node,BuildCompactTrieHorizontalNode * building,UStack & nodes,UErrorCode & status)851 static void walkHorizontal(const TernaryNode *node,
852                             BuildCompactTrieHorizontalNode *building,
853                             UStack &nodes,
854                             UErrorCode &status) {
855     while (U_SUCCESS(status) && node != NULL) {
856         if (node->low != NULL) {
857             walkHorizontal(node->low, building, nodes, status);
858         }
859         BuildCompactTrieNode *link = NULL;
860         if (node->equal != NULL) {
861             link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
862         }
863         else if (node->flags & kEndsWord) {
864             link = (BuildCompactTrieNode *)nodes[1];
865         }
866         if (U_SUCCESS(status) && link != NULL) {
867             building->addNode(node->ch, link, status);
868         }
869         // Tail recurse manually instead of leaving it to the compiler.
870         //if (node->high != NULL) {
871         //    walkHorizontal(node->high, building, nodes, status);
872         //}
873         node = node->high;
874     }
875 }
876 
877 U_NAMESPACE_END
878 U_NAMESPACE_USE
879 U_CDECL_BEGIN
880 static int32_t U_CALLCONV
_sortBuildNodes(const void *,const void * voidl,const void * voidr)881 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
882     BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
883     BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
884     // Check for comparing a node to itself, to avoid spurious duplicates
885     if (left == right) {
886         return 0;
887     }
888     // Most significant is type of node. Can never coalesce.
889     if (left->fVertical != right->fVertical) {
890         return left->fVertical - right->fVertical;
891     }
892     // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
893     if (left->fParentEndsWord != right->fParentEndsWord) {
894         return left->fParentEndsWord - right->fParentEndsWord;
895     }
896     // Next, the string. If that differs, we can never coalesce.
897     int32_t result = left->fChars.compare(right->fChars);
898     if (result != 0) {
899         return result;
900     }
901     // We know they're both the same node type, so branch for the two cases.
902     if (left->fVertical) {
903         result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
904                             - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
905     }
906     else {
907         // We need to compare the links vectors. They should be the
908         // same size because the strings were equal.
909         // We compare the node IDs instead of the pointers, to handle
910         // coalesced nodes.
911         BuildCompactTrieHorizontalNode *hleft, *hright;
912         hleft = (BuildCompactTrieHorizontalNode *)left;
913         hright = (BuildCompactTrieHorizontalNode *)right;
914         int32_t count = hleft->fLinks.size();
915         for (int32_t i = 0; i < count && result == 0; ++i) {
916             result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
917                      ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
918         }
919     }
920     // If they are equal to each other, mark them (speeds coalescing)
921     if (result == 0) {
922         left->fHasDuplicate = TRUE;
923         right->fHasDuplicate = TRUE;
924     }
925     return result;
926 }
927 U_CDECL_END
928 U_NAMESPACE_BEGIN
929 
coalesceDuplicates(UStack & nodes,UErrorCode & status)930 static void coalesceDuplicates(UStack &nodes, UErrorCode &status) {
931     // We sort the array of nodes to place duplicates next to each other
932     if (U_FAILURE(status)) {
933         return;
934     }
935     int32_t size = nodes.size();
936     void **array = (void **)uprv_malloc(sizeof(void *)*size);
937     if (array == NULL) {
938         status = U_MEMORY_ALLOCATION_ERROR;
939         return;
940     }
941     (void) nodes.toArray(array);
942 
943     // Now repeatedly identify duplicates until there are no more
944     int32_t dupes = 0;
945     long    passCount = 0;
946 #ifdef DEBUG_TRIE_DICT
947     long    totalDupes = 0;
948 #endif
949     do {
950         BuildCompactTrieNode *node;
951         BuildCompactTrieNode *first = NULL;
952         BuildCompactTrieNode **p;
953         BuildCompactTrieNode **pFirst = NULL;
954         int32_t counter = size - 2;
955         // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
956         // pass for speed. For the second and subsequent passes, we use stable
957         // (insertion) sort for two reasons:
958         // 1. The array is already mostly ordered, so we get better performance.
959         // 2. The way we find one and only one instance of a set of duplicates is to
960         //    check that the node ID equals the array index. If we used an unstable
961         //    sort for the second or later passes, it's possible that none of the
962         //    duplicates would wind up with a node ID equal to its array index.
963         //    The sort stability guarantees that, because as we coalesce more and
964         //    more groups, the first element of the resultant group will be one of
965         //    the first elements of the groups being coalesced.
966         // To use quicksort for the second and subsequent passes, we would have to
967         // find the minimum of the node numbers in a group, and set all the nodes
968         // in the group to that node number.
969         uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status);
970         dupes = 0;
971         for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) {
972             node = *p;
973             if (node->fHasDuplicate) {
974                 if (first == NULL) {
975                     first = node;
976                     pFirst = p;
977                 }
978                 else if (_sortBuildNodes(NULL, pFirst, p) != 0) {
979                     // Starting a new run of dupes
980                     first = node;
981                     pFirst = p;
982                 }
983                 else if (node->fNodeID != first->fNodeID) {
984                     // Slave one to the other, note duplicate
985                     node->fNodeID = first->fNodeID;
986                     dupes += 1;
987                 }
988             }
989             else {
990                 // This node has no dupes
991                 first = NULL;
992                 pFirst = NULL;
993             }
994         }
995         passCount += 1;
996 #ifdef DEBUG_TRIE_DICT
997         totalDupes += dupes;
998         fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes);
999 #endif
1000     }
1001     while (dupes > 0);
1002 #ifdef DEBUG_TRIE_DICT
1003     fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount);
1004 #endif
1005 
1006     // We no longer need the temporary array, as the nodes have all been marked appropriately.
1007     uprv_free(array);
1008 }
1009 
1010 U_NAMESPACE_END
1011 U_CDECL_BEGIN
_deleteBuildNode(void * obj)1012 static void U_CALLCONV _deleteBuildNode(void *obj) {
1013     delete (BuildCompactTrieNode *) obj;
1014 }
1015 U_CDECL_END
1016 U_NAMESPACE_BEGIN
1017 
1018 CompactTrieHeader *
compactMutableTrieDictionary(const MutableTrieDictionary & dict,UErrorCode & status)1019 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict,
1020                                 UErrorCode &status ) {
1021     if (U_FAILURE(status)) {
1022         return NULL;
1023     }
1024 #ifdef DEBUG_TRIE_DICT
1025     struct tms timing;
1026     struct tms previous;
1027     (void) ::times(&previous);
1028 #endif
1029     UStack nodes(_deleteBuildNode, NULL, status);      // Index of nodes
1030 
1031     // Add node 0, used as the NULL pointer/sentinel.
1032     nodes.addElement((int32_t)0, status);
1033 
1034     // Start by creating the special empty node we use to indicate that the parent
1035     // terminates a word. This must be node 1, because the builder assumes
1036     // that.
1037     if (U_FAILURE(status)) {
1038         return NULL;
1039     }
1040     BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status);
1041     if (terminal == NULL) {
1042         status = U_MEMORY_ALLOCATION_ERROR;
1043     }
1044 
1045     // This call does all the work of building the new trie structure. The root
1046     // will be node 2.
1047     BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status);
1048 #ifdef DEBUG_TRIE_DICT
1049     (void) ::times(&timing);
1050     fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1051         nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1052         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1053     previous = timing;
1054 #endif
1055 
1056     // Now coalesce all duplicate nodes.
1057     coalesceDuplicates(nodes, status);
1058 #ifdef DEBUG_TRIE_DICT
1059     (void) ::times(&timing);
1060     fprintf(stderr, "Duplicates coalesced, time user %f system %f\n",
1061         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1062         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1063     previous = timing;
1064 #endif
1065 
1066     // Next, build the output trie.
1067     // First we compute all the sizes and build the node ID translation table.
1068     uint32_t totalSize = offsetof(CompactTrieHeader,offsets);
1069     int32_t count = nodes.size();
1070     int32_t nodeCount = 1;              // The sentinel node we already have
1071     BuildCompactTrieNode *node;
1072     int32_t i;
1073     UVector32 translate(count, status); // Should be no growth needed after this
1074     translate.push(0, status);          // The sentinel node
1075 
1076     if (U_FAILURE(status)) {
1077         return NULL;
1078     }
1079 
1080     for (i = 1; i < count; ++i) {
1081         node = (BuildCompactTrieNode *)nodes[i];
1082         if (node->fNodeID == i) {
1083             // Only one node out of each duplicate set is used
1084             if (i >= translate.size()) {
1085                 // Logically extend the mapping table
1086                 translate.setSize(i+1);
1087             }
1088             translate.setElementAt(nodeCount++, i);
1089             totalSize += node->size();
1090         }
1091     }
1092 
1093     // Check for overflowing 16 bits worth of nodes.
1094     if (nodeCount > 0x10000) {
1095         status = U_ILLEGAL_ARGUMENT_ERROR;
1096         return NULL;
1097     }
1098 
1099     // Add enough room for the offsets.
1100     totalSize += nodeCount*sizeof(uint32_t);
1101 #ifdef DEBUG_TRIE_DICT
1102     (void) ::times(&timing);
1103     fprintf(stderr, "Sizes/mapping done, time user %f system %f\n",
1104         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1105         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1106     previous = timing;
1107     fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize);
1108 #endif
1109     uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize);
1110     if (bytes == NULL) {
1111         status = U_MEMORY_ALLOCATION_ERROR;
1112         return NULL;
1113     }
1114 
1115     CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1116     header->size = totalSize;
1117     header->nodeCount = nodeCount;
1118     header->offsets[0] = 0;                     // Sentinel
1119     header->root = translate.elementAti(root->fNodeID);
1120 #ifdef DEBUG_TRIE_DICT
1121     if (header->root == 0) {
1122         fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID);
1123     }
1124 #endif
1125     uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
1126     nodeCount = 1;
1127     // Now write the data
1128     for (i = 1; i < count; ++i) {
1129         node = (BuildCompactTrieNode *)nodes[i];
1130         if (node->fNodeID == i) {
1131             header->offsets[nodeCount++] = offset;
1132             node->write(bytes, offset, translate);
1133         }
1134     }
1135 #ifdef DEBUG_TRIE_DICT
1136     (void) ::times(&timing);
1137     fprintf(stderr, "Trie built, time user %f system %f\n",
1138         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1139         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1140     previous = timing;
1141     fprintf(stderr, "Final offset is %d\n", offset);
1142 
1143     // Collect statistics on node types and sizes
1144     int hCount = 0;
1145     int vCount = 0;
1146     size_t hSize = 0;
1147     size_t vSize = 0;
1148     size_t hItemCount = 0;
1149     size_t vItemCount = 0;
1150     uint32_t previousOff = offset;
1151     for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1152         const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1153         if (node->flagscount & kVerticalNode) {
1154             vCount += 1;
1155             vItemCount += (node->flagscount & kCountMask);
1156             vSize += previousOff-header->offsets[nodeIdx];
1157         }
1158         else {
1159             hCount += 1;
1160             hItemCount += (node->flagscount & kCountMask);
1161             hSize += previousOff-header->offsets[nodeIdx];
1162         }
1163         previousOff = header->offsets[nodeIdx];
1164     }
1165     fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
1166                 (double)hSize/hCount, (double)hItemCount/hCount);
1167     fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
1168                 (double)vSize/vCount, (double)vItemCount/vCount);
1169 #endif
1170 
1171     if (U_FAILURE(status)) {
1172         uprv_free(bytes);
1173         header = NULL;
1174     }
1175     else {
1176         header->magic = COMPACT_TRIE_MAGIC_1;
1177     }
1178     return header;
1179 }
1180 
1181 // Forward declaration
1182 static TernaryNode *
1183 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status );
1184 
1185 
1186 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1187 static TernaryNode *
unpackHorizontalArray(const CompactTrieHeader * header,const CompactTrieHorizontalEntry * array,int low,int high,UErrorCode & status)1188 unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array,
1189                             int low, int high, UErrorCode &status ) {
1190     if (U_FAILURE(status) || low > high) {
1191         return NULL;
1192     }
1193     int middle = (low+high)/2;
1194     TernaryNode *result = new TernaryNode(array[middle].ch);
1195     if (result == NULL) {
1196         status = U_MEMORY_ALLOCATION_ERROR;
1197         return NULL;
1198     }
1199     const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);
1200     if (equal->flagscount & kParentEndsWord) {
1201         result->flags |= kEndsWord;
1202     }
1203     result->low = unpackHorizontalArray(header, array, low, middle-1, status);
1204     result->high = unpackHorizontalArray(header, array, middle+1, high, status);
1205     result->equal = unpackOneNode(header, equal, status);
1206     return result;
1207 }
1208 
1209 // Convert one compact trie node into a ternary subtrie
1210 static TernaryNode *
unpackOneNode(const CompactTrieHeader * header,const CompactTrieNode * node,UErrorCode & status)1211 unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) {
1212     int nodeCount = (node->flagscount & kCountMask);
1213     if (nodeCount == 0 || U_FAILURE(status)) {
1214         // Failure, or terminal node
1215         return NULL;
1216     }
1217     if (node->flagscount & kVerticalNode) {
1218         const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
1219         TernaryNode *head = NULL;
1220         TernaryNode *previous = NULL;
1221         TernaryNode *latest = NULL;
1222         for (int i = 0; i < nodeCount; ++i) {
1223             latest = new TernaryNode(vnode->chars[i]);
1224             if (latest == NULL) {
1225                 status = U_MEMORY_ALLOCATION_ERROR;
1226                 break;
1227             }
1228             if (head == NULL) {
1229                 head = latest;
1230             }
1231             if (previous != NULL) {
1232                 previous->equal = latest;
1233             }
1234             previous = latest;
1235         }
1236         if (latest != NULL) {
1237             const CompactTrieNode *equal = getCompactNode(header, vnode->equal);
1238             if (equal->flagscount & kParentEndsWord) {
1239                 latest->flags |= kEndsWord;
1240             }
1241             latest->equal = unpackOneNode(header, equal, status);
1242         }
1243         return head;
1244     }
1245     else {
1246         // Horizontal node
1247         const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
1248         return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status);
1249     }
1250 }
1251 
1252 MutableTrieDictionary *
cloneMutable(UErrorCode & status) const1253 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
1254     MutableTrieDictionary *result = new MutableTrieDictionary( status );
1255     if (result == NULL) {
1256         status = U_MEMORY_ALLOCATION_ERROR;
1257         return NULL;
1258     }
1259     TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status);
1260     if (U_FAILURE(status)) {
1261         delete root;    // Clean up
1262         delete result;
1263         return NULL;
1264     }
1265     result->fTrie = root;
1266     return result;
1267 }
1268 
1269 U_NAMESPACE_END
1270 
1271 U_CAPI int32_t U_EXPORT2
triedict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)1272 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
1273            UErrorCode *status) {
1274 
1275     if (status == NULL || U_FAILURE(*status)) {
1276         return 0;
1277     }
1278     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
1279         *status=U_ILLEGAL_ARGUMENT_ERROR;
1280         return 0;
1281     }
1282 
1283     //
1284     //  Check that the data header is for for dictionary data.
1285     //    (Header contents are defined in genxxx.cpp)
1286     //
1287     const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
1288     if(!(  pInfo->dataFormat[0]==0x54 &&   /* dataFormat="TrDc" */
1289            pInfo->dataFormat[1]==0x72 &&
1290            pInfo->dataFormat[2]==0x44 &&
1291            pInfo->dataFormat[3]==0x63 &&
1292            pInfo->formatVersion[0]==1  )) {
1293         udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
1294                          pInfo->dataFormat[0], pInfo->dataFormat[1],
1295                          pInfo->dataFormat[2], pInfo->dataFormat[3],
1296                          pInfo->formatVersion[0]);
1297         *status=U_UNSUPPORTED_ERROR;
1298         return 0;
1299     }
1300 
1301     //
1302     // Swap the data header.  (This is the generic ICU Data Header, not the
1303     //                         CompactTrieHeader).  This swap also conveniently gets us
1304     //                         the size of the ICU d.h., which lets us locate the start
1305     //                         of the RBBI specific data.
1306     //
1307     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
1308 
1309     //
1310     // Get the CompactTrieHeader, and check that it appears to be OK.
1311     //
1312     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
1313     const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
1314     if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
1315             || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
1316     {
1317         udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
1318         *status=U_UNSUPPORTED_ERROR;
1319         return 0;
1320     }
1321 
1322     //
1323     // Prefight operation?  Just return the size
1324     //
1325     uint32_t totalSize = ds->readUInt32(header->size);
1326     int32_t sizeWithUData = (int32_t)totalSize + headerSize;
1327     if (length < 0) {
1328         return sizeWithUData;
1329     }
1330 
1331     //
1332     // Check that length passed in is consistent with length from RBBI data header.
1333     //
1334     if (length < sizeWithUData) {
1335         udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1336                             totalSize);
1337         *status=U_INDEX_OUTOFBOUNDS_ERROR;
1338         return 0;
1339         }
1340 
1341     //
1342     // Swap the Data.  Do the data itself first, then the CompactTrieHeader, because
1343     //                 we need to reference the header to locate the data, and an
1344     //                 inplace swap of the header leaves it unusable.
1345     //
1346     uint8_t             *outBytes = (uint8_t *)outData + headerSize;
1347     CompactTrieHeader   *outputHeader = (CompactTrieHeader *)outBytes;
1348 
1349 #if 0
1350     //
1351     // If not swapping in place, zero out the output buffer before starting.
1352     //
1353     if (inBytes != outBytes) {
1354         uprv_memset(outBytes, 0, totalSize);
1355     }
1356 
1357     // We need to loop through all the nodes in the offset table, and swap each one.
1358     uint16_t nodeCount = ds->readUInt16(header->nodeCount);
1359     // Skip node 0, which should always be 0.
1360     for (int i = 1; i < nodeCount; ++i) {
1361         uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
1362         const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
1363         CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
1364         uint16_t flagscount = ds->readUInt16(inNode->flagscount);
1365         uint16_t itemCount = flagscount & kCountMask;
1366         ds->writeUInt16(&outNode->flagscount, flagscount);
1367         if (itemCount > 0) {
1368             if (flagscount & kVerticalNode) {
1369                 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
1370                                     itemCount*sizeof(uint16_t),
1371                                     outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
1372                 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
1373                 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
1374             }
1375             else {
1376                 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode;
1377                 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode;
1378                 for (int j = 0; j < itemCount; ++j) {
1379                     uint16_t word = ds->readUInt16(inHNode->entries[j].ch);
1380                     ds->writeUInt16(&outHNode->entries[j].ch, word);
1381                     word = ds->readUInt16(inHNode->entries[j].equal);
1382                     ds->writeUInt16(&outHNode->entries[j].equal, word);
1383                 }
1384             }
1385         }
1386     }
1387 #endif
1388 
1389     // All the data in all the nodes consist of 16 bit items. Swap them all at once.
1390     uint16_t nodeCount = ds->readUInt16(header->nodeCount);
1391     uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t));
1392     ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
1393 
1394     // Swap the header
1395     ds->writeUInt32(&outputHeader->size, totalSize);
1396     uint32_t magic = ds->readUInt32(header->magic);
1397     ds->writeUInt32(&outputHeader->magic, magic);
1398     ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
1399     uint16_t root = ds->readUInt16(header->root);
1400     ds->writeUInt16(&outputHeader->root, root);
1401     ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
1402             sizeof(uint32_t)*(int32_t)nodeCount,
1403             outBytes+offsetof(CompactTrieHeader,offsets), status);
1404 
1405     return sizeWithUData;
1406 }
1407 
1408 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1409