1 /**
2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_BREAK_ITERATION
11
12 #include "triedict.h"
13 #include "unicode/chariter.h"
14 #include "unicode/uchriter.h"
15 #include "unicode/strenum.h"
16 #include "unicode/uenum.h"
17 #include "unicode/udata.h"
18 #include "cmemory.h"
19 #include "udataswp.h"
20 #include "uvector.h"
21 #include "uvectr32.h"
22 #include "uarrsort.h"
23 #include "hash.h"
24
25 //#define DEBUG_TRIE_DICT 1
26
27 #ifdef DEBUG_TRIE_DICT
28 #include <sys/times.h>
29 #include <limits.h>
30 #include <stdio.h>
31 #include <time.h>
32 #ifndef CLK_TCK
33 #define CLK_TCK CLOCKS_PER_SEC
34 #endif
35
36 #endif
37
38 U_NAMESPACE_BEGIN
39
40 /*******************************************************************
41 * TrieWordDictionary
42 */
43
TrieWordDictionary()44 TrieWordDictionary::TrieWordDictionary() {
45 }
46
~TrieWordDictionary()47 TrieWordDictionary::~TrieWordDictionary() {
48 }
49
50 /*******************************************************************
51 * MutableTrieDictionary
52 */
53
54 //#define MAX_VALUE 65535
55
56 // forward declaration
57 inline uint16_t scaleLogProbabilities(double logprob);
58
59 // Node structure for the ternary, uncompressed trie
60 struct TernaryNode : public UMemory {
61 UChar ch; // UTF-16 code unit
62 uint16_t flags; // Flag word
63 TernaryNode *low; // Less-than link
64 TernaryNode *equal; // Equal link
65 TernaryNode *high; // Greater-than link
66
67 TernaryNode(UChar uc);
68 ~TernaryNode();
69 };
70
71 enum MutableTrieNodeFlags {
72 kEndsWord = 0x0001 // This node marks the end of a valid word
73 };
74
75 inline
TernaryNode(UChar uc)76 TernaryNode::TernaryNode(UChar uc) {
77 ch = uc;
78 flags = 0;
79 low = NULL;
80 equal = NULL;
81 high = NULL;
82 }
83
84 // Not inline since it's recursive
~TernaryNode()85 TernaryNode::~TernaryNode() {
86 delete low;
87 delete equal;
88 delete high;
89 }
90
MutableTrieDictionary(UChar median,UErrorCode & status,UBool containsValue)91 MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
92 UBool containsValue /* = FALSE */ ) {
93 // Start the trie off with something. Having the root node already present
94 // cuts a special case out of the search/insertion functions.
95 // Making it a median character cuts the worse case for searches from
96 // 4x a balanced trie to 2x a balanced trie. It's best to choose something
97 // that starts a word that is midway in the list.
98 fTrie = new TernaryNode(median);
99 if (fTrie == NULL) {
100 status = U_MEMORY_ALLOCATION_ERROR;
101 }
102 fIter = utext_openUChars(NULL, NULL, 0, &status);
103 if (U_SUCCESS(status) && fIter == NULL) {
104 status = U_MEMORY_ALLOCATION_ERROR;
105 }
106
107 fValued = containsValue;
108 }
109
MutableTrieDictionary(UErrorCode & status,UBool containsValue)110 MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
111 UBool containsValue /* = false */ ) {
112 fTrie = NULL;
113 fIter = utext_openUChars(NULL, NULL, 0, &status);
114 if (U_SUCCESS(status) && fIter == NULL) {
115 status = U_MEMORY_ALLOCATION_ERROR;
116 }
117
118 fValued = containsValue;
119 }
120
~MutableTrieDictionary()121 MutableTrieDictionary::~MutableTrieDictionary() {
122 delete fTrie;
123 utext_close(fIter);
124 }
125
126 int32_t
search(UText * text,int32_t maxLength,int32_t * lengths,int & count,int limit,TernaryNode * & parent,UBool & pMatched,uint16_t * values) const127 MutableTrieDictionary::search( UText *text,
128 int32_t maxLength,
129 int32_t *lengths,
130 int &count,
131 int limit,
132 TernaryNode *&parent,
133 UBool &pMatched,
134 uint16_t *values /*=NULL*/) const {
135 // TODO: current implementation works in UTF-16 space
136 const TernaryNode *up = NULL;
137 const TernaryNode *p = fTrie;
138 int mycount = 0;
139 pMatched = TRUE;
140 int i;
141
142 if (!fValued) {
143 values = NULL;
144 }
145
146 UChar uc = utext_current32(text);
147 for (i = 0; i < maxLength && p != NULL; ++i) {
148 while (p != NULL) {
149 if (uc < p->ch) {
150 up = p;
151 p = p->low;
152 }
153 else if (uc == p->ch) {
154 break;
155 }
156 else {
157 up = p;
158 p = p->high;
159 }
160 }
161 if (p == NULL) {
162 pMatched = FALSE;
163 break;
164 }
165 // Must be equal to get here
166 if (limit > 0 && (p->flags > 0)) {
167 //is there a more efficient way to add values? ie. remove if stmt
168 if(values != NULL) {
169 values[mycount] = p->flags;
170 }
171 lengths[mycount++] = i+1;
172 --limit;
173 }
174 up = p;
175 p = p->equal;
176 uc = utext_next32(text);
177 uc = utext_current32(text);
178 }
179
180 // Note that there is no way to reach here with up == 0 unless
181 // maxLength is 0 coming in.
182 parent = (TernaryNode *)up;
183 count = mycount;
184 return i;
185 }
186
187 void
addWord(const UChar * word,int32_t length,UErrorCode & status,uint16_t value)188 MutableTrieDictionary::addWord( const UChar *word,
189 int32_t length,
190 UErrorCode &status,
191 uint16_t value /* = 0 */ ) {
192 // dictionary cannot store zero values, would interfere with flags
193 if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
194 status = U_ILLEGAL_ARGUMENT_ERROR;
195 return;
196 }
197
198 TernaryNode *parent;
199 UBool pMatched;
200 int count;
201 fIter = utext_openUChars(fIter, word, length, &status);
202
203 int matched;
204 matched = search(fIter, length, NULL, count, 0, parent, pMatched);
205
206 while (matched++ < length) {
207 UChar32 uc = utext_next32(fIter); // TODO: supplementary support?
208 U_ASSERT(uc != U_SENTINEL);
209 TernaryNode *newNode = new TernaryNode(uc);
210 if (newNode == NULL) {
211 status = U_MEMORY_ALLOCATION_ERROR;
212 return;
213 }
214 if (pMatched) {
215 parent->equal = newNode;
216 }
217 else {
218 pMatched = TRUE;
219 if (uc < parent->ch) {
220 parent->low = newNode;
221 }
222 else {
223 parent->high = newNode;
224 }
225 }
226 parent = newNode;
227 }
228
229 if(fValued && value > 0){
230 parent->flags = value;
231 } else {
232 parent->flags |= kEndsWord;
233 }
234 }
235
236 int32_t
matches(UText * text,int32_t maxLength,int32_t * lengths,int & count,int limit,uint16_t * values) const237 MutableTrieDictionary::matches( UText *text,
238 int32_t maxLength,
239 int32_t *lengths,
240 int &count,
241 int limit,
242 uint16_t *values /*=NULL*/) const {
243 TernaryNode *parent;
244 UBool pMatched;
245 return search(text, maxLength, lengths, count, limit, parent, pMatched, values);
246 }
247
248 // Implementation of iteration for MutableTrieDictionary
249 class MutableTrieEnumeration : public StringEnumeration {
250 private:
251 UStack fNodeStack; // Stack of nodes to process
252 UVector32 fBranchStack; // Stack of which branch we are working on
253 TernaryNode *fRoot; // Root node
254 enum StackBranch {
255 kLessThan,
256 kEqual,
257 kGreaterThan,
258 kDone
259 };
260
261 public:
262 static UClassID U_EXPORT2 getStaticClassID(void);
263 virtual UClassID getDynamicClassID(void) const;
264 public:
MutableTrieEnumeration(TernaryNode * root,UErrorCode & status)265 MutableTrieEnumeration(TernaryNode *root, UErrorCode &status)
266 : fNodeStack(status), fBranchStack(status) {
267 fRoot = root;
268 fNodeStack.push(root, status);
269 fBranchStack.push(kLessThan, status);
270 unistr.remove();
271 }
272
~MutableTrieEnumeration()273 virtual ~MutableTrieEnumeration() {
274 }
275
clone() const276 virtual StringEnumeration *clone() const {
277 UErrorCode status = U_ZERO_ERROR;
278 return new MutableTrieEnumeration(fRoot, status);
279 }
280
snext(UErrorCode & status)281 virtual const UnicodeString *snext(UErrorCode &status) {
282 if (fNodeStack.empty() || U_FAILURE(status)) {
283 return NULL;
284 }
285 TernaryNode *node = (TernaryNode *) fNodeStack.peek();
286 StackBranch where = (StackBranch) fBranchStack.peeki();
287 while (!fNodeStack.empty() && U_SUCCESS(status)) {
288 UBool emit;
289 UBool equal;
290
291 switch (where) {
292 case kLessThan:
293 if (node->low != NULL) {
294 fBranchStack.setElementAt(kEqual, fBranchStack.size()-1);
295 node = (TernaryNode *) fNodeStack.push(node->low, status);
296 where = (StackBranch) fBranchStack.push(kLessThan, status);
297 break;
298 }
299 case kEqual:
300 emit = node->flags > 0;
301 equal = (node->equal != NULL);
302 // If this node should be part of the next emitted string, append
303 // the UChar to the string, and make sure we pop it when we come
304 // back to this node. The character should only be in the string
305 // for as long as we're traversing the equal subtree of this node
306 if (equal || emit) {
307 unistr.append(node->ch);
308 fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1);
309 }
310 if (equal) {
311 node = (TernaryNode *) fNodeStack.push(node->equal, status);
312 where = (StackBranch) fBranchStack.push(kLessThan, status);
313 }
314 if (emit) {
315 return &unistr;
316 }
317 if (equal) {
318 break;
319 }
320 case kGreaterThan:
321 // If this node's character is in the string, remove it.
322 if (node->equal != NULL || node->flags > 0) {
323 unistr.truncate(unistr.length()-1);
324 }
325 if (node->high != NULL) {
326 fBranchStack.setElementAt(kDone, fBranchStack.size()-1);
327 node = (TernaryNode *) fNodeStack.push(node->high, status);
328 where = (StackBranch) fBranchStack.push(kLessThan, status);
329 break;
330 }
331 case kDone:
332 fNodeStack.pop();
333 fBranchStack.popi();
334 node = (TernaryNode *) fNodeStack.peek();
335 where = (StackBranch) fBranchStack.peeki();
336 break;
337 default:
338 return NULL;
339 }
340 }
341 return NULL;
342 }
343
344 // Very expensive, but this should never be used.
count(UErrorCode & status) const345 virtual int32_t count(UErrorCode &status) const {
346 MutableTrieEnumeration counter(fRoot, status);
347 int32_t result = 0;
348 while (counter.snext(status) != NULL && U_SUCCESS(status)) {
349 ++result;
350 }
351 return result;
352 }
353
reset(UErrorCode & status)354 virtual void reset(UErrorCode &status) {
355 fNodeStack.removeAllElements();
356 fBranchStack.removeAllElements();
357 fNodeStack.push(fRoot, status);
358 fBranchStack.push(kLessThan, status);
359 unistr.remove();
360 }
361 };
362
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)363 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)
364
365 StringEnumeration *
366 MutableTrieDictionary::openWords( UErrorCode &status ) const {
367 if (U_FAILURE(status)) {
368 return NULL;
369 }
370 return new MutableTrieEnumeration(fTrie, status);
371 }
372
373 /*******************************************************************
374 * CompactTrieDictionary
375 */
376
377 //TODO further optimization:
378 // minimise size of trie with logprobs by storing values
379 // for terminal nodes directly in offsets[]
380 // --> calculating from next offset *might* be simpler, but would have to add
381 // one last offset for logprob of last node
382 // --> if calculate from current offset, need to factor in possible overflow
383 // as well.
384 // idea: store in offset, set first bit to indicate logprob storage-->won't
385 // have to access additional node
386
387 // {'Dic', 1}, version 1: uses old header, no values
388 #define COMPACT_TRIE_MAGIC_1 0x44696301
389 // version 2: uses new header (more than 2^16 nodes), no values
390 #define COMPACT_TRIE_MAGIC_2 0x44696302
391 // version 3: uses new header, includes values
392 #define COMPACT_TRIE_MAGIC_3 0x44696303
393
394 struct CompactTrieHeader {
395 uint32_t size; // Size of the data in bytes
396 uint32_t magic; // Magic number (including version)
397 uint32_t nodeCount; // Number of entries in offsets[]
398 uint32_t root; // Node number of the root node
399 uint32_t offsets[1]; // Offsets to nodes from start of data
400 };
401
402 // old version of CompactTrieHeader kept for backwards compatibility
403 struct CompactTrieHeaderV1 {
404 uint32_t size; // Size of the data in bytes
405 uint32_t magic; // Magic number (including version)
406 uint16_t nodeCount; // Number of entries in offsets[]
407 uint16_t root; // Node number of the root node
408 uint32_t offsets[1]; // Offsets to nodes from start of data
409 };
410
411 // Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
412 struct CompactTrieInfo {
413 uint32_t size; // Size of the data in bytes
414 uint32_t magic; // Magic number (including version)
415 uint32_t nodeCount; // Number of entries in offsets[]
416 uint32_t root; // Node number of the root node
417 uint32_t *offsets; // Offsets to nodes from start of data
418 uint8_t *address; // pointer to header bytes in memory
419
CompactTrieInfoCompactTrieInfo420 CompactTrieInfo(const void *data, UErrorCode &status){
421 CompactTrieHeader *header = (CompactTrieHeader *) data;
422 if (header->magic != COMPACT_TRIE_MAGIC_1 &&
423 header->magic != COMPACT_TRIE_MAGIC_2 &&
424 header->magic != COMPACT_TRIE_MAGIC_3) {
425 status = U_ILLEGAL_ARGUMENT_ERROR;
426 } else {
427 size = header->size;
428 magic = header->magic;
429
430 if (header->magic == COMPACT_TRIE_MAGIC_1) {
431 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
432 nodeCount = headerV1->nodeCount;
433 root = headerV1->root;
434 offsets = &(headerV1->offsets[0]);
435 address = (uint8_t *)headerV1;
436 } else {
437 nodeCount = header->nodeCount;
438 root = header->root;
439 offsets = &(header->offsets[0]);
440 address = (uint8_t *)header;
441 }
442 }
443 }
444
~CompactTrieInfoCompactTrieInfo445 ~CompactTrieInfo(){}
446 };
447
448 // Note that to avoid platform-specific alignment issues, all members of the node
449 // structures should be the same size, or should contain explicit padding to
450 // natural alignment boundaries.
451
452 // We can't use a bitfield for the flags+count field, because the layout of those
453 // is not portable. 12 bits of count allows for up to 4096 entries in a node.
454 struct CompactTrieNode {
455 uint16_t flagscount; // Count of sub-entries, plus flags
456 };
457
458 enum CompactTrieNodeFlags {
459 kVerticalNode = 0x1000, // This is a vertical node
460 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word
461 kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1
462 kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2
463 kCountMask = 0x0FFF, // The count portion of flagscount
464 kFlagMask = 0xF000, // The flags portion of flagscount
465 kRootCountMask = 0x7FFF // The count portion of flagscount in the root node
466
467 //offset flags:
468 //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node
469 };
470
471 // The two node types are distinguished by the kVerticalNode flag.
472
473 struct CompactTrieHorizontalEntry {
474 uint16_t ch; // UChar
475 uint16_t equal; // Equal link node index
476 };
477
478 // We don't use inheritance here because C++ does not guarantee that the
479 // base class comes first in memory!!
480
481 struct CompactTrieHorizontalNode {
482 uint16_t flagscount; // Count of sub-entries, plus flags
483 CompactTrieHorizontalEntry entries[1];
484 };
485
486 struct CompactTrieVerticalNode {
487 uint16_t flagscount; // Count of sub-entries, plus flags
488 uint16_t equal; // Equal link node index
489 uint16_t chars[1]; // Code units
490 };
491
CompactTrieDictionary(UDataMemory * dataObj,UErrorCode & status)492 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
493 UErrorCode &status )
494 : fUData(dataObj)
495 {
496 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
497 *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status);
498 fOwnData = FALSE;
499 }
500
CompactTrieDictionary(const void * data,UErrorCode & status)501 CompactTrieDictionary::CompactTrieDictionary( const void *data,
502 UErrorCode &status )
503 : fUData(NULL)
504 {
505 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
506 *fInfo = CompactTrieInfo(data, status);
507 fOwnData = FALSE;
508 }
509
CompactTrieDictionary(const MutableTrieDictionary & dict,UErrorCode & status)510 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
511 UErrorCode &status )
512 : fUData(NULL)
513 {
514 const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);
515 if (U_SUCCESS(status)) {
516 fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo));
517 *fInfo = CompactTrieInfo(header, status);
518 }
519
520 fOwnData = !U_FAILURE(status);
521 }
522
~CompactTrieDictionary()523 CompactTrieDictionary::~CompactTrieDictionary() {
524 if (fOwnData) {
525 uprv_free((void *)(fInfo->address));
526 }
527 uprv_free((void *)fInfo);
528
529 if (fUData) {
530 udata_close(fUData);
531 }
532 }
533
getValued() const534 UBool CompactTrieDictionary::getValued() const{
535 return fInfo->magic == COMPACT_TRIE_MAGIC_3;
536 }
537
538 uint32_t
dataSize() const539 CompactTrieDictionary::dataSize() const {
540 return fInfo->size;
541 }
542
543 const void *
data() const544 CompactTrieDictionary::data() const {
545 return fInfo->address;
546 }
547
548 //This function finds the address of a node for us, given its node ID
549 static inline const CompactTrieNode *
getCompactNode(const CompactTrieInfo * info,uint32_t node)550 getCompactNode(const CompactTrieInfo *info, uint32_t node) {
551 if(node < info->root-1) {
552 return (const CompactTrieNode *)(&info->offsets[node]);
553 } else {
554 return (const CompactTrieNode *)(info->address + info->offsets[node]);
555 }
556 }
557
558 //this version of getCompactNode is currently only used in compactMutableTrieDictionary()
559 static inline const CompactTrieNode *
getCompactNode(const CompactTrieHeader * header,uint32_t node)560 getCompactNode(const CompactTrieHeader *header, uint32_t node) {
561 if(node < header->root-1) {
562 return (const CompactTrieNode *)(&header->offsets[node]);
563 } else {
564 return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
565 }
566 }
567
568
569 /**
570 * Calculates the number of links in a node
571 * @node The specified node
572 */
573 static inline const uint16_t
getCount(const CompactTrieNode * node)574 getCount(const CompactTrieNode *node){
575 return (node->flagscount & kCountMask);
576 //use the code below if number of links ever exceed 4096
577 //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);
578 }
579
580 /**
581 * calculates an equal link node ID of a horizontal node
582 * @hnode The horizontal node containing the equal link
583 * @param index The index into hnode->entries[]
584 * @param nodeCount The length of hnode->entries[]
585 */
calcEqualLink(const CompactTrieVerticalNode * vnode)586 static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
587 if(vnode->flagscount & kEqualOverflows){
588 // treat overflow bits as an extension of chars[]
589 uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)];
590 return vnode->equal + (((uint32_t)*overflow) << 16);
591 }else{
592 return vnode->equal;
593 }
594 }
595
596 /**
597 * calculates an equal link node ID of a horizontal node
598 * @hnode The horizontal node containing the equal link
599 * @param index The index into hnode->entries[]
600 * @param nodeCount The length of hnode->entries[]
601 */
calcEqualLink(const CompactTrieHorizontalNode * hnode,uint16_t index,uint16_t nodeCount)602 static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){
603 if(hnode->flagscount & kEqualOverflows){
604 //set overflow to point to the uint16_t containing the overflow bits
605 uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
606 overflow += index/4;
607 uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10;
608 return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
609 } else {
610 return hnode->entries[index].equal;
611 }
612 }
613
614 /**
615 * Returns the value stored in the specified node which is associated with its
616 * parent node.
617 * TODO: how to tell that value is stored in node or in offset? check whether
618 * node ID < fInfo->root!
619 */
getValue(const CompactTrieHorizontalNode * hnode)620 static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
621 uint16_t count = getCount((CompactTrieNode *)hnode);
622 uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
623
624 if(hnode->flagscount & kEqualOverflows)
625 overflowSize = (count + 3) / 4 * sizeof(uint16_t);
626 return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize));
627 }
628
getValue(const CompactTrieVerticalNode * vnode)629 static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
630 // calculate size of total node ID overflow storage in bytes
631 uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0;
632 return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize));
633 }
634
getValue(const CompactTrieNode * node)635 static inline uint16_t getValue(const CompactTrieNode *node){
636 if(node->flagscount & kVerticalNode)
637 return getValue((const CompactTrieVerticalNode *)node);
638 else
639 return getValue((const CompactTrieHorizontalNode *)node);
640 }
641
642 //returns index of match in CompactTrieHorizontalNode.entries[] using binary search
643 inline int16_t
searchHorizontalEntries(const CompactTrieHorizontalEntry * entries,UChar uc,uint16_t nodeCount)644 searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,
645 UChar uc, uint16_t nodeCount){
646 int low = 0;
647 int high = nodeCount-1;
648 int middle;
649 while (high >= low) {
650 middle = (high+low)/2;
651 if (uc == entries[middle].ch) {
652 return middle;
653 }
654 else if (uc < entries[middle].ch) {
655 high = middle-1;
656 }
657 else {
658 low = middle+1;
659 }
660 }
661
662 return -1;
663 }
664
665 int32_t
matches(UText * text,int32_t maxLength,int32_t * lengths,int & count,int limit,uint16_t * values) const666 CompactTrieDictionary::matches( UText *text,
667 int32_t maxLength,
668 int32_t *lengths,
669 int &count,
670 int limit,
671 uint16_t *values /*= NULL*/) const {
672 if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
673 values = NULL;
674
675 // TODO: current implementation works in UTF-16 space
676 const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
677 int mycount = 0;
678
679 UChar uc = utext_current32(text);
680 int i = 0;
681
682 // handle root node with only kEqualOverflows flag: assume horizontal node without parent
683 if(node != NULL){
684 const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;
685 int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
686 if(index > -1){
687 node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask));
688 utext_next32(text);
689 uc = utext_current32(text);
690 ++i;
691 }else{
692 node = NULL;
693 }
694 }
695
696 while (node != NULL) {
697 // Check if the node we just exited ends a word
698 if (limit > 0 && (node->flagscount & kParentEndsWord)) {
699 if(values != NULL){
700 values[mycount] = getValue(node);
701 }
702 lengths[mycount++] = i;
703 --limit;
704 }
705 // Check that we haven't exceeded the maximum number of input characters.
706 // We have to do that here rather than in the while condition so that
707 // we can check for ending a word, above.
708 if (i >= maxLength) {
709 break;
710 }
711
712 int nodeCount = getCount(node);
713 if (nodeCount == 0) {
714 // Special terminal node; return now
715 break;
716 }
717 if (node->flagscount & kVerticalNode) {
718 // Vertical node; check all the characters in it
719 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
720 for (int j = 0; j < nodeCount && i < maxLength; ++j) {
721 if (uc != vnode->chars[j]) {
722 // We hit a non-equal character; return
723 goto exit;
724 }
725 utext_next32(text);
726 uc = utext_current32(text);
727 ++i;
728 }
729 // To get here we must have come through the whole list successfully;
730 // go on to the next node. Note that a word cannot end in the middle
731 // of a vertical node.
732 node = getCompactNode(fInfo, calcEqualLink(vnode));
733 }
734 else {
735 // Horizontal node; do binary search
736 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
737 const CompactTrieHorizontalEntry *entries;
738 entries = hnode->entries;
739
740 int index = searchHorizontalEntries(entries, uc, nodeCount);
741 if(index > -1){ //
742 // We hit a match; get the next node and next character
743 node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount));
744 utext_next32(text);
745 uc = utext_current32(text);
746 ++i;
747 }else{
748 node = NULL; // If we don't find a match, we'll fall out of the loop
749 }
750 }
751 }
752 exit:
753 count = mycount;
754 return i;
755 }
756
757 // Implementation of iteration for CompactTrieDictionary
758 class CompactTrieEnumeration : public StringEnumeration {
759 private:
760 UVector32 fNodeStack; // Stack of nodes to process
761 UVector32 fIndexStack; // Stack of where in node we are
762 const CompactTrieInfo *fInfo; // Trie data
763
764 public:
765 static UClassID U_EXPORT2 getStaticClassID(void);
766 virtual UClassID getDynamicClassID(void) const;
767 public:
CompactTrieEnumeration(const CompactTrieInfo * info,UErrorCode & status)768 CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
769 : fNodeStack(status), fIndexStack(status) {
770 fInfo = info;
771 fNodeStack.push(info->root, status);
772 fIndexStack.push(0, status);
773 unistr.remove();
774 }
775
~CompactTrieEnumeration()776 virtual ~CompactTrieEnumeration() {
777 }
778
clone() const779 virtual StringEnumeration *clone() const {
780 UErrorCode status = U_ZERO_ERROR;
781 return new CompactTrieEnumeration(fInfo, status);
782 }
783
784 virtual const UnicodeString * snext(UErrorCode &status);
785
786 // Very expensive, but this should never be used.
count(UErrorCode & status) const787 virtual int32_t count(UErrorCode &status) const {
788 CompactTrieEnumeration counter(fInfo, status);
789 int32_t result = 0;
790 while (counter.snext(status) != NULL && U_SUCCESS(status)) {
791 ++result;
792 }
793 return result;
794 }
795
reset(UErrorCode & status)796 virtual void reset(UErrorCode &status) {
797 fNodeStack.removeAllElements();
798 fIndexStack.removeAllElements();
799 fNodeStack.push(fInfo->root, status);
800 fIndexStack.push(0, status);
801 unistr.remove();
802 }
803 };
804
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration) const805 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration)
806
807 const UnicodeString *
808 CompactTrieEnumeration::snext(UErrorCode &status) {
809 if (fNodeStack.empty() || U_FAILURE(status)) {
810 return NULL;
811 }
812 const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
813 int where = fIndexStack.peeki();
814 while (!fNodeStack.empty() && U_SUCCESS(status)) {
815 int nodeCount;
816
817 bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
818 if(isRoot){
819 nodeCount = node->flagscount & kRootCountMask;
820 } else {
821 nodeCount = getCount(node);
822 }
823
824 UBool goingDown = FALSE;
825 if (nodeCount == 0) {
826 // Terminal node; go up immediately
827 fNodeStack.popi();
828 fIndexStack.popi();
829 node = getCompactNode(fInfo, fNodeStack.peeki());
830 where = fIndexStack.peeki();
831 }
832 else if ((node->flagscount & kVerticalNode) && !isRoot) {
833 // Vertical node
834 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
835 if (where == 0) {
836 // Going down
837 unistr.append((const UChar *)vnode->chars, nodeCount);
838 fIndexStack.setElementAt(1, fIndexStack.size()-1);
839 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status));
840 where = fIndexStack.push(0, status);
841 goingDown = TRUE;
842 }
843 else {
844 // Going up
845 unistr.truncate(unistr.length()-nodeCount);
846 fNodeStack.popi();
847 fIndexStack.popi();
848 node = getCompactNode(fInfo, fNodeStack.peeki());
849 where = fIndexStack.peeki();
850 }
851 }
852 else {
853 // Horizontal node
854 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
855 if (where > 0) {
856 // Pop previous char
857 unistr.truncate(unistr.length()-1);
858 }
859 if (where < nodeCount) {
860 // Push on next node
861 unistr.append((UChar)hnode->entries[where].ch);
862 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
863 node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status));
864 where = fIndexStack.push(0, status);
865 goingDown = TRUE;
866 }
867 else {
868 // Going up
869 fNodeStack.popi();
870 fIndexStack.popi();
871 node = getCompactNode(fInfo, fNodeStack.peeki());
872 where = fIndexStack.peeki();
873 }
874 }
875
876 // Check if the parent of the node we've just gone down to ends a
877 // word. If so, return it.
878 // The root node should never end up here.
879 if (goingDown && (node->flagscount & kParentEndsWord)) {
880 return &unistr;
881 }
882 }
883 return NULL;
884 }
885
886 StringEnumeration *
openWords(UErrorCode & status) const887 CompactTrieDictionary::openWords( UErrorCode &status ) const {
888 if (U_FAILURE(status)) {
889 return NULL;
890 }
891 return new CompactTrieEnumeration(fInfo, status);
892 }
893
894 //
895 // Below here is all code related to converting a ternary trie to a compact trie
896 // and back again
897 //
898
899 enum CompactTrieNodeType {
900 kHorizontalType = 0,
901 kVerticalType = 1,
902 kValueType = 2
903 };
904
905 /**
906 * The following classes (i.e. BuildCompactTrie*Node) are helper classes to
907 * construct the compact trie by storing information for each node and later
908 * writing the node to memory in a sequential format.
909 */
910 class BuildCompactTrieNode: public UMemory {
911 public:
912 UBool fParentEndsWord;
913 CompactTrieNodeType fNodeType;
914 UBool fHasDuplicate;
915 UBool fEqualOverflows;
916 int32_t fNodeID;
917 UnicodeString fChars;
918 uint16_t fValue;
919
920 public:
BuildCompactTrieNode(UBool parentEndsWord,CompactTrieNodeType nodeType,UStack & nodes,UErrorCode & status,uint16_t value=0)921 BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,
922 UStack &nodes, UErrorCode &status, uint16_t value = 0) {
923 fParentEndsWord = parentEndsWord;
924 fHasDuplicate = FALSE;
925 fNodeType = nodeType;
926 fEqualOverflows = FALSE;
927 fNodeID = nodes.size();
928 fValue = parentEndsWord? value : 0;
929 nodes.push(this, status);
930 }
931
~BuildCompactTrieNode()932 virtual ~BuildCompactTrieNode() {
933 }
934
size()935 virtual uint32_t size() {
936 if(fValue > 0)
937 return sizeof(uint16_t) * 2;
938 else
939 return sizeof(uint16_t);
940 }
941
write(uint8_t * bytes,uint32_t & offset,const UVector32 &)942 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
943 // Write flag/count
944
945 // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
946 // used as a 5th MSB.
947 U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
948
949 *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
950 ((fNodeID == 2)? (fChars.length() & kRootCountMask):
951 (
952 (fChars.length() & kCountMask) |
953 //((fChars.length() << 2) & kExceedsCount) |
954 (fNodeType == kVerticalType ? kVerticalNode : 0) |
955 (fParentEndsWord ? kParentEndsWord : 0 )
956 )
957 );
958 offset += sizeof(uint16_t);
959 }
960
writeValue(uint8_t * bytes,uint32_t & offset)961 virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
962 if(fValue > 0){
963 *((uint16_t *)(bytes+offset)) = fValue;
964 offset += sizeof(uint16_t);
965 }
966 }
967
968 };
969
970 /**
971 * Stores value of parent terminating nodes that have no more subtries.
972 */
973 class BuildCompactTrieValueNode: public BuildCompactTrieNode {
974 public:
BuildCompactTrieValueNode(UStack & nodes,UErrorCode & status,uint16_t value)975 BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)
976 : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
977 }
978
~BuildCompactTrieValueNode()979 virtual ~BuildCompactTrieValueNode(){
980 }
981
size()982 virtual uint32_t size() {
983 return sizeof(uint16_t) * 2;
984 }
985
write(uint8_t * bytes,uint32_t & offset,const UVector32 & translate)986 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
987 // don't write value directly to memory but store it in offset to be written later
988 //offset = fValue & kOffsetContainsValue;
989 BuildCompactTrieNode::write(bytes, offset, translate);
990 BuildCompactTrieNode::writeValue(bytes, offset);
991 }
992 };
993
994 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
995 public:
996 UStack fLinks;
997 UBool fMayOverflow; //intermediate value for fEqualOverflows
998
999 public:
BuildCompactTrieHorizontalNode(UBool parentEndsWord,UStack & nodes,UErrorCode & status,uint16_t value=0)1000 BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
1001 : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) {
1002 fMayOverflow = FALSE;
1003 }
1004
~BuildCompactTrieHorizontalNode()1005 virtual ~BuildCompactTrieHorizontalNode() {
1006 }
1007
1008 // It is impossible to know beforehand exactly how much space the node will
1009 // need in memory before being written, because the node IDs in the equal
1010 // links may or may not overflow after node coalescing. Therefore, this method
1011 // returns the maximum size possible for the node.
size()1012 virtual uint32_t size() {
1013 uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
1014 (fChars.length()*sizeof(CompactTrieHorizontalEntry));
1015
1016 if(fValue > 0)
1017 estimatedSize += sizeof(uint16_t);
1018
1019 //estimate extra space needed to store overflow for node ID links
1020 //may be more than what is actually needed
1021 for(int i=0; i < fChars.length(); i++){
1022 if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
1023 fMayOverflow = TRUE;
1024 break;
1025 }
1026 }
1027 if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t)
1028 estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4;
1029
1030 return estimatedSize;
1031 }
1032
write(uint8_t * bytes,uint32_t & offset,const UVector32 & translate)1033 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
1034 int32_t count = fChars.length();
1035
1036 //if largest nodeID > 2^16, set flag
1037 //large node IDs are more likely to be at the back of the array
1038 for (int32_t i = count-1; i >= 0; --i) {
1039 if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){
1040 fEqualOverflows = TRUE;
1041 break;
1042 }
1043 }
1044
1045 BuildCompactTrieNode::write(bytes, offset, translate);
1046
1047 // write entries[] to memory
1048 for (int32_t i = 0; i < count; ++i) {
1049 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
1050 entry->ch = fChars[i];
1051 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1052 #ifdef DEBUG_TRIE_DICT
1053
1054 if ((entry->equal == 0) && !fEqualOverflows) {
1055 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
1056 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1057 }
1058 #endif
1059 offset += sizeof(CompactTrieHorizontalEntry);
1060 }
1061
1062 // append extra bits of equal nodes to end if fEqualOverflows
1063 if (fEqualOverflows) {
1064 uint16_t leftmostBits = 0;
1065 for (int16_t i = 0; i < count; i++) {
1066 leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
1067
1068 // write filled uint16_t to memory
1069 if(i % 4 == 3){
1070 *((uint16_t *)(bytes+offset)) = leftmostBits;
1071 leftmostBits = 0;
1072 offset += sizeof(uint16_t);
1073 }
1074 }
1075
1076 // pad last uint16_t with zeroes if necessary
1077 int remainder = count % 4;
1078 if (remainder > 0) {
1079 *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder));
1080 offset += sizeof(uint16_t);
1081 }
1082 }
1083
1084 BuildCompactTrieNode::writeValue(bytes, offset);
1085 }
1086
1087 // returns leftmost bits of physical node link
getLeftmostBits(const UVector32 & translate,uint32_t i)1088 uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
1089 uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16);
1090 #ifdef DEBUG_TRIE_DICT
1091 if (leftmostBits > 0xF) {
1092 fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
1093 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1094 }
1095 #endif
1096 return leftmostBits;
1097 }
1098
addNode(UChar ch,BuildCompactTrieNode * link,UErrorCode & status)1099 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
1100 fChars.append(ch);
1101 fLinks.push(link, status);
1102 }
1103
1104 };
1105
1106 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
1107 public:
1108 BuildCompactTrieNode *fEqual;
1109
1110 public:
BuildCompactTrieVerticalNode(UBool parentEndsWord,UStack & nodes,UErrorCode & status,uint16_t value=0)1111 BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
1112 : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
1113 fEqual = NULL;
1114 }
1115
~BuildCompactTrieVerticalNode()1116 virtual ~BuildCompactTrieVerticalNode() {
1117 }
1118
1119 // Returns the maximum possible size of this node. See comment in
1120 // BuildCompactTrieHorizontal node for more information.
size()1121 virtual uint32_t size() {
1122 uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
1123 if(fValue > 0){
1124 estimatedSize += sizeof(uint16_t);
1125 }
1126
1127 if(fEqual->fNodeID > 0xFFFF){
1128 estimatedSize += sizeof(uint16_t);
1129 }
1130 return estimatedSize;
1131 }
1132
write(uint8_t * bytes,uint32_t & offset,const UVector32 & translate)1133 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
1134 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
1135 fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
1136 BuildCompactTrieNode::write(bytes, offset, translate);
1137 node->equal = translate.elementAti(fEqual->fNodeID);
1138 offset += sizeof(node->equal);
1139 #ifdef DEBUG_TRIE_DICT
1140 if ((node->equal == 0) && !fEqualOverflows) {
1141 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
1142 fEqual->fNodeID);
1143 }
1144 #endif
1145 fChars.extract(0, fChars.length(), (UChar *)node->chars);
1146 offset += sizeof(UChar)*fChars.length();
1147
1148 // append 16 bits of to end for equal node if fEqualOverflows
1149 if (fEqualOverflows) {
1150 *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16);
1151 offset += sizeof(uint16_t);
1152 }
1153
1154 BuildCompactTrieNode::writeValue(bytes, offset);
1155 }
1156
addChar(UChar ch)1157 void addChar(UChar ch) {
1158 fChars.append(ch);
1159 }
1160
setLink(BuildCompactTrieNode * node)1161 void setLink(BuildCompactTrieNode *node) {
1162 fEqual = node;
1163 }
1164
1165 };
1166
1167 // Forward declaration
1168 static void walkHorizontal(const TernaryNode *node,
1169 BuildCompactTrieHorizontalNode *building,
1170 UStack &nodes,
1171 UErrorCode &status,
1172 Hashtable *values);
1173
1174 // Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
1175
1176 static BuildCompactTrieNode *
compactOneNode(const TernaryNode * node,UBool parentEndsWord,UStack & nodes,UErrorCode & status,Hashtable * values=NULL,uint16_t parentValue=0)1177 compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,
1178 UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
1179 if (U_FAILURE(status)) {
1180 return NULL;
1181 }
1182 BuildCompactTrieNode *result = NULL;
1183 UBool horizontal = (node->low != NULL || node->high != NULL);
1184 if (horizontal) {
1185 BuildCompactTrieHorizontalNode *hResult;
1186 if(values != NULL){
1187 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
1188 } else {
1189 hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
1190 }
1191
1192 if (hResult == NULL) {
1193 status = U_MEMORY_ALLOCATION_ERROR;
1194 return NULL;
1195 }
1196 if (U_SUCCESS(status)) {
1197 walkHorizontal(node, hResult, nodes, status, values);
1198 result = hResult;
1199 }
1200 }
1201 else {
1202 BuildCompactTrieVerticalNode *vResult;
1203 if(values != NULL){
1204 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue);
1205 } else {
1206 vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
1207 }
1208
1209 if (vResult == NULL) {
1210 status = U_MEMORY_ALLOCATION_ERROR;
1211 return NULL;
1212 }
1213 else if (U_SUCCESS(status)) {
1214 uint16_t value = 0;
1215 UBool endsWord = FALSE;
1216 // Take up nodes until we end a word, or hit a node with < or > links
1217 do {
1218 vResult->addChar(node->ch);
1219 value = node->flags;
1220 endsWord = value > 0;
1221 node = node->equal;
1222 }
1223 while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
1224
1225 if (node == NULL) {
1226 if (!endsWord) {
1227 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie
1228 }
1229 else if(values != NULL){
1230 UnicodeString key(value); //store value as a single-char UnicodeString
1231 BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);
1232 if(link == NULL){
1233 link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes?
1234 values->put(key, link, status);
1235 }
1236 vResult->setLink(link);
1237 } else {
1238 vResult->setLink((BuildCompactTrieNode *)nodes[1]);
1239 }
1240 }
1241 else {
1242 vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
1243 }
1244 result = vResult;
1245 }
1246 }
1247 return result;
1248 }
1249
1250 // Walk the set of peers at the same level, to build a horizontal node.
1251 // Uses recursion.
1252
walkHorizontal(const TernaryNode * node,BuildCompactTrieHorizontalNode * building,UStack & nodes,UErrorCode & status,Hashtable * values=NULL)1253 static void walkHorizontal(const TernaryNode *node,
1254 BuildCompactTrieHorizontalNode *building,
1255 UStack &nodes,
1256 UErrorCode &status, Hashtable *values = NULL) {
1257 while (U_SUCCESS(status) && node != NULL) {
1258 if (node->low != NULL) {
1259 walkHorizontal(node->low, building, nodes, status, values);
1260 }
1261 BuildCompactTrieNode *link = NULL;
1262 if (node->equal != NULL) {
1263 link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
1264 }
1265 else if (node->flags > 0) {
1266 if(values != NULL) {
1267 UnicodeString key(node->flags); //store value as a single-char UnicodeString
1268 link = (BuildCompactTrieValueNode *) values->get(key);
1269 if(link == NULL) {
1270 link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes?
1271 values->put(key, link, status);
1272 }
1273 } else {
1274 link = (BuildCompactTrieNode *)nodes[1];
1275 }
1276 }
1277 if (U_SUCCESS(status) && link != NULL) {
1278 building->addNode(node->ch, link, status);
1279 }
1280 // Tail recurse manually instead of leaving it to the compiler.
1281 //if (node->high != NULL) {
1282 // walkHorizontal(node->high, building, nodes, status);
1283 //}
1284 node = node->high;
1285 }
1286 }
1287
1288 U_NAMESPACE_END
1289 U_NAMESPACE_USE
1290 U_CDECL_BEGIN
1291 static int32_t U_CALLCONV
_sortBuildNodes(const void *,const void * voidl,const void * voidr)1292 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
1293 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
1294 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
1295
1296 // Check for comparing a node to itself, to avoid spurious duplicates
1297 if (left == right) {
1298 return 0;
1299 }
1300
1301 // Most significant is type of node. Can never coalesce.
1302 if (left->fNodeType != right->fNodeType) {
1303 return left->fNodeType - right->fNodeType;
1304 }
1305 // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
1306 if (left->fParentEndsWord != right->fParentEndsWord) {
1307 return left->fParentEndsWord - right->fParentEndsWord;
1308 }
1309 // Next, the string. If that differs, we can never coalesce.
1310 int32_t result = left->fChars.compare(right->fChars);
1311 if (result != 0) {
1312 return result;
1313 }
1314
1315 // If the node value differs, we should not coalesce.
1316 // If values aren't stored, all fValues should be 0.
1317 if (left->fValue != right->fValue) {
1318 return left->fValue - right->fValue;
1319 }
1320
1321 // We know they're both the same node type, so branch for the two cases.
1322 if (left->fNodeType == kVerticalType) {
1323 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
1324 - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
1325 }
1326 else if(left->fChars.length() > 0 && right->fChars.length() > 0){
1327 // We need to compare the links vectors. They should be the
1328 // same size because the strings were equal.
1329 // We compare the node IDs instead of the pointers, to handle
1330 // coalesced nodes.
1331 BuildCompactTrieHorizontalNode *hleft, *hright;
1332 hleft = (BuildCompactTrieHorizontalNode *)left;
1333 hright = (BuildCompactTrieHorizontalNode *)right;
1334 int32_t count = hleft->fLinks.size();
1335 for (int32_t i = 0; i < count && result == 0; ++i) {
1336 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
1337 ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1338 }
1339 }
1340
1341 // If they are equal to each other, mark them (speeds coalescing)
1342 if (result == 0) {
1343 left->fHasDuplicate = TRUE;
1344 right->fHasDuplicate = TRUE;
1345 }
1346 return result;
1347 }
1348 U_CDECL_END
1349 U_NAMESPACE_BEGIN
1350
coalesceDuplicates(UStack & nodes,UErrorCode & status)1351 static void coalesceDuplicates(UStack &nodes, UErrorCode &status) {
1352 // We sort the array of nodes to place duplicates next to each other
1353 if (U_FAILURE(status)) {
1354 return;
1355 }
1356 int32_t size = nodes.size();
1357 void **array = (void **)uprv_malloc(sizeof(void *)*size);
1358 if (array == NULL) {
1359 status = U_MEMORY_ALLOCATION_ERROR;
1360 return;
1361 }
1362 (void) nodes.toArray(array);
1363
1364 // Now repeatedly identify duplicates until there are no more
1365 int32_t dupes = 0;
1366 long passCount = 0;
1367 #ifdef DEBUG_TRIE_DICT
1368 long totalDupes = 0;
1369 #endif
1370 do {
1371 BuildCompactTrieNode *node;
1372 BuildCompactTrieNode *first = NULL;
1373 BuildCompactTrieNode **p;
1374 BuildCompactTrieNode **pFirst = NULL;
1375 int32_t counter = size - 2;
1376 // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
1377 // pass for speed. For the second and subsequent passes, we use stable
1378 // (insertion) sort for two reasons:
1379 // 1. The array is already mostly ordered, so we get better performance.
1380 // 2. The way we find one and only one instance of a set of duplicates is to
1381 // check that the node ID equals the array index. If we used an unstable
1382 // sort for the second or later passes, it's possible that none of the
1383 // duplicates would wind up with a node ID equal to its array index.
1384 // The sort stability guarantees that, because as we coalesce more and
1385 // more groups, the first element of the resultant group will be one of
1386 // the first elements of the groups being coalesced.
1387 // To use quicksort for the second and subsequent passes, we would have to
1388 // find the minimum of the node numbers in a group, and set all the nodes
1389 // in the group to that node number.
1390 uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status);
1391 dupes = 0;
1392 for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) {
1393 node = *p;
1394 if (node->fHasDuplicate) {
1395 if (first == NULL) {
1396 first = node;
1397 pFirst = p;
1398 }
1399 else if (_sortBuildNodes(NULL, pFirst, p) != 0) {
1400 // Starting a new run of dupes
1401 first = node;
1402 pFirst = p;
1403 }
1404 else if (node->fNodeID != first->fNodeID) {
1405 // Slave one to the other, note duplicate
1406 node->fNodeID = first->fNodeID;
1407 dupes += 1;
1408 }
1409 }
1410 else {
1411 // This node has no dupes
1412 first = NULL;
1413 pFirst = NULL;
1414 }
1415 }
1416 passCount += 1;
1417 #ifdef DEBUG_TRIE_DICT
1418 totalDupes += dupes;
1419 fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes);
1420 #endif
1421 }
1422 while (dupes > 0);
1423 #ifdef DEBUG_TRIE_DICT
1424 fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount);
1425 #endif
1426
1427 // We no longer need the temporary array, as the nodes have all been marked appropriately.
1428 uprv_free(array);
1429 }
1430
1431 U_NAMESPACE_END
1432 U_CDECL_BEGIN
_deleteBuildNode(void * obj)1433 static void U_CALLCONV _deleteBuildNode(void *obj) {
1434 delete (BuildCompactTrieNode *) obj;
1435 }
1436 U_CDECL_END
1437 U_NAMESPACE_BEGIN
1438
1439 CompactTrieHeader *
compactMutableTrieDictionary(const MutableTrieDictionary & dict,UErrorCode & status)1440 CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict,
1441 UErrorCode &status ) {
1442 if (U_FAILURE(status)) {
1443 return NULL;
1444 }
1445 #ifdef DEBUG_TRIE_DICT
1446 struct tms timing;
1447 struct tms previous;
1448 (void) ::times(&previous);
1449 #endif
1450 UStack nodes(_deleteBuildNode, NULL, status); // Index of nodes
1451
1452 // Add node 0, used as the NULL pointer/sentinel.
1453 nodes.addElement((int32_t)0, status);
1454
1455 Hashtable *values = NULL; // Index of (unique) values
1456 if (dict.fValued) {
1457 values = new Hashtable(status);
1458 }
1459
1460 // Start by creating the special empty node we use to indicate that the parent
1461 // terminates a word. This must be node 1, because the builder assumes
1462 // that. This node will never be used for tries storing numerical values.
1463 if (U_FAILURE(status)) {
1464 return NULL;
1465 }
1466 BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status);
1467 if (terminal == NULL) {
1468 status = U_MEMORY_ALLOCATION_ERROR;
1469 }
1470
1471 // This call does all the work of building the new trie structure. The root
1472 // will have node ID 2 before writing to memory.
1473 BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values);
1474 #ifdef DEBUG_TRIE_DICT
1475 (void) ::times(&timing);
1476 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1477 nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1478 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1479 previous = timing;
1480 #endif
1481
1482 // Now coalesce all duplicate nodes.
1483 coalesceDuplicates(nodes, status);
1484 #ifdef DEBUG_TRIE_DICT
1485 (void) ::times(&timing);
1486 fprintf(stderr, "Duplicates coalesced, time user %f system %f\n",
1487 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1488 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1489 previous = timing;
1490 #endif
1491
1492 // Next, build the output trie.
1493 // First we compute all the sizes and build the node ID translation table.
1494 uint32_t totalSize = offsetof(CompactTrieHeader,offsets);
1495 int32_t count = nodes.size();
1496 int32_t nodeCount = 1; // The sentinel node we already have
1497 BuildCompactTrieNode *node;
1498 int32_t i;
1499 UVector32 translate(count, status); // Should be no growth needed after this
1500 translate.push(0, status); // The sentinel node
1501
1502 if (U_FAILURE(status)) {
1503 return NULL;
1504 }
1505
1506 //map terminal value nodes
1507 int valueCount = 0;
1508 UVector valueNodes(status);
1509 if(values != NULL) {
1510 valueCount = values->count(); //number of unique terminal value nodes
1511 }
1512
1513 // map non-terminal nodes
1514 int valuePos = 1;//, nodePos = valueCount + valuePos;
1515 nodeCount = valueCount + valuePos;
1516 for (i = 1; i < count; ++i) {
1517 node = (BuildCompactTrieNode *)nodes[i];
1518 if (node->fNodeID == i) {
1519 // Only one node out of each duplicate set is used
1520 if (node->fNodeID >= translate.size()) {
1521 // Logically extend the mapping table
1522 translate.setSize(i + 1);
1523 }
1524 //translate.setElementAt(object, index)!
1525 if(node->fNodeType == kValueType) {
1526 valueNodes.addElement(node, status);
1527 translate.setElementAt(valuePos++, i);
1528 } else {
1529 translate.setElementAt(nodeCount++, i);
1530 }
1531 totalSize += node->size();
1532 }
1533 }
1534
1535 // Check for overflowing 20 bits worth of nodes.
1536 if (nodeCount > 0x100000) {
1537 status = U_ILLEGAL_ARGUMENT_ERROR;
1538 return NULL;
1539 }
1540
1541 // Add enough room for the offsets.
1542 totalSize += nodeCount*sizeof(uint32_t);
1543 #ifdef DEBUG_TRIE_DICT
1544 (void) ::times(&timing);
1545 fprintf(stderr, "Sizes/mapping done, time user %f system %f\n",
1546 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1547 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1548 previous = timing;
1549 fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize);
1550 #endif
1551 uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize);
1552 if (bytes == NULL) {
1553 status = U_MEMORY_ALLOCATION_ERROR;
1554 return NULL;
1555 }
1556
1557 CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1558 //header->size = totalSize;
1559 if(dict.fValued){
1560 header->magic = COMPACT_TRIE_MAGIC_3;
1561 } else {
1562 header->magic = COMPACT_TRIE_MAGIC_2;
1563 }
1564 header->nodeCount = nodeCount;
1565 header->offsets[0] = 0; // Sentinel
1566 header->root = translate.elementAti(root->fNodeID);
1567 #ifdef DEBUG_TRIE_DICT
1568 if (header->root == 0) {
1569 fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID);
1570 }
1571 #endif
1572 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
1573 nodeCount = valueCount + 1;
1574
1575 // Write terminal value nodes to memory
1576 for (i=0; i < valueNodes.size(); i++) {
1577 //header->offsets[i + 1] = offset;
1578 uint32_t tmpOffset = 0;
1579 node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
1580 //header->offsets[i + 1] = (uint32_t)node->fValue;
1581 node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
1582 }
1583
1584 // Now write the data
1585 for (i = 1; i < count; ++i) {
1586 node = (BuildCompactTrieNode *)nodes[i];
1587 if (node->fNodeID == i && node->fNodeType != kValueType) {
1588 header->offsets[nodeCount++] = offset;
1589 node->write(bytes, offset, translate);
1590 }
1591 }
1592
1593 //free all extra space
1594 uprv_realloc(bytes, offset);
1595 header->size = offset;
1596
1597 #ifdef DEBUG_TRIE_DICT
1598 fprintf(stdout, "Space freed: %d\n", totalSize-offset);
1599
1600 (void) ::times(&timing);
1601 fprintf(stderr, "Trie built, time user %f system %f\n",
1602 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1603 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1604 previous = timing;
1605 fprintf(stderr, "Final offset is %d\n", offset);
1606
1607 // Collect statistics on node types and sizes
1608 int hCount = 0;
1609 int vCount = 0;
1610 size_t hSize = 0;
1611 size_t vSize = 0;
1612 size_t hItemCount = 0;
1613 size_t vItemCount = 0;
1614 uint32_t previousOff = offset;
1615 uint32_t numOverflow = 0;
1616 uint32_t valueSpace = 0;
1617 for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1618 const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1619 int itemCount;
1620 if(nodeIdx == header->root)
1621 itemCount = node->flagscount & kRootCountMask;
1622 else
1623 itemCount = getCount(node);
1624 if(node->flagscount & kEqualOverflows){
1625 numOverflow++;
1626 }
1627 if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
1628 vCount += 1;
1629 vItemCount += itemCount;
1630 vSize += previousOff-header->offsets[nodeIdx];
1631 }
1632 else {
1633 hCount += 1;
1634 hItemCount += itemCount;
1635 if(nodeIdx >= header->root) {
1636 hSize += previousOff-header->offsets[nodeIdx];
1637 }
1638 }
1639
1640 if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord)
1641 valueSpace += sizeof(uint16_t);
1642 previousOff = header->offsets[nodeIdx];
1643 }
1644 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
1645 (double)hSize/hCount, (double)hItemCount/hCount);
1646 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
1647 (double)vSize/vCount, (double)vItemCount/vCount);
1648 fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow);
1649 fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
1650 #endif
1651
1652 if (U_FAILURE(status)) {
1653 uprv_free(bytes);
1654 header = NULL;
1655 }
1656 return header;
1657 }
1658
1659 // Forward declaration
1660 static TernaryNode *
1661 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status );
1662
1663 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1664 static TernaryNode *
unpackHorizontalArray(const CompactTrieInfo * info,const CompactTrieHorizontalNode * hnode,int low,int high,int nodeCount,UErrorCode & status)1665 unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode,
1666 int low, int high, int nodeCount, UErrorCode &status) {
1667 if (U_FAILURE(status) || low > high) {
1668 return NULL;
1669 }
1670 int middle = (low+high)/2;
1671 TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
1672 if (result == NULL) {
1673 status = U_MEMORY_ALLOCATION_ERROR;
1674 return NULL;
1675 }
1676 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount));
1677 if (equal->flagscount & kParentEndsWord) {
1678 if(info->magic == COMPACT_TRIE_MAGIC_3){
1679 result->flags = getValue(equal);
1680 }else{
1681 result->flags |= kEndsWord;
1682 }
1683 }
1684 result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);
1685 result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);
1686 result->equal = unpackOneNode(info, equal, status);
1687 return result;
1688 }
1689
1690 // Convert one compact trie node into a ternary subtrie
1691 static TernaryNode *
unpackOneNode(const CompactTrieInfo * info,const CompactTrieNode * node,UErrorCode & status)1692 unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) {
1693 int nodeCount = getCount(node);
1694 if (nodeCount == 0 || U_FAILURE(status)) {
1695 // Failure, or terminal node
1696 return NULL;
1697 }
1698 if (node->flagscount & kVerticalNode) {
1699 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
1700 TernaryNode *head = NULL;
1701 TernaryNode *previous = NULL;
1702 TernaryNode *latest = NULL;
1703 for (int i = 0; i < nodeCount; ++i) {
1704 latest = new TernaryNode(vnode->chars[i]);
1705 if (latest == NULL) {
1706 status = U_MEMORY_ALLOCATION_ERROR;
1707 break;
1708 }
1709 if (head == NULL) {
1710 head = latest;
1711 }
1712 if (previous != NULL) {
1713 previous->equal = latest;
1714 }
1715 previous = latest;
1716 }
1717 if (latest != NULL) {
1718 const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode));
1719 if (equal->flagscount & kParentEndsWord) {
1720 if(info->magic == COMPACT_TRIE_MAGIC_3){
1721 latest->flags = getValue(equal);
1722 } else {
1723 latest->flags |= kEndsWord;
1724 }
1725 }
1726 latest->equal = unpackOneNode(info, equal, status);
1727 }
1728 return head;
1729 }
1730 else {
1731 // Horizontal node
1732 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
1733 return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status);
1734 }
1735 }
1736
1737 // returns a MutableTrieDictionary generated from the CompactTrieDictionary
1738 MutableTrieDictionary *
cloneMutable(UErrorCode & status) const1739 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
1740 MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 );
1741 if (result == NULL) {
1742 status = U_MEMORY_ALLOCATION_ERROR;
1743 return NULL;
1744 }
1745 // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly
1746 // because only kEqualOverflows flag should be checked in root's flagscount
1747 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
1748 getCompactNode(fInfo, fInfo->root);
1749 uint16_t nodeCount = hnode->flagscount & kRootCountMask;
1750 TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
1751 nodeCount, status);
1752
1753 if (U_FAILURE(status)) {
1754 delete root; // Clean up
1755 delete result;
1756 return NULL;
1757 }
1758 result->fTrie = root;
1759 return result;
1760 }
1761
1762 U_NAMESPACE_END
1763
1764 U_CAPI int32_t U_EXPORT2
triedict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)1765 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
1766 UErrorCode *status) {
1767
1768 if (status == NULL || U_FAILURE(*status)) {
1769 return 0;
1770 }
1771 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
1772 *status=U_ILLEGAL_ARGUMENT_ERROR;
1773 return 0;
1774 }
1775
1776 //
1777 // Check that the data header is for for dictionary data.
1778 // (Header contents are defined in genxxx.cpp)
1779 //
1780 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
1781 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */
1782 pInfo->dataFormat[1]==0x72 &&
1783 pInfo->dataFormat[2]==0x44 &&
1784 pInfo->dataFormat[3]==0x63 &&
1785 pInfo->formatVersion[0]==1 )) {
1786 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
1787 pInfo->dataFormat[0], pInfo->dataFormat[1],
1788 pInfo->dataFormat[2], pInfo->dataFormat[3],
1789 pInfo->formatVersion[0]);
1790 *status=U_UNSUPPORTED_ERROR;
1791 return 0;
1792 }
1793
1794 //
1795 // Swap the data header. (This is the generic ICU Data Header, not the
1796 // CompactTrieHeader). This swap also conveniently gets us
1797 // the size of the ICU d.h., which lets us locate the start
1798 // of the RBBI specific data.
1799 //
1800 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
1801
1802 //
1803 // Get the CompactTrieHeader, and check that it appears to be OK.
1804 //
1805 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
1806 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
1807 uint32_t magic = ds->readUInt32(header->magic);
1808 if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
1809 || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)
1810 || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
1811 {
1812 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
1813 *status=U_UNSUPPORTED_ERROR;
1814 return 0;
1815 }
1816
1817 //
1818 // Prefight operation? Just return the size
1819 //
1820 uint32_t totalSize = ds->readUInt32(header->size);
1821 int32_t sizeWithUData = (int32_t)totalSize + headerSize;
1822 if (length < 0) {
1823 return sizeWithUData;
1824 }
1825
1826 //
1827 // Check that length passed in is consistent with length from RBBI data header.
1828 //
1829 if (length < sizeWithUData) {
1830 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1831 totalSize);
1832 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1833 return 0;
1834 }
1835
1836 //
1837 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because
1838 // we need to reference the header to locate the data, and an
1839 // inplace swap of the header leaves it unusable.
1840 //
1841 uint8_t *outBytes = (uint8_t *)outData + headerSize;
1842 CompactTrieHeader *outputHeader = (CompactTrieHeader *)outBytes;
1843
1844 #if 0
1845 //
1846 // If not swapping in place, zero out the output buffer before starting.
1847 //
1848 if (inBytes != outBytes) {
1849 uprv_memset(outBytes, 0, totalSize);
1850 }
1851
1852 // We need to loop through all the nodes in the offset table, and swap each one.
1853 uint32_t nodeCount, rootId;
1854 if(header->magic == COMPACT_TRIE_MAGIC_1) {
1855 nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
1856 rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
1857 } else {
1858 nodeCount = ds->readUInt32(header->nodeCount);
1859 rootId = ds->readUInt32(header->root);
1860 }
1861
1862 // Skip node 0, which should always be 0.
1863 for (uint32_t i = 1; i < nodeCount; ++i) {
1864 uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
1865 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
1866 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
1867 uint16_t flagscount = ds->readUInt16(inNode->flagscount);
1868 uint16_t itemCount = getCount(inNode);
1869 //uint16_t itemCount = flagscount & kCountMask;
1870 ds->writeUInt16(&outNode->flagscount, flagscount);
1871 if (itemCount > 0) {
1872 uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped
1873 if (flagscount & kVerticalNode && i != rootId) {
1874 if(flagscount & kEqualOverflows){
1875 // include overflow bits
1876 overflow += 1;
1877 }
1878 if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) {
1879 //include values
1880 overflow += 1;
1881 }
1882 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
1883 (itemCount + overflow)*sizeof(uint16_t),
1884 outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
1885 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
1886 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
1887 }
1888 else {
1889 const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode;
1890 CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode;
1891 for (int j = 0; j < itemCount; ++j) {
1892 uint16_t word = ds->readUInt16(inHNode->entries[j].ch);
1893 ds->writeUInt16(&outHNode->entries[j].ch, word);
1894 word = ds->readUInt16(inHNode->entries[j].equal);
1895 ds->writeUInt16(&outHNode->entries[j].equal, word);
1896 }
1897
1898 // swap overflow/value information
1899 if(flagscount & kEqualOverflows){
1900 overflow += (itemCount + 3) / 4;
1901 }
1902
1903 if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) {
1904 //include values
1905 overflow += 1;
1906 }
1907
1908 uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount];
1909 uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount];
1910 for(int j = 0; j<overflow; j++){
1911 uint16_t extraInfo = ds->readUInt16(*inOverflow);
1912 ds->writeUInt16(outOverflow, extraInfo);
1913
1914 inOverflow++;
1915 outOverflow++;
1916 }
1917 }
1918 }
1919 }
1920 #endif
1921
1922 // Swap the header
1923 ds->writeUInt32(&outputHeader->size, totalSize);
1924 ds->writeUInt32(&outputHeader->magic, magic);
1925
1926 uint32_t nodeCount;
1927 uint32_t offsetPos;
1928 if (header->magic == COMPACT_TRIE_MAGIC_1) {
1929 CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
1930 CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;
1931
1932 nodeCount = ds->readUInt16(headerV1->nodeCount);
1933 ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
1934 uint16_t root = ds->readUInt16(headerV1->root);
1935 ds->writeUInt16(&outputHeaderV1->root, root);
1936 offsetPos = offsetof(CompactTrieHeaderV1,offsets);
1937 } else {
1938 nodeCount = ds->readUInt32(header->nodeCount);
1939 ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
1940 uint32_t root = ds->readUInt32(header->root);
1941 ds->writeUInt32(&outputHeader->root, root);
1942 offsetPos = offsetof(CompactTrieHeader,offsets);
1943 }
1944
1945 // All the data in all the nodes consist of 16 bit items. Swap them all at once.
1946 uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
1947 ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
1948
1949 //swap offsets
1950 ds->swapArray32(ds, inBytes+offsetPos,
1951 sizeof(uint32_t)*(uint32_t)nodeCount,
1952 outBytes+offsetPos, status);
1953
1954 return sizeWithUData;
1955 }
1956
1957 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1958