1--- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800 2+++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800 3@@ -226,6 +226,30 @@ 4 case USCRIPT_THAI: 5 engine = new ThaiBreakEngine(dict, status); 6 break; 7+ 8+ case USCRIPT_HANGUL: 9+ engine = new CjkBreakEngine(dict, kKorean, status); 10+ break; 11+ 12+ // use same BreakEngine and dictionary for both Chinese and Japanese 13+ case USCRIPT_HIRAGANA: 14+ case USCRIPT_KATAKANA: 15+ case USCRIPT_HAN: 16+ engine = new CjkBreakEngine(dict, kChineseJapanese, status); 17+ break; 18+#if 0 19+ // TODO: Have to get some characters with script=common handled 20+ // by CjkBreakEngine (e.g. U+309B). Simply subjecting 21+ // them to CjkBreakEngine does not work. The engine has to 22+ // special-case them. 23+ case USCRIPT_COMMON: 24+ { 25+ UBlockCode block = ublock_getCode(code); 26+ if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 27+ engine = new CjkBreakEngine(dict, kChineseJapanese, status); 28+ break; 29+ } 30+#endif 31 default: 32 break; 33 } 34@@ -281,6 +305,13 @@ 35 dict = NULL; 36 } 37 return dict; 38+ } else if (dictfname != NULL){ 39+ //create dummy dict if dictionary filename not valid; the builder trie lives on the stack so it is released on return instead of leaking 40+ UChar c = 0x0020; 41+ status = U_ZERO_ERROR; 42+ MutableTrieDictionary mtd(c, status, TRUE); 43+ mtd.addWord(&c, 1, status, 1); 44+ return new CompactTrieDictionary(mtd, status); 45 } 46 return NULL; 47 } 48--- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700 49+++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800 50@@ -16,6 +16,9 @@ 51 #include "unicode/ubrk.h" 52 #include "uvector.h" 53 #include "triedict.h" 54+#include "uassert.h" 55+#include "unicode/normlzr.h" 56+#include "cmemory.h" 57 58 U_NAMESPACE_BEGIN 59 60@@ -422,6 +425,294 @@ 61 return wordsFound; 62 } 63 64+/* 65+ ****************************************************************** 66+ * CjkBreakEngine 67+ */ 68+static const uint32_t kuint32max = 0xFFFFFFFF; 
69+CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status) 70+: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ 71+ if (!adoptDictionary->getValued()) { 72+ status = U_ILLEGAL_ARGUMENT_ERROR; 73+ return; 74+ } 75+ 76+ // Korean dictionary only includes Hangul syllables 77+ fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); 78+ fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 79+ fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); 80+ fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); 81+ 82+ if (U_SUCCESS(status)) { 83+ // handle Korean and Japanese/Chinese using different dictionaries 84+ if (type == kKorean) { 85+ setCharacters(fHangulWordSet); 86+ } else { //Chinese and Japanese 87+ UnicodeSet cjSet; 88+ cjSet.addAll(fHanWordSet); 89+ cjSet.addAll(fKatakanaWordSet); 90+ cjSet.addAll(fHiraganaWordSet); 91+ cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); 92+ setCharacters(cjSet); 93+ } 94+ } 95+} 96+ 97+CjkBreakEngine::~CjkBreakEngine(){ 98+ delete fDictionary; 99+} 100+ 101+// The katakanaCost values below are based on the length frequencies of all 102+// katakana phrases in the dictionary 103+static const int kMaxKatakanaLength = 8; 104+static const int kMaxKatakanaGroupLength = 20; 105+static const uint32_t maxSnlp = 255; 106+ 107+static inline uint32_t getKatakanaCost(int wordLength){ 108+ //TODO: fill array with actual values from dictionary! 109+ static const uint32_t katakanaCost[kMaxKatakanaLength + 1] 110+ = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; 111+ return (wordLength > kMaxKatakanaLength) ? 
8192 : katakanaCost[wordLength]; 112+} 113+ 114+static inline bool isKatakana(uint16_t value) { 115+ return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) || 116+ (value >= 0xFF66u && value <= 0xFF9fu); 117+} 118+ 119+// A very simple helper class to streamline the buffer handling in 120+// divideUpDictionaryRange. 121+template<class T, size_t N> 122+class AutoBuffer { 123+ public: 124+ AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { 125+ if (size > N) { 126+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 127+ capacity = size; 128+ } 129+ } 130+ ~AutoBuffer() { 131+ if (buffer != stackBuffer) 132+ uprv_free(buffer); 133+ } 134+#if 0 135+ T* operator& () { 136+ return buffer; 137+ } 138+#endif 139+ T* elems() { 140+ return buffer; 141+ } 142+ const T& operator[] (size_t i) const { 143+ return buffer[i]; 144+ } 145+ T& operator[] (size_t i) { 146+ return buffer[i]; 147+ } 148+ 149+ // resize without copy 150+ void resize(size_t size) { 151+ if (size <= capacity) 152+ return; 153+ if (buffer != stackBuffer) 154+ uprv_free(buffer); 155+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); 156+ capacity = size; 157+ } 158+ private: 159+ T stackBuffer[N]; 160+ T* buffer; 161+ AutoBuffer(); 162+ size_t capacity; 163+}; 164+ 165+ 166+/* 167+ * @param text A UText representing the text 168+ * @param rangeStart The start of the range of dictionary characters 169+ * @param rangeEnd The end of the range of dictionary characters 170+ * @param foundBreaks Output of C array of int32_t break positions, or 0 171+ * @return The number of breaks found 172+ */ 173+int32_t 174+CjkBreakEngine::divideUpDictionaryRange( UText *text, 175+ int32_t rangeStart, 176+ int32_t rangeEnd, 177+ UStack &foundBreaks ) const { 178+ if (rangeStart >= rangeEnd) { 179+ return 0; 180+ } 181+ 182+ const size_t defaultInputLength = 80; 183+ size_t inputLength = rangeEnd - rangeStart; 184+ AutoBuffer<UChar, defaultInputLength> charString(inputLength); 185+ 186+ // 
Normalize the input string and put it in normalizedText. 187+ // The map from the indices of the normalized input to the raw 188+ // input is kept in charPositions. 189+ UErrorCode status = U_ZERO_ERROR; 190+ utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); 191+ if (U_FAILURE(status)) 192+ return 0; 193+ 194+ UnicodeString inputString(charString.elems(), inputLength); 195+ UNormalizationMode norm_mode = UNORM_NFKC; 196+ UBool isNormalized = 197+ Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || 198+ Normalizer::isNormalized(inputString, norm_mode, status); 199+ 200+ AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); 201+ int numChars = 0; 202+ UText normalizedText = UTEXT_INITIALIZER; 203+ // Needs to be declared here because normalizedText holds onto its buffer. 204+ UnicodeString normalizedString; 205+ if (isNormalized) { 206+ int32_t index = 0; 207+ charPositions[0] = 0; 208+ while(index < inputString.length()) { 209+ index = inputString.moveIndex32(index, 1); 210+ charPositions[++numChars] = index; 211+ } 212+ utext_openUnicodeString(&normalizedText, &inputString, &status); 213+ } 214+ else { 215+ Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status); 216+ if (U_FAILURE(status)) 217+ return 0; 218+ charPositions.resize(normalizedString.length() + 1); 219+ Normalizer normalizer(charString.elems(), inputLength, norm_mode); 220+ int32_t index = 0; 221+ charPositions[0] = 0; 222+ while(index < normalizer.endIndex()){ 223+ normalizer.next(); // returned code point is not needed; only the index reached after each step is recorded 224+ charPositions[++numChars] = index = normalizer.getIndex(); 225+ } 226+ utext_openUnicodeString(&normalizedText, &normalizedString, &status); 227+ } 228+ 229+ if (U_FAILURE(status)) 230+ return 0; 231+ 232+ // From this point on, all the indices refer to the indices of 233+ // the normalized input string. 
234+ 235+ // bestSnlp[i] is the snlp of the best segmentation of the first i 236+ // characters in the range to be matched (kuint32max: none found yet). 237+ AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); 238+ bestSnlp[0] = 0; 239+ for(int i=1; i<=numChars; i++){ 240+ bestSnlp[i] = kuint32max; 241+ } 242+ 243+ // prev[i] is the index of the last CJK character in the previous word in 244+ // the best segmentation of the first i characters (initialised to -1). 245+ AutoBuffer<int, defaultInputLength> prev(numChars + 1); 246+ for(int i=0; i<=numChars; i++){ 247+ prev[i] = -1; 248+ } 249+ 250+ const size_t maxWordSize = 20; 251+ AutoBuffer<uint16_t, maxWordSize> values(numChars); 252+ AutoBuffer<int32_t, maxWordSize> lengths(numChars); 253+ 254+ // Dynamic programming to find the best segmentation. 255+ bool is_prev_katakana = false; 256+ for (int i = 0; i < numChars; ++i) { 257+ //utext_setNativeIndex(text, rangeStart + i); 258+ utext_setNativeIndex(&normalizedText, i); 259+ if (bestSnlp[i] == kuint32max) 260+ continue; 261+ 262+ int count; 263+ // limit maximum word length matched to size of current substring 264+ int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i; 265+ 266+ fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); 267+ 268+ // if there are no single character matches found in the dictionary 269+ // starting with this character, treat character as a 1-character word 270+ // with the highest value possible, i.e. the least likely to occur. 271+ // Exclude Korean characters from this treatment, as they should be left 272+ // together by default. 
273+ if((count == 0 || lengths[0] != 1) && 274+ !fHangulWordSet.contains(utext_current32(&normalizedText))){ 275+ values[count] = maxSnlp; 276+ lengths[count++] = 1; 277+ } 278+ 279+ for (int j = 0; j < count; j++){ 280+ //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp); 281+ uint32_t newSnlp = bestSnlp[i] + values[j]; 282+ if (newSnlp < bestSnlp[lengths[j] + i]) { 283+ bestSnlp[lengths[j] + i] = newSnlp; 284+ prev[lengths[j] + i] = i; 285+ } 286+ } 287+ 288+ // In Japanese, 289+ // Katakana word in single character is pretty rare. So we apply 290+ // the following heuristic to Katakana: any continuous run of Katakana 291+ // characters is considered a candidate word with a default cost 292+ // specified in the katakanaCost table according to its length. 293+ //utext_setNativeIndex(text, rangeStart + i); 294+ utext_setNativeIndex(&normalizedText, i); 295+ bool is_katakana = isKatakana(utext_current32(&normalizedText)); 296+ if (!is_prev_katakana && is_katakana) { 297+ int j = i + 1; 298+ utext_next32(&normalizedText); 299+ // Find the end of the continuous run of Katakana characters 300+ while (j < numChars && (j - i) < kMaxKatakanaGroupLength && 301+ isKatakana(utext_current32(&normalizedText))) { 302+ utext_next32(&normalizedText); 303+ ++j; 304+ } 305+ if ((j - i) < kMaxKatakanaGroupLength) { 306+ uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 307+ if (newSnlp < bestSnlp[j]) { 308+ bestSnlp[j] = newSnlp; 309+ prev[j] = i; 310+ } 311+ } 312+ } 313+ is_prev_katakana = is_katakana; 314+ } 315+ 316+ // Start pushing the optimal offset index into t_boundary (t for tentative). 317+ // prev[numChars] is guaranteed to be meaningful. 318+ // We'll first push in the reverse order, i.e., 319+ // t_boundary[0] = numChars, and afterwards do a swap. 
320+ AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); 321+ 322+ int numBreaks = 0; 323+ // No segmentation found, set boundary to end of range 324+ if (bestSnlp[numChars] == kuint32max) { 325+ t_boundary[numBreaks++] = numChars; 326+ } else { 327+ for (int i = numChars; i > 0; i = prev[i]){ 328+ t_boundary[numBreaks++] = i; 329+ 330+ } 331+ U_ASSERT(prev[t_boundary[numBreaks-1]] == 0); 332+ } 333+ 334+ // Reverse offset index in t_boundary. 335+ // Don't add a break for the start of the dictionary range if there is one 336+ // there already. 337+ if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { 338+ t_boundary[numBreaks++] = 0; 339+ } 340+ 341+ // Now that we're done, convert positions in t_bdry[] (indices in 342+ // the normalized input string) back to indices in the raw input string 343+ // while reversing t_bdry and pushing values to foundBreaks. 344+ for (int i = numBreaks-1; i >= 0; i--) { 345+ foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); 346+ } 347+ 348+ utext_close(&normalizedText); 349+ return numBreaks; 350+} 351+ 352 U_NAMESPACE_END 353 354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 355--- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700 356+++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800 357@@ -1,8 +1,8 @@ 358 /** 359- ******************************************************************************* 360- * Copyright (C) 2006, International Business Machines Corporation and others. * 361- * All Rights Reserved. * 362- ******************************************************************************* 363+ ********************************************************************************** 364+ * Copyright (C) 2006-2010, International Business Machines Corporation and others. 365+ * All Rights Reserved. 
366+ ********************************************************************************** 367 */ 368 369 #ifndef DICTBE_H 370@@ -65,31 +65,31 @@ 371 */ 372 virtual ~DictionaryBreakEngine(); 373 374- /** 375- * <p>Indicate whether this engine handles a particular character for 376- * a particular kind of break.</p> 377- * 378- * @param c A character which begins a run that the engine might handle 379- * @param breakType The type of text break which the caller wants to determine 380- * @return TRUE if this engine handles the particular character and break 381- * type. 382- */ 383+ /** 384+ * <p>Indicate whether this engine handles a particular character for 385+ * a particular kind of break.</p> 386+ * 387+ * @param c A character which begins a run that the engine might handle 388+ * @param breakType The type of text break which the caller wants to determine 389+ * @return TRUE if this engine handles the particular character and break 390+ * type. 391+ */ 392 virtual UBool handles( UChar32 c, int32_t breakType ) const; 393 394- /** 395- * <p>Find any breaks within a run in the supplied text.</p> 396- * 397- * @param text A UText representing the text. The 398- * iterator is left at the end of the run of characters which the engine 399- * is capable of handling. 400- * @param startPos The start of the run within the supplied text. 401- * @param endPos The end of the run within the supplied text. 402- * @param reverse Whether the caller is looking for breaks in a reverse 403- * direction. 404- * @param breakType The type of break desired, or -1. 405- * @param foundBreaks An allocated C array of the breaks found, if any 406- * @return The number of breaks found. 407- */ 408+ /** 409+ * <p>Find any breaks within a run in the supplied text.</p> 410+ * 411+ * @param text A UText representing the text. The iterator is left at 412+ * the end of the run of characters which the engine is capable of handling 413+ * that starts from the first (or last) character in the range. 
414+ * @param startPos The start of the run within the supplied text. 415+ * @param endPos The end of the run within the supplied text. 416+ * @param reverse Whether the caller is looking for breaks in a reverse 417+ * direction. 418+ * @param breakType The type of break desired, or -1. 419+ * @param foundBreaks An allocated C array of the breaks found, if any 420+ * @return The number of breaks found. 421+ */ 422 virtual int32_t findBreaks( UText *text, 423 int32_t startPos, 424 int32_t endPos, 425@@ -114,7 +114,7 @@ 426 // virtual void setBreakTypes( uint32_t breakTypes ); 427 428 /** 429- * <p>Divide up a range of known dictionary characters.</p> 430+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 431 * 432 * @param text A UText representing the text 433 * @param rangeStart The start of the range of dictionary characters 434@@ -171,7 +171,7 @@ 435 436 protected: 437 /** 438- * <p>Divide up a range of known dictionary characters.</p> 439+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 440 * 441 * @param text A UText representing the text 442 * @param rangeStart The start of the range of dictionary characters 443@@ -186,6 +186,66 @@ 444 445 }; 446 447+/******************************************************************* 448+ * CjkBreakEngine 449+ */ 450+ 451+//indicates language/script that the CjkBreakEngine will handle 452+enum LanguageType { 453+ kKorean, 454+ kChineseJapanese 455+}; 456+ 457+/** 458+ * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a 459+ * TrieWordDictionary with costs associated with each word and 460+ * Viterbi decoding to determine CJK-specific breaks.</p> 461+ */ 462+class CjkBreakEngine : public DictionaryBreakEngine { 463+ protected: 464+ /** 465+ * The set of characters handled by this engine 466+ * @internal 467+ */ 468+ UnicodeSet fHangulWordSet; 469+ UnicodeSet fHanWordSet; 470+ UnicodeSet fKatakanaWordSet; 471+ UnicodeSet 
fHiraganaWordSet; 472+ 473+ const TrieWordDictionary *fDictionary; 474+ 475+ public: 476+ 477+ /** 478+ * <p>Default constructor.</p> 479+ * 480+ * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the 481+ * engine is deleted. The TrieWordDictionary must contain costs for each word 482+ * in order for the dictionary to work properly. 483+ */ 484+ CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status); 485+ 486+ /** 487+ * <p>Virtual destructor.</p> 488+ */ 489+ virtual ~CjkBreakEngine(); 490+ 491+ protected: 492+ /** 493+ * <p>Divide up a range of known dictionary characters handled by this break engine.</p> 494+ * 495+ * @param text A UText representing the text 496+ * @param rangeStart The start of the range of dictionary characters 497+ * @param rangeEnd The end of the range of dictionary characters 498+ * @param foundBreaks Output of C array of int32_t break positions, or 0 499+ * @return The number of breaks found 500+ */ 501+ virtual int32_t divideUpDictionaryRange( UText *text, 502+ int32_t rangeStart, 503+ int32_t rangeEnd, 504+ UStack &foundBreaks ) const; 505+ 506+}; 507 508 U_NAMESPACE_END 509 510--- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700 511+++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800 512@@ -1555,10 +1555,12 @@ 513 int32_t endPos, 514 UBool reverse) { 515 // Reset the old break cache first. 516- uint32_t dictionaryCount = fDictionaryCharCount; 517 reset(); 518 519- if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { 520+ // note: code segment below assumes that dictionary chars are in the 521+ // startPos-endPos range 522+ // value returned should be next character in sequence 523+ if ((endPos - startPos) <= 1) { 524 return (reverse ? startPos : endPos); 525 } 526 527@@ -1711,7 +1713,7 @@ 528 // proposed break by one of the breaks we found. Use following() and 529 // preceding() to do the work. They should never recurse in this case. 
530 if (reverse) { 531- return preceding(endPos - 1); 532+ return preceding(endPos); 533 } 534 else { 535 return following(startPos); 536--- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800 537+++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800 538@@ -20,6 +20,7 @@ 539 #include "uvector.h" 540 #include "uvectr32.h" 541 #include "uarrsort.h" 542+#include "hash.h" 543 544 //#define DEBUG_TRIE_DICT 1 545 546@@ -27,6 +28,11 @@ 547 #include <sys/times.h> 548 #include <limits.h> 549 #include <stdio.h> 550+#include <time.h> 551+#ifndef CLK_TCK 552+#define CLK_TCK CLOCKS_PER_SEC 553+#endif 554+ 555 #endif 556 557 U_NAMESPACE_BEGIN 558@@ -45,6 +51,11 @@ 559 * MutableTrieDictionary 560 */ 561 562+//#define MAX_VALUE 65535 563+ 564+// forward declaration 565+inline uint16_t scaleLogProbabilities(double logprob); 566+ 567 // Node structure for the ternary, uncompressed trie 568 struct TernaryNode : public UMemory { 569 UChar ch; // UTF-16 code unit 570@@ -77,7 +88,8 @@ 571 delete high; 572 } 573 574-MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) { 575+MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, 576+ UBool containsValue /* = FALSE */ ) { 577 // Start the trie off with something. Having the root node already present 578 // cuts a special case out of the search/insertion functions. 
579 // Making it a median character cuts the worse case for searches from 580@@ -91,14 +103,19 @@ 581 if (U_SUCCESS(status) && fIter == NULL) { 582 status = U_MEMORY_ALLOCATION_ERROR; 583 } 584+ 585+ fValued = containsValue; 586 } 587 588-MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { 589+MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, 590+ UBool containsValue /* = false */ ) { 591 fTrie = NULL; 592 fIter = utext_openUChars(NULL, NULL, 0, &status); 593 if (U_SUCCESS(status) && fIter == NULL) { 594 status = U_MEMORY_ALLOCATION_ERROR; 595 } 596+ 597+ fValued = containsValue; 598 } 599 600 MutableTrieDictionary::~MutableTrieDictionary() { 601@@ -108,12 +125,13 @@ 602 603 int32_t 604 MutableTrieDictionary::search( UText *text, 605- int32_t maxLength, 606- int32_t *lengths, 607- int &count, 608- int limit, 609- TernaryNode *&parent, 610- UBool &pMatched ) const { 611+ int32_t maxLength, 612+ int32_t *lengths, 613+ int &count, 614+ int limit, 615+ TernaryNode *&parent, 616+ UBool &pMatched, 617+ uint16_t *values /*=NULL*/) const { 618 // TODO: current implementation works in UTF-16 space 619 const TernaryNode *up = NULL; 620 const TernaryNode *p = fTrie; 621@@ -121,6 +139,10 @@ 622 pMatched = TRUE; 623 int i; 624 625+ if (!fValued) { 626+ values = NULL; 627+ } 628+ 629 UChar uc = utext_current32(text); 630 for (i = 0; i < maxLength && p != NULL; ++i) { 631 while (p != NULL) { 632@@ -141,7 +163,11 @@ 633 break; 634 } 635 // Must be equal to get here 636- if (limit > 0 && (p->flags & kEndsWord)) { 637+ if (limit > 0 && (p->flags > 0)) { 638+ //is there a more efficient way to add values? ie. 
remove if stmt 639+ if(values != NULL) { 640+ values[mycount] = p->flags; 641+ } 642 lengths[mycount++] = i+1; 643 --limit; 644 } 645@@ -161,13 +187,14 @@ 646 void 647 MutableTrieDictionary::addWord( const UChar *word, 648 int32_t length, 649- UErrorCode &status ) { 650-#if 0 651- if (length <= 0) { 652+ UErrorCode &status, 653+ uint16_t value /* = 0 */ ) { 654+ // dictionary cannot store zero values, would interfere with flags 655+ if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) { 656 status = U_ILLEGAL_ARGUMENT_ERROR; 657 return; 658 } 659-#endif 660+ 661 TernaryNode *parent; 662 UBool pMatched; 663 int count; 664@@ -177,7 +204,7 @@ 665 matched = search(fIter, length, NULL, count, 0, parent, pMatched); 666 667 while (matched++ < length) { 668- UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? 669+ UChar32 uc = utext_next32(fIter); // TODO: supplementary support? 670 U_ASSERT(uc != U_SENTINEL); 671 TernaryNode *newNode = new TernaryNode(uc); 672 if (newNode == NULL) { 673@@ -199,30 +226,23 @@ 674 parent = newNode; 675 } 676 677- parent->flags |= kEndsWord; 678-} 679- 680-#if 0 681-void 682-MutableTrieDictionary::addWords( UEnumeration *words, 683- UErrorCode &status ) { 684- int32_t length; 685- const UChar *word; 686- while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) { 687- addWord(word, length, status); 688+ if(fValued && value > 0){ 689+ parent->flags = value; 690+ } else { 691+ parent->flags |= kEndsWord; 692 } 693 } 694-#endif 695 696 int32_t 697 MutableTrieDictionary::matches( UText *text, 698 int32_t maxLength, 699 int32_t *lengths, 700 int &count, 701- int limit ) const { 702+ int limit, 703+ uint16_t *values /*=NULL*/) const { 704 TernaryNode *parent; 705 UBool pMatched; 706- return search(text, maxLength, lengths, count, limit, parent, pMatched); 707+ return search(text, maxLength, lengths, count, limit, parent, pMatched, values); 708 } 709 710 // Implementation of iteration for 
MutableTrieDictionary 711@@ -277,7 +297,7 @@ 712 break; 713 } 714 case kEqual: 715- emit = (node->flags & kEndsWord) != 0; 716+ emit = node->flags > 0; 717 equal = (node->equal != NULL); 718 // If this node should be part of the next emitted string, append 719 // the UChar to the string, and make sure we pop it when we come 720@@ -299,7 +319,7 @@ 721 } 722 case kGreaterThan: 723 // If this node's character is in the string, remove it. 724- if (node->equal != NULL || (node->flags & kEndsWord)) { 725+ if (node->equal != NULL || node->flags > 0) { 726 unistr.truncate(unistr.length()-1); 727 } 728 if (node->high != NULL) { 729@@ -354,12 +374,75 @@ 730 * CompactTrieDictionary 731 */ 732 733+//TODO further optimization: 734+// minimise size of trie with logprobs by storing values 735+// for terminal nodes directly in offsets[] 736+// --> calculating from next offset *might* be simpler, but would have to add 737+// one last offset for logprob of last node 738+// --> if calculate from current offset, need to factor in possible overflow 739+// as well. 
740+// idea: store in offset, set first bit to indicate logprob storage-->won't 741+// have to access additional node 742+ 743+// {'Dic', 1}, version 1: uses old header, no values 744+#define COMPACT_TRIE_MAGIC_1 0x44696301 745+// version 2: uses new header (more than 2^16 nodes), no values 746+#define COMPACT_TRIE_MAGIC_2 0x44696302 747+// version 3: uses new header, includes values 748+#define COMPACT_TRIE_MAGIC_3 0x44696303 749+ 750 struct CompactTrieHeader { 751 uint32_t size; // Size of the data in bytes 752 uint32_t magic; // Magic number (including version) 753+ uint32_t nodeCount; // Number of entries in offsets[] 754+ uint32_t root; // Node number of the root node 755+ uint32_t offsets[1]; // Offsets to nodes from start of data 756+}; 757+ 758+// old version of CompactTrieHeader kept for backwards compatibility 759+struct CompactTrieHeaderV1 { 760+ uint32_t size; // Size of the data in bytes 761+ uint32_t magic; // Magic number (including version) 762 uint16_t nodeCount; // Number of entries in offsets[] 763 uint16_t root; // Node number of the root node 764- uint32_t offsets[1]; // Offsets to nodes from start of data 765+ uint32_t offsets[1]; // Offsets to nodes from start of data 766+}; 767+ 768+// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1 769+struct CompactTrieInfo { 770+ uint32_t size; // Size of the data in bytes 771+ uint32_t magic; // Magic number (including version) 772+ uint32_t nodeCount; // Number of entries in offsets[] 773+ uint32_t root; // Node number of the root node 774+ uint32_t *offsets; // Offsets to nodes from start of data 775+ uint8_t *address; // pointer to header bytes in memory 776+ 777+ CompactTrieInfo(const void *data, UErrorCode &status){ 778+ CompactTrieHeader *header = (CompactTrieHeader *) data; 779+ if (header->magic != COMPACT_TRIE_MAGIC_1 && 780+ header->magic != COMPACT_TRIE_MAGIC_2 && 781+ header->magic != COMPACT_TRIE_MAGIC_3) { 782+ status = U_ILLEGAL_ARGUMENT_ERROR; 783+ } else { 784+ size = 
header->size; 785+ magic = header->magic; 786+ 787+ if (header->magic == COMPACT_TRIE_MAGIC_1) { 788+ CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header; 789+ nodeCount = headerV1->nodeCount; 790+ root = headerV1->root; 791+ offsets = &(headerV1->offsets[0]); 792+ address = (uint8_t *)headerV1; 793+ } else { 794+ nodeCount = header->nodeCount; 795+ root = header->root; 796+ offsets = &(header->offsets[0]); 797+ address = (uint8_t *)header; 798+ } 799+ } 800+ } 801+ 802+ ~CompactTrieInfo(){} 803 }; 804 805 // Note that to avoid platform-specific alignment issues, all members of the node 806@@ -375,10 +458,14 @@ 807 enum CompactTrieNodeFlags { 808 kVerticalNode = 0x1000, // This is a vertical node 809 kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word 810- kReservedFlag1 = 0x4000, 811- kReservedFlag2 = 0x8000, 812+ kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1 813+ kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2 814 kCountMask = 0x0FFF, // The count portion of flagscount 815- kFlagMask = 0xF000 // The flags portion of flagscount 816+ kFlagMask = 0xF000, // The flags portion of flagscount 817+ kRootCountMask = 0x7FFF // The count portion of flagscount in the root node 818+ 819+ //offset flags: 820+ //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node 821 }; 822 823 // The two node types are distinguished by the kVerticalNode flag. 
824@@ -402,63 +489,177 @@ 825 uint16_t chars[1]; // Code units 826 }; 827 828-// {'Dic', 1}, version 1 829-#define COMPACT_TRIE_MAGIC_1 0x44696301 830- 831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, 832 UErrorCode &status ) 833 : fUData(dataObj) 834 { 835- fData = (const CompactTrieHeader *) udata_getMemory(dataObj); 836+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); 837+ *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); 838 fOwnData = FALSE; 839- if (fData->magic != COMPACT_TRIE_MAGIC_1) { 840- status = U_ILLEGAL_ARGUMENT_ERROR; 841- fData = NULL; 842- } 843 } 844+ 845 CompactTrieDictionary::CompactTrieDictionary( const void *data, 846 UErrorCode &status ) 847 : fUData(NULL) 848 { 849- fData = (const CompactTrieHeader *) data; 850+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); 851+ *fInfo = CompactTrieInfo(data, status); 852 fOwnData = FALSE; 853- if (fData->magic != COMPACT_TRIE_MAGIC_1) { 854- status = U_ILLEGAL_ARGUMENT_ERROR; 855- fData = NULL; 856- } 857 } 858 859 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, 860 UErrorCode &status ) 861 : fUData(NULL) 862 { 863- fData = compactMutableTrieDictionary(dict, status); 864+ const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status); 865+ fInfo = NULL; // the destructor frees fInfo unconditionally, so it must never be left uninitialized when compaction fails 866+ if (U_SUCCESS(status)) { 867+ fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); 868+ *fInfo = CompactTrieInfo(header, status); 869+ } 870 fOwnData = !U_FAILURE(status); 871 } 872 873 CompactTrieDictionary::~CompactTrieDictionary() { 874 if (fOwnData) { 875- uprv_free((void *)fData); 876+ uprv_free((void *)(fInfo->address)); 877 } 878+ uprv_free((void *)fInfo); 879+ 880 if (fUData) { 881 udata_close(fUData); 882 } 883 } 884 885+UBool CompactTrieDictionary::getValued() const{ 886+ return fInfo->magic == COMPACT_TRIE_MAGIC_3; 887+} 888+ 889 uint32_t 890 CompactTrieDictionary::dataSize() const { 891- return fData->size; 892+ return 
fInfo->size; 893 } 894 895 const void * 896 CompactTrieDictionary::data() const { 897- return fData; 898+ return fInfo->address; 899+} 900+ 901+//This function finds the address of a node for us, given its node ID; IDs below root-1 resolve to the offsets[] slot itself instead of an offset from the start of the data 902+static inline const CompactTrieNode * 903+getCompactNode(const CompactTrieInfo *info, uint32_t node) { 904+ if(node < info->root-1) { 905+ return (const CompactTrieNode *)(&info->offsets[node]); 906+ } else { 907+ return (const CompactTrieNode *)(info->address + info->offsets[node]); 908+ } 909 } 910 911-// This function finds the address of a node for us, given its node ID 912+//this version of getCompactNode is currently only used in compactMutableTrieDictionary() 913 static inline const CompactTrieNode * 914-getCompactNode(const CompactTrieHeader *header, uint16_t node) { 915- return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); 916+getCompactNode(const CompactTrieHeader *header, uint32_t node) { 917+ if(node < header->root-1) { 918+ return (const CompactTrieNode *)(&header->offsets[node]); 919+ } else { 920+ return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); 921+ } 922+} 923+ 924+ 925+/** 926+ * Calculates the number of links in a node (the kCountMask bits of flagscount) 927+ * @param node The specified node 928+ */ 929+static inline const uint16_t 930+getCount(const CompactTrieNode *node){ 931+ return (node->flagscount & kCountMask); 932+ //use the code below if number of links ever exceed 4096 933+ //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2); 934+} 935+ 936+/** 937+ * Calculates the equal link node ID of a vertical node 938+ * @param vnode The vertical node containing the equal link 939+ * When kEqualOverflows is set, the uint16_t stored directly after chars[] 940+ * supplies the bits above the low 16 bits held in vnode->equal 941+ */ 942+static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){ 943+ if(vnode->flagscount & kEqualOverflows){ 944+ // treat overflow bits as an extension of chars[] 945+ uint16_t 
*overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)]; 946+ return vnode->equal + (((uint32_t)*overflow) << 16); 947+ }else{ 948+ return vnode->equal; 949+ } 950+} 951+ 952+/** 953+ * calculates an equal link node ID of a horizontal node 954+ * @hnode The horizontal node containing the equal link 955+ * @param index The index into hnode->entries[] 956+ * @param nodeCount The length of hnode->entries[] 957+ */ 958+static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){ 959+ if(hnode->flagscount & kEqualOverflows){ 960+ //set overflow to point to the uint16_t containing the overflow bits 961+ uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount]; 962+ overflow += index/4; 963+ uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; 964+ return hnode->entries[index].equal + (((uint32_t)extraBits) << 16); 965+ } else { 966+ return hnode->entries[index].equal; 967+ } 968+} 969+ 970+/** 971+ * Returns the value stored in the specified node which is associated with its 972+ * parent node. 973+ * TODO: how to tell that value is stored in node or in offset? check whether 974+ * node ID < fInfo->root! 975+ */ 976+static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){ 977+ uint16_t count = getCount((CompactTrieNode *)hnode); 978+ uint16_t overflowSize = 0; //size of node ID overflow storage in bytes 979+ 980+ if(hnode->flagscount & kEqualOverflows) 981+ overflowSize = (count + 3) / 4 * sizeof(uint16_t); 982+ return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); 983+} 984+ 985+static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){ 986+ // calculate size of total node ID overflow storage in bytes 987+ uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? 
sizeof(uint16_t) : 0; 988+ return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize)); 989+} 990+ 991+static inline uint16_t getValue(const CompactTrieNode *node){ 992+ if(node->flagscount & kVerticalNode) 993+ return getValue((const CompactTrieVerticalNode *)node); 994+ else 995+ return getValue((const CompactTrieHorizontalNode *)node); 996+} 997+ 998+//returns index of match in CompactTrieHorizontalNode.entries[] using binary search 999+inline int16_t 1000+searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, 1001+ UChar uc, uint16_t nodeCount){ 1002+ int low = 0; 1003+ int high = nodeCount-1; 1004+ int middle; 1005+ while (high >= low) { 1006+ middle = (high+low)/2; 1007+ if (uc == entries[middle].ch) { 1008+ return middle; 1009+ } 1010+ else if (uc < entries[middle].ch) { 1011+ high = middle-1; 1012+ } 1013+ else { 1014+ low = middle+1; 1015+ } 1016+ } 1017+ 1018+ return -1; 1019 } 1020 1021 int32_t 1022@@ -466,17 +667,38 @@ 1023 int32_t maxLength, 1024 int32_t *lengths, 1025 int &count, 1026- int limit ) const { 1027+ int limit, 1028+ uint16_t *values /*= NULL*/) const { 1029+ if (fInfo->magic == COMPACT_TRIE_MAGIC_2) 1030+ values = NULL; 1031+ 1032 // TODO: current implementation works in UTF-16 space 1033- const CompactTrieNode *node = getCompactNode(fData, fData->root); 1034+ const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root); 1035 int mycount = 0; 1036 1037 UChar uc = utext_current32(text); 1038 int i = 0; 1039 1040+ // handle root node with only kEqualOverflows flag: assume horizontal node without parent 1041+ if(node != NULL){ 1042+ const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node; 1043+ int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask); 1044+ if(index > -1){ 1045+ node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask)); 1046+ utext_next32(text); 1047+ uc = utext_current32(text); 
1048+ ++i; 1049+ }else{ 1050+ node = NULL; 1051+ } 1052+ } 1053+ 1054 while (node != NULL) { 1055 // Check if the node we just exited ends a word 1056 if (limit > 0 && (node->flagscount & kParentEndsWord)) { 1057+ if(values != NULL){ 1058+ values[mycount] = getValue(node); 1059+ } 1060 lengths[mycount++] = i; 1061 --limit; 1062 } 1063@@ -487,7 +709,7 @@ 1064 break; 1065 } 1066 1067- int nodeCount = (node->flagscount & kCountMask); 1068+ int nodeCount = getCount(node); 1069 if (nodeCount == 0) { 1070 // Special terminal node; return now 1071 break; 1072@@ -507,35 +729,27 @@ 1073 // To get here we must have come through the whole list successfully; 1074 // go on to the next node. Note that a word cannot end in the middle 1075 // of a vertical node. 1076- node = getCompactNode(fData, vnode->equal); 1077+ node = getCompactNode(fInfo, calcEqualLink(vnode)); 1078 } 1079 else { 1080 // Horizontal node; do binary search 1081 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; 1082- int low = 0; 1083- int high = nodeCount-1; 1084- int middle; 1085- node = NULL; // If we don't find a match, we'll fall out of the loop 1086- while (high >= low) { 1087- middle = (high+low)/2; 1088- if (uc == hnode->entries[middle].ch) { 1089- // We hit a match; get the next node and next character 1090- node = getCompactNode(fData, hnode->entries[middle].equal); 1091- utext_next32(text); 1092- uc = utext_current32(text); 1093- ++i; 1094- break; 1095- } 1096- else if (uc < hnode->entries[middle].ch) { 1097- high = middle-1; 1098- } 1099- else { 1100- low = middle+1; 1101- } 1102+ const CompactTrieHorizontalEntry *entries; 1103+ entries = hnode->entries; 1104+ 1105+ int index = searchHorizontalEntries(entries, uc, nodeCount); 1106+ if(index > -1){ // 1107+ // We hit a match; get the next node and next character 1108+ node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount)); 1109+ utext_next32(text); 1110+ uc = utext_current32(text); 1111+ ++i; 1112+ 
}else{ 1113+ node = NULL; // If we don't find a match, we'll fall out of the loop 1114 } 1115 } 1116 } 1117-exit: 1118+ exit: 1119 count = mycount; 1120 return i; 1121 } 1122@@ -545,16 +759,16 @@ 1123 private: 1124 UVector32 fNodeStack; // Stack of nodes to process 1125 UVector32 fIndexStack; // Stack of where in node we are 1126- const CompactTrieHeader *fHeader; // Trie data 1127+ const CompactTrieInfo *fInfo; // Trie data 1128 1129 public: 1130 static UClassID U_EXPORT2 getStaticClassID(void); 1131 virtual UClassID getDynamicClassID(void) const; 1132 public: 1133- CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) 1134+ CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) 1135 : fNodeStack(status), fIndexStack(status) { 1136- fHeader = header; 1137- fNodeStack.push(header->root, status); 1138+ fInfo = info; 1139+ fNodeStack.push(info->root, status); 1140 fIndexStack.push(0, status); 1141 unistr.remove(); 1142 } 1143@@ -564,14 +778,14 @@ 1144 1145 virtual StringEnumeration *clone() const { 1146 UErrorCode status = U_ZERO_ERROR; 1147- return new CompactTrieEnumeration(fHeader, status); 1148+ return new CompactTrieEnumeration(fInfo, status); 1149 } 1150 1151 virtual const UnicodeString * snext(UErrorCode &status); 1152 1153 // Very expensive, but this should never be used. 
1154 virtual int32_t count(UErrorCode &status) const { 1155- CompactTrieEnumeration counter(fHeader, status); 1156+ CompactTrieEnumeration counter(fInfo, status); 1157 int32_t result = 0; 1158 while (counter.snext(status) != NULL && U_SUCCESS(status)) { 1159 ++result; 1160@@ -582,7 +796,7 @@ 1161 virtual void reset(UErrorCode &status) { 1162 fNodeStack.removeAllElements(); 1163 fIndexStack.removeAllElements(); 1164- fNodeStack.push(fHeader->root, status); 1165+ fNodeStack.push(fInfo->root, status); 1166 fIndexStack.push(0, status); 1167 unistr.remove(); 1168 } 1169@@ -595,26 +809,34 @@ 1170 if (fNodeStack.empty() || U_FAILURE(status)) { 1171 return NULL; 1172 } 1173- const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); 1174+ const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); 1175 int where = fIndexStack.peeki(); 1176 while (!fNodeStack.empty() && U_SUCCESS(status)) { 1177- int nodeCount = (node->flagscount & kCountMask); 1178+ int nodeCount; 1179+ 1180+ bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root); 1181+ if(isRoot){ 1182+ nodeCount = node->flagscount & kRootCountMask; 1183+ } else { 1184+ nodeCount = getCount(node); 1185+ } 1186+ 1187 UBool goingDown = FALSE; 1188 if (nodeCount == 0) { 1189 // Terminal node; go up immediately 1190 fNodeStack.popi(); 1191 fIndexStack.popi(); 1192- node = getCompactNode(fHeader, fNodeStack.peeki()); 1193+ node = getCompactNode(fInfo, fNodeStack.peeki()); 1194 where = fIndexStack.peeki(); 1195 } 1196- else if (node->flagscount & kVerticalNode) { 1197+ else if ((node->flagscount & kVerticalNode) && !isRoot) { 1198 // Vertical node 1199 const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; 1200 if (where == 0) { 1201 // Going down 1202- unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); 1203+ unistr.append((const UChar *)vnode->chars, nodeCount); 1204 fIndexStack.setElementAt(1, fIndexStack.size()-1); 1205- node = 
getCompactNode(fHeader, fNodeStack.push(vnode->equal, status)); 1206+ node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status)); 1207 where = fIndexStack.push(0, status); 1208 goingDown = TRUE; 1209 } 1210@@ -623,7 +845,7 @@ 1211 unistr.truncate(unistr.length()-nodeCount); 1212 fNodeStack.popi(); 1213 fIndexStack.popi(); 1214- node = getCompactNode(fHeader, fNodeStack.peeki()); 1215+ node = getCompactNode(fInfo, fNodeStack.peeki()); 1216 where = fIndexStack.peeki(); 1217 } 1218 } 1219@@ -638,7 +860,7 @@ 1220 // Push on next node 1221 unistr.append((UChar)hnode->entries[where].ch); 1222 fIndexStack.setElementAt(where+1, fIndexStack.size()-1); 1223- node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status)); 1224+ node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status)); 1225 where = fIndexStack.push(0, status); 1226 goingDown = TRUE; 1227 } 1228@@ -646,12 +868,14 @@ 1229 // Going up 1230 fNodeStack.popi(); 1231 fIndexStack.popi(); 1232- node = getCompactNode(fHeader, fNodeStack.peeki()); 1233+ node = getCompactNode(fInfo, fNodeStack.peeki()); 1234 where = fIndexStack.peeki(); 1235 } 1236 } 1237+ 1238 // Check if the parent of the node we've just gone down to ends a 1239 // word. If so, return it. 1240+ // The root node should never end up here. 1241 if (goingDown && (node->flagscount & kParentEndsWord)) { 1242 return &unistr; 1243 } 1244@@ -664,7 +888,7 @@ 1245 if (U_FAILURE(status)) { 1246 return NULL; 1247 } 1248- return new CompactTrieEnumeration(fData, status); 1249+ return new CompactTrieEnumeration(fInfo, status); 1250 } 1251 1252 // 1253@@ -672,21 +896,36 @@ 1254 // and back again 1255 // 1256 1257-// Helper classes to construct the compact trie 1258+enum CompactTrieNodeType { 1259+ kHorizontalType = 0, 1260+ kVerticalType = 1, 1261+ kValueType = 2 1262+}; 1263+ 1264+/** 1265+ * The following classes (i.e. 
BuildCompactTrie*Node) are helper classes to 1266+ * construct the compact trie by storing information for each node and later 1267+ * writing the node to memory in a sequential format. 1268+ */ 1269 class BuildCompactTrieNode: public UMemory { 1270- public: 1271+public: 1272 UBool fParentEndsWord; 1273- UBool fVertical; 1274+ CompactTrieNodeType fNodeType; 1275 UBool fHasDuplicate; 1276+ UBool fEqualOverflows; 1277 int32_t fNodeID; 1278 UnicodeString fChars; 1279+ uint16_t fValue; 1280 1281- public: 1282- BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) { 1283+public: 1284+ BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, 1285+ UStack &nodes, UErrorCode &status, uint16_t value = 0) { 1286 fParentEndsWord = parentEndsWord; 1287 fHasDuplicate = FALSE; 1288- fVertical = vertical; 1289+ fNodeType = nodeType; 1290+ fEqualOverflows = FALSE; 1291 fNodeID = nodes.size(); 1292+ fValue = parentEndsWord? value : 0; 1293 nodes.push(this, status); 1294 } 1295 1296@@ -694,87 +933,225 @@ 1297 } 1298 1299 virtual uint32_t size() { 1300- return sizeof(uint16_t); 1301+ if(fValue > 0) 1302+ return sizeof(uint16_t) * 2; 1303+ else 1304+ return sizeof(uint16_t); 1305 } 1306 1307 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) { 1308 // Write flag/count 1309- *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) 1310- | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 ); 1311+ 1312+ // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be 1313+ // used as a 5th MSB. 1314+ U_ASSERT(fChars.length() < 4096 || fNodeID == 2); 1315+ 1316+ *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) | 1317+ ((fNodeID == 2)? (fChars.length() & kRootCountMask): 1318+ ( 1319+ (fChars.length() & kCountMask) | 1320+ //((fChars.length() << 2) & kExceedsCount) | 1321+ (fNodeType == kVerticalType ? 
kVerticalNode : 0) | 1322+ (fParentEndsWord ? kParentEndsWord : 0 ) 1323+ ) 1324+ ); 1325 offset += sizeof(uint16_t); 1326 } 1327+ 1328+ virtual void writeValue(uint8_t *bytes, uint32_t &offset) { 1329+ if(fValue > 0){ 1330+ *((uint16_t *)(bytes+offset)) = fValue; 1331+ offset += sizeof(uint16_t); 1332+ } 1333+ } 1334+ 1335+}; 1336+ 1337+/** 1338+ * Stores value of parent terminating nodes that have no more subtries. 1339+ */ 1340+class BuildCompactTrieValueNode: public BuildCompactTrieNode { 1341+public: 1342+ BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value) 1343+ : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){ 1344+ } 1345+ 1346+ virtual ~BuildCompactTrieValueNode(){ 1347+ } 1348+ 1349+ virtual uint32_t size() { 1350+ return sizeof(uint16_t) * 2; 1351+ } 1352+ 1353+ virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { 1354+ // don't write value directly to memory but store it in offset to be written later 1355+ //offset = fValue & kOffsetContainsValue; 1356+ BuildCompactTrieNode::write(bytes, offset, translate); 1357+ BuildCompactTrieNode::writeValue(bytes, offset); 1358+ } 1359 }; 1360 1361 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { 1362 public: 1363 UStack fLinks; 1364+ UBool fMayOverflow; //intermediate value for fEqualOverflows 1365 1366 public: 1367- BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) 1368- : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) { 1369+ BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) 1370+ : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) { 1371+ fMayOverflow = FALSE; 1372 } 1373 1374 virtual ~BuildCompactTrieHorizontalNode() { 1375 } 1376 1377+ // It is impossible to know beforehand exactly how much space the node will 1378+ // need in memory before being written, 
because the node IDs in the equal 1379+ // links may or may not overflow after node coalescing. Therefore, this method 1380+ // returns the maximum size possible for the node. 1381 virtual uint32_t size() { 1382- return offsetof(CompactTrieHorizontalNode,entries) + 1383- (fChars.length()*sizeof(CompactTrieHorizontalEntry)); 1384+ uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) + 1385+ (fChars.length()*sizeof(CompactTrieHorizontalEntry)); 1386+ 1387+ if(fValue > 0) 1388+ estimatedSize += sizeof(uint16_t); 1389+ 1390+ //estimate extra space needed to store overflow for node ID links 1391+ //may be more than what is actually needed 1392+ for(int i=0; i < fChars.length(); i++){ 1393+ if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){ 1394+ fMayOverflow = TRUE; 1395+ break; 1396+ } 1397+ } 1398+ if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t) 1399+ estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4; 1400+ 1401+ return estimatedSize; 1402 } 1403 1404 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { 1405- BuildCompactTrieNode::write(bytes, offset, translate); 1406 int32_t count = fChars.length(); 1407+ 1408+ //if largest nodeID > 2^16, set flag 1409+ //large node IDs are more likely to be at the back of the array 1410+ for (int32_t i = count-1; i >= 0; --i) { 1411+ if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){ 1412+ fEqualOverflows = TRUE; 1413+ break; 1414+ } 1415+ } 1416+ 1417+ BuildCompactTrieNode::write(bytes, offset, translate); 1418+ 1419+ // write entries[] to memory 1420 for (int32_t i = 0; i < count; ++i) { 1421 CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset); 1422 entry->ch = fChars[i]; 1423 entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID); 1424 #ifdef DEBUG_TRIE_DICT 1425- if (entry->equal == 0) { 1426+ 1427+ if ((entry->equal == 0) 
&& !fEqualOverflows) { 1428 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n", 1429 i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); 1430 } 1431 #endif 1432 offset += sizeof(CompactTrieHorizontalEntry); 1433 } 1434+ 1435+ // append extra bits of equal nodes to end if fEqualOverflows 1436+ if (fEqualOverflows) { 1437+ uint16_t leftmostBits = 0; 1438+ for (int16_t i = 0; i < count; i++) { 1439+ leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i); 1440+ 1441+ // write filled uint16_t to memory 1442+ if(i % 4 == 3){ 1443+ *((uint16_t *)(bytes+offset)) = leftmostBits; 1444+ leftmostBits = 0; 1445+ offset += sizeof(uint16_t); 1446+ } 1447+ } 1448+ 1449+ // pad last uint16_t with zeroes if necessary 1450+ int remainder = count % 4; 1451+ if (remainder > 0) { 1452+ *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder)); 1453+ offset += sizeof(uint16_t); 1454+ } 1455+ } 1456+ 1457+ BuildCompactTrieNode::writeValue(bytes, offset); 1458+ } 1459+ 1460+ // returns leftmost bits of physical node link 1461+ uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){ 1462+ uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16); 1463+#ifdef DEBUG_TRIE_DICT 1464+ if (leftmostBits > 0xF) { 1465+ fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n", 1466+ i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); 1467+ } 1468+#endif 1469+ return leftmostBits; 1470 } 1471 1472 void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { 1473 fChars.append(ch); 1474 fLinks.push(link, status); 1475 } 1476+ 1477 }; 1478 1479 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { 1480- public: 1481+public: 1482 BuildCompactTrieNode *fEqual; 1483 1484- public: 1485- BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) 1486- : BuildCompactTrieNode(parentEndsWord, 
TRUE, nodes, status) { 1487+public: 1488+ BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) 1489+ : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) { 1490 fEqual = NULL; 1491 } 1492 1493 virtual ~BuildCompactTrieVerticalNode() { 1494 } 1495 1496+ // Returns the maximum possible size of this node. See comment in 1497+ // BuildCompactTrieHorizontal node for more information. 1498 virtual uint32_t size() { 1499- return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); 1500+ uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); 1501+ if(fValue > 0){ 1502+ estimatedSize += sizeof(uint16_t); 1503+ } 1504+ 1505+ if(fEqual->fNodeID > 0xFFFF){ 1506+ estimatedSize += sizeof(uint16_t); 1507+ } 1508+ return estimatedSize; 1509 } 1510 1511 virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { 1512 CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset); 1513+ fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF); 1514 BuildCompactTrieNode::write(bytes, offset, translate); 1515 node->equal = translate.elementAti(fEqual->fNodeID); 1516 offset += sizeof(node->equal); 1517 #ifdef DEBUG_TRIE_DICT 1518- if (node->equal == 0) { 1519+ if ((node->equal == 0) && !fEqualOverflows) { 1520 fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n", 1521 fEqual->fNodeID); 1522 } 1523 #endif 1524 fChars.extract(0, fChars.length(), (UChar *)node->chars); 1525- offset += sizeof(uint16_t)*fChars.length(); 1526+ offset += sizeof(UChar)*fChars.length(); 1527+ 1528+ // append 16 bits of to end for equal node if fEqualOverflows 1529+ if (fEqualOverflows) { 1530+ *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16); 1531+ offset += sizeof(uint16_t); 1532+ } 1533+ 1534+ BuildCompactTrieNode::writeValue(bytes, offset); 1535 } 1536 1537 
void addChar(UChar ch) { 1538@@ -784,60 +1161,85 @@ 1539 void setLink(BuildCompactTrieNode *node) { 1540 fEqual = node; 1541 } 1542+ 1543 }; 1544 1545 // Forward declaration 1546 static void walkHorizontal(const TernaryNode *node, 1547 BuildCompactTrieHorizontalNode *building, 1548 UStack &nodes, 1549- UErrorCode &status); 1550+ UErrorCode &status, 1551+ Hashtable *values); 1552 1553-// Convert one node. Uses recursion. 1554+// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion. 1555 1556 static BuildCompactTrieNode * 1557-compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) { 1558+compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, 1559+ UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) { 1560 if (U_FAILURE(status)) { 1561 return NULL; 1562 } 1563 BuildCompactTrieNode *result = NULL; 1564 UBool horizontal = (node->low != NULL || node->high != NULL); 1565 if (horizontal) { 1566- BuildCompactTrieHorizontalNode *hResult = 1567- new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); 1568+ BuildCompactTrieHorizontalNode *hResult; 1569+ if(values != NULL){ 1570+ hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue); 1571+ } else { 1572+ hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); 1573+ } 1574+ 1575 if (hResult == NULL) { 1576 status = U_MEMORY_ALLOCATION_ERROR; 1577 return NULL; 1578 } 1579 if (U_SUCCESS(status)) { 1580- walkHorizontal(node, hResult, nodes, status); 1581+ walkHorizontal(node, hResult, nodes, status, values); 1582 result = hResult; 1583 } 1584 } 1585 else { 1586- BuildCompactTrieVerticalNode *vResult = 1587- new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); 1588+ BuildCompactTrieVerticalNode *vResult; 1589+ if(values != NULL){ 1590+ vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue); 1591+ } else { 1592+ vResult = new 
BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); 1593+ } 1594+ 1595 if (vResult == NULL) { 1596 status = U_MEMORY_ALLOCATION_ERROR; 1597+ return NULL; 1598 } 1599 else if (U_SUCCESS(status)) { 1600- UBool endsWord = FALSE; 1601+ uint16_t value = 0; 1602+ UBool endsWord = FALSE; 1603 // Take up nodes until we end a word, or hit a node with < or > links 1604 do { 1605 vResult->addChar(node->ch); 1606- endsWord = (node->flags & kEndsWord) != 0; 1607+ value = node->flags; 1608+ endsWord = value > 0; 1609 node = node->equal; 1610 } 1611 while(node != NULL && !endsWord && node->low == NULL && node->high == NULL); 1612+ 1613 if (node == NULL) { 1614 if (!endsWord) { 1615 status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie 1616 } 1617- else { 1618+ else if(values != NULL){ 1619+ UnicodeString key(value); //store value as a single-char UnicodeString 1620+ BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key); 1621+ if(link == NULL){ 1622+ link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes? 1623+ values->put(key, link, status); 1624+ } 1625+ vResult->setLink(link); 1626+ } else { 1627 vResult->setLink((BuildCompactTrieNode *)nodes[1]); 1628 } 1629 } 1630 else { 1631- vResult->setLink(compactOneNode(node, endsWord, nodes, status)); 1632+ vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value)); 1633 } 1634 result = vResult; 1635 } 1636@@ -849,19 +1251,28 @@ 1637 // Uses recursion. 
1638 1639 static void walkHorizontal(const TernaryNode *node, 1640- BuildCompactTrieHorizontalNode *building, 1641- UStack &nodes, 1642- UErrorCode &status) { 1643+ BuildCompactTrieHorizontalNode *building, 1644+ UStack &nodes, 1645+ UErrorCode &status, Hashtable *values = NULL) { 1646 while (U_SUCCESS(status) && node != NULL) { 1647 if (node->low != NULL) { 1648- walkHorizontal(node->low, building, nodes, status); 1649+ walkHorizontal(node->low, building, nodes, status, values); 1650 } 1651 BuildCompactTrieNode *link = NULL; 1652 if (node->equal != NULL) { 1653- link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status); 1654+ link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags); 1655 } 1656- else if (node->flags & kEndsWord) { 1657- link = (BuildCompactTrieNode *)nodes[1]; 1658+ else if (node->flags > 0) { 1659+ if(values != NULL) { 1660+ UnicodeString key(node->flags); //store value as a single-char UnicodeString 1661+ link = (BuildCompactTrieValueNode *) values->get(key); 1662+ if(link == NULL) { 1663+ link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes? 1664+ values->put(key, link, status); 1665+ } 1666+ } else { 1667+ link = (BuildCompactTrieNode *)nodes[1]; 1668+ } 1669 } 1670 if (U_SUCCESS(status) && link != NULL) { 1671 building->addNode(node->ch, link, status); 1672@@ -881,13 +1292,15 @@ 1673 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) { 1674 BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; 1675 BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; 1676+ 1677 // Check for comparing a node to itself, to avoid spurious duplicates 1678 if (left == right) { 1679 return 0; 1680 } 1681+ 1682 // Most significant is type of node. Can never coalesce. 
1683- if (left->fVertical != right->fVertical) { 1684- return left->fVertical - right->fVertical; 1685+ if (left->fNodeType != right->fNodeType) { 1686+ return left->fNodeType - right->fNodeType; 1687 } 1688 // Next, the "parent ends word" flag. If that differs, we cannot coalesce. 1689 if (left->fParentEndsWord != right->fParentEndsWord) { 1690@@ -898,12 +1311,19 @@ 1691 if (result != 0) { 1692 return result; 1693 } 1694+ 1695+ // If the node value differs, we should not coalesce. 1696+ // If values aren't stored, all fValues should be 0. 1697+ if (left->fValue != right->fValue) { 1698+ return left->fValue - right->fValue; 1699+ } 1700+ 1701 // We know they're both the same node type, so branch for the two cases. 1702- if (left->fVertical) { 1703+ if (left->fNodeType == kVerticalType) { 1704 result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID 1705- - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; 1706+ - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; 1707 } 1708- else { 1709+ else if(left->fChars.length() > 0 && right->fChars.length() > 0){ 1710 // We need to compare the links vectors. They should be the 1711 // same size because the strings were equal. 1712 // We compare the node IDs instead of the pointers, to handle 1713@@ -914,9 +1334,10 @@ 1714 int32_t count = hleft->fLinks.size(); 1715 for (int32_t i = 0; i < count && result == 0; ++i) { 1716 result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - 1717- ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; 1718+ ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; 1719 } 1720 } 1721+ 1722 // If they are equal to each other, mark them (speeds coalescing) 1723 if (result == 0) { 1724 left->fHasDuplicate = TRUE; 1725@@ -1031,20 +1452,25 @@ 1726 // Add node 0, used as the NULL pointer/sentinel. 
1727 nodes.addElement((int32_t)0, status); 1728 1729+ Hashtable *values = NULL; // Index of (unique) values 1730+ if (dict.fValued) { 1731+ values = new Hashtable(status); 1732+ } 1733+ 1734 // Start by creating the special empty node we use to indicate that the parent 1735 // terminates a word. This must be node 1, because the builder assumes 1736- // that. 1737+ // that. This node will never be used for tries storing numerical values. 1738 if (U_FAILURE(status)) { 1739 return NULL; 1740 } 1741- BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status); 1742+ BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status); 1743 if (terminal == NULL) { 1744 status = U_MEMORY_ALLOCATION_ERROR; 1745 } 1746 1747 // This call does all the work of building the new trie structure. The root 1748- // will be node 2. 1749- BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status); 1750+ // will have node ID 2 before writing to memory. 
1751+ BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values); 1752 #ifdef DEBUG_TRIE_DICT 1753 (void) ::times(&timing); 1754 fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", 1755@@ -1077,21 +1503,37 @@ 1756 return NULL; 1757 } 1758 1759+ //map terminal value nodes 1760+ int valueCount = 0; 1761+ UVector valueNodes(status); 1762+ if(values != NULL) { 1763+ valueCount = values->count(); //number of unique terminal value nodes 1764+ } 1765+ 1766+ // map non-terminal nodes 1767+ int valuePos = 1;//, nodePos = valueCount + valuePos; 1768+ nodeCount = valueCount + valuePos; 1769 for (i = 1; i < count; ++i) { 1770 node = (BuildCompactTrieNode *)nodes[i]; 1771 if (node->fNodeID == i) { 1772 // Only one node out of each duplicate set is used 1773- if (i >= translate.size()) { 1774+ if (node->fNodeID >= translate.size()) { 1775 // Logically extend the mapping table 1776- translate.setSize(i+1); 1777+ translate.setSize(i + 1); 1778+ } 1779+ //translate.setElementAt(object, index)! 1780+ if(node->fNodeType == kValueType) { 1781+ valueNodes.addElement(node, status); 1782+ translate.setElementAt(valuePos++, i); 1783+ } else { 1784+ translate.setElementAt(nodeCount++, i); 1785 } 1786- translate.setElementAt(nodeCount++, i); 1787 totalSize += node->size(); 1788 } 1789 } 1790- 1791- // Check for overflowing 16 bits worth of nodes. 1792- if (nodeCount > 0x10000) { 1793+ 1794+ // Check for overflowing 20 bits worth of nodes. 
1795+ if (nodeCount > 0x100000) { 1796 status = U_ILLEGAL_ARGUMENT_ERROR; 1797 return NULL; 1798 } 1799@@ -1111,9 +1553,14 @@ 1800 status = U_MEMORY_ALLOCATION_ERROR; 1801 return NULL; 1802 } 1803- 1804+ 1805 CompactTrieHeader *header = (CompactTrieHeader *)bytes; 1806- header->size = totalSize; 1807+ //header->size = totalSize; 1808+ if(dict.fValued){ 1809+ header->magic = COMPACT_TRIE_MAGIC_3; 1810+ } else { 1811+ header->magic = COMPACT_TRIE_MAGIC_2; 1812+ } 1813 header->nodeCount = nodeCount; 1814 header->offsets[0] = 0; // Sentinel 1815 header->root = translate.elementAti(root->fNodeID); 1816@@ -1123,23 +1570,40 @@ 1817 } 1818 #endif 1819 uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t)); 1820- nodeCount = 1; 1821+ nodeCount = valueCount + 1; 1822+ 1823+ // Write terminal value nodes to memory 1824+ for (i=0; i < valueNodes.size(); i++) { 1825+ //header->offsets[i + 1] = offset; 1826+ uint32_t tmpOffset = 0; 1827+ node = (BuildCompactTrieNode *) valueNodes.elementAt(i); 1828+ //header->offsets[i + 1] = (uint32_t)node->fValue; 1829+ node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate); 1830+ } 1831+ 1832 // Now write the data 1833 for (i = 1; i < count; ++i) { 1834 node = (BuildCompactTrieNode *)nodes[i]; 1835- if (node->fNodeID == i) { 1836+ if (node->fNodeID == i && node->fNodeType != kValueType) { 1837 header->offsets[nodeCount++] = offset; 1838 node->write(bytes, offset, translate); 1839 } 1840 } 1841+ 1842+ //free all extra space 1843+ uprv_realloc(bytes, offset); 1844+ header->size = offset; 1845+ 1846 #ifdef DEBUG_TRIE_DICT 1847+ fprintf(stdout, "Space freed: %d\n", totalSize-offset); 1848+ 1849 (void) ::times(&timing); 1850 fprintf(stderr, "Trie built, time user %f system %f\n", 1851 (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, 1852 (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); 1853 previous = timing; 1854 fprintf(stderr, "Final offset is %d\n", offset); 1855- 1856+ 1857 // Collect 
statistics on node types and sizes 1858 int hCount = 0; 1859 int vCount = 0; 1860@@ -1148,68 +1612,85 @@ 1861 size_t hItemCount = 0; 1862 size_t vItemCount = 0; 1863 uint32_t previousOff = offset; 1864- for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { 1865+ uint32_t numOverflow = 0; 1866+ uint32_t valueSpace = 0; 1867+ for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { 1868 const CompactTrieNode *node = getCompactNode(header, nodeIdx); 1869- if (node->flagscount & kVerticalNode) { 1870+ int itemCount; 1871+ if(nodeIdx == header->root) 1872+ itemCount = node->flagscount & kRootCountMask; 1873+ else 1874+ itemCount = getCount(node); 1875+ if(node->flagscount & kEqualOverflows){ 1876+ numOverflow++; 1877+ } 1878+ if (node->flagscount & kVerticalNode && nodeIdx != header->root) { 1879 vCount += 1; 1880- vItemCount += (node->flagscount & kCountMask); 1881+ vItemCount += itemCount; 1882 vSize += previousOff-header->offsets[nodeIdx]; 1883 } 1884 else { 1885 hCount += 1; 1886- hItemCount += (node->flagscount & kCountMask); 1887- hSize += previousOff-header->offsets[nodeIdx]; 1888+ hItemCount += itemCount; 1889+ if(nodeIdx >= header->root) { 1890+ hSize += previousOff-header->offsets[nodeIdx]; 1891+ } 1892 } 1893+ 1894+ if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord) 1895+ valueSpace += sizeof(uint16_t); 1896 previousOff = header->offsets[nodeIdx]; 1897 } 1898 fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount, 1899 (double)hSize/hCount, (double)hItemCount/hCount); 1900 fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount, 1901 (double)vSize/vCount, (double)vItemCount/vCount); 1902+ fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow); 1903+ fprintf(stderr, "Space taken up by values: %d \n", valueSpace); 1904 #endif 1905 1906 if (U_FAILURE(status)) { 1907 uprv_free(bytes); 1908 header = NULL; 1909 } 1910- else { 1911- 
header->magic = COMPACT_TRIE_MAGIC_1; 1912- } 1913 return header; 1914 } 1915 1916 // Forward declaration 1917 static TernaryNode * 1918-unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ); 1919- 1920+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ); 1921 1922 // Convert a horizontal node (or subarray thereof) into a ternary subtrie 1923 static TernaryNode * 1924-unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array, 1925- int low, int high, UErrorCode &status ) { 1926+unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode, 1927+ int low, int high, int nodeCount, UErrorCode &status) { 1928 if (U_FAILURE(status) || low > high) { 1929 return NULL; 1930 } 1931 int middle = (low+high)/2; 1932- TernaryNode *result = new TernaryNode(array[middle].ch); 1933+ TernaryNode *result = new TernaryNode(hnode->entries[middle].ch); 1934 if (result == NULL) { 1935 status = U_MEMORY_ALLOCATION_ERROR; 1936 return NULL; 1937 } 1938- const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); 1939+ const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount)); 1940 if (equal->flagscount & kParentEndsWord) { 1941- result->flags |= kEndsWord; 1942+ if(info->magic == COMPACT_TRIE_MAGIC_3){ 1943+ result->flags = getValue(equal); 1944+ }else{ 1945+ result->flags |= kEndsWord; 1946+ } 1947 } 1948- result->low = unpackHorizontalArray(header, array, low, middle-1, status); 1949- result->high = unpackHorizontalArray(header, array, middle+1, high, status); 1950- result->equal = unpackOneNode(header, equal, status); 1951+ result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status); 1952+ result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status); 1953+ result->equal = unpackOneNode(info, equal, status); 1954 return result; 1955 } 1956 
1957 // Convert one compact trie node into a ternary subtrie 1958 static TernaryNode * 1959-unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) { 1960- int nodeCount = (node->flagscount & kCountMask); 1961+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) { 1962+ int nodeCount = getCount(node); 1963 if (nodeCount == 0 || U_FAILURE(status)) { 1964 // Failure, or terminal node 1965 return NULL; 1966@@ -1234,29 +1715,41 @@ 1967 previous = latest; 1968 } 1969 if (latest != NULL) { 1970- const CompactTrieNode *equal = getCompactNode(header, vnode->equal); 1971+ const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode)); 1972 if (equal->flagscount & kParentEndsWord) { 1973- latest->flags |= kEndsWord; 1974+ if(info->magic == COMPACT_TRIE_MAGIC_3){ 1975+ latest->flags = getValue(equal); 1976+ } else { 1977+ latest->flags |= kEndsWord; 1978+ } 1979 } 1980- latest->equal = unpackOneNode(header, equal, status); 1981+ latest->equal = unpackOneNode(info, equal, status); 1982 } 1983 return head; 1984 } 1985 else { 1986 // Horizontal node 1987 const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; 1988- return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status); 1989+ return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status); 1990 } 1991 } 1992 1993+// returns a MutableTrieDictionary generated from the CompactTrieDictionary 1994 MutableTrieDictionary * 1995 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { 1996- MutableTrieDictionary *result = new MutableTrieDictionary( status ); 1997+ MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 ); 1998 if (result == NULL) { 1999 status = U_MEMORY_ALLOCATION_ERROR; 2000 return NULL; 2001 } 2002- TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status); 2003+ // treat root 
node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly 2004+ // because only kEqualOverflows flag should be checked in root's flagscount 2005+ const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *) 2006+ getCompactNode(fInfo, fInfo->root); 2007+ uint16_t nodeCount = hnode->flagscount & kRootCountMask; 2008+ TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, 2009+ nodeCount, status); 2010+ 2011 if (U_FAILURE(status)) { 2012 delete root; // Clean up 2013 delete result; 2014@@ -1270,8 +1763,8 @@ 2015 2016 U_CAPI int32_t U_EXPORT2 2017 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 2018- UErrorCode *status) { 2019- 2020+ UErrorCode *status) { 2021+ 2022 if (status == NULL || U_FAILURE(*status)) { 2023 return 0; 2024 } 2025@@ -1286,14 +1779,14 @@ 2026 // 2027 const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); 2028 if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ 2029- pInfo->dataFormat[1]==0x72 && 2030- pInfo->dataFormat[2]==0x44 && 2031- pInfo->dataFormat[3]==0x63 && 2032- pInfo->formatVersion[0]==1 )) { 2033+ pInfo->dataFormat[1]==0x72 && 2034+ pInfo->dataFormat[2]==0x44 && 2035+ pInfo->dataFormat[3]==0x63 && 2036+ pInfo->formatVersion[0]==1 )) { 2037 udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 2038- pInfo->dataFormat[0], pInfo->dataFormat[1], 2039- pInfo->dataFormat[2], pInfo->dataFormat[3], 2040- pInfo->formatVersion[0]); 2041+ pInfo->dataFormat[0], pInfo->dataFormat[1], 2042+ pInfo->dataFormat[2], pInfo->dataFormat[3], 2043+ pInfo->formatVersion[0]); 2044 *status=U_UNSUPPORTED_ERROR; 2045 return 0; 2046 } 2047@@ -1311,8 +1804,10 @@ 2048 // 2049 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 2050 const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; 2051- if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 2052- || 
ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) 2053+ uint32_t magic = ds->readUInt32(header->magic); 2054+ if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3 2055+ || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1) 2056+ || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) 2057 { 2058 udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"); 2059 *status=U_UNSUPPORTED_ERROR; 2060@@ -1333,10 +1828,10 @@ 2061 // 2062 if (length < sizeWithUData) { 2063 udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n", 2064- totalSize); 2065+ totalSize); 2066 *status=U_INDEX_OUTOFBOUNDS_ERROR; 2067 return 0; 2068- } 2069+ } 2070 2071 // 2072 // Swap the Data. Do the data itself first, then the CompactTrieHeader, because 2073@@ -1355,20 +1850,38 @@ 2074 } 2075 2076 // We need to loop through all the nodes in the offset table, and swap each one. 2077- uint16_t nodeCount = ds->readUInt16(header->nodeCount); 2078+ uint32_t nodeCount, rootId; 2079+ if(header->magic == COMPACT_TRIE_MAGIC_1) { 2080+ nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); 2081+ rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); 2082+ } else { 2083+ nodeCount = ds->readUInt32(header->nodeCount); 2084+ rootId = ds->readUInt32(header->root); 2085+ } 2086+ 2087 // Skip node 0, which should always be 0. 
2088- for (int i = 1; i < nodeCount; ++i) { 2089+ for (uint32_t i = 1; i < nodeCount; ++i) { 2090 uint32_t nodeOff = ds->readUInt32(header->offsets[i]); 2091 const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff); 2092 CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); 2093 uint16_t flagscount = ds->readUInt16(inNode->flagscount); 2094- uint16_t itemCount = flagscount & kCountMask; 2095+ uint16_t itemCount = getCount(inNode); 2096+ //uint16_t itemCount = flagscount & kCountMask; 2097 ds->writeUInt16(&outNode->flagscount, flagscount); 2098 if (itemCount > 0) { 2099- if (flagscount & kVerticalNode) { 2100+ uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped 2101+ if (flagscount & kVerticalNode && i != rootId) { 2102+ if(flagscount & kEqualOverflows){ 2103+ // include overflow bits 2104+ overflow += 1; 2105+ } 2106+ if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) { 2107+ //include values 2108+ overflow += 1; 2109+ } 2110 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), 2111- itemCount*sizeof(uint16_t), 2112- outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); 2113+ (itemCount + overflow)*sizeof(uint16_t), 2114+ outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); 2115 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal); 2116 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal)); 2117 } 2118@@ -1381,26 +1894,62 @@ 2119 word = ds->readUInt16(inHNode->entries[j].equal); 2120 ds->writeUInt16(&outHNode->entries[j].equal, word); 2121 } 2122+ 2123+ // swap overflow/value information 2124+ if(flagscount & kEqualOverflows){ 2125+ overflow += (itemCount + 3) / 4; 2126+ } 2127+ 2128+ if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) { 2129+ //include values 2130+ overflow += 1; 2131+ } 2132+ 2133+ uint16_t *inOverflow = (uint16_t *) 
&inHNode->entries[itemCount]; 2134+ uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount]; 2135+ for(int j = 0; j<overflow; j++){ 2136+ uint16_t extraInfo = ds->readUInt16(*inOverflow); 2137+ ds->writeUInt16(outOverflow, extraInfo); 2138+ 2139+ inOverflow++; 2140+ outOverflow++; 2141+ } 2142 } 2143 } 2144 } 2145 #endif 2146 2147- // All the data in all the nodes consist of 16 bit items. Swap them all at once. 2148- uint16_t nodeCount = ds->readUInt16(header->nodeCount); 2149- uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t)); 2150- ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); 2151- 2152 // Swap the header 2153 ds->writeUInt32(&outputHeader->size, totalSize); 2154- uint32_t magic = ds->readUInt32(header->magic); 2155 ds->writeUInt32(&outputHeader->magic, magic); 2156- ds->writeUInt16(&outputHeader->nodeCount, nodeCount); 2157- uint16_t root = ds->readUInt16(header->root); 2158- ds->writeUInt16(&outputHeader->root, root); 2159- ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), 2160- sizeof(uint32_t)*(int32_t)nodeCount, 2161- outBytes+offsetof(CompactTrieHeader,offsets), status); 2162+ 2163+ uint32_t nodeCount; 2164+ uint32_t offsetPos; 2165+ if (magic == COMPACT_TRIE_MAGIC_1) { // test the value read through ds->readUInt32, not the raw input-endian field 2166+ CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header; 2167+ CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader; 2168+ 2169+ nodeCount = ds->readUInt16(headerV1->nodeCount); 2170+ ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount); 2171+ uint16_t root = ds->readUInt16(headerV1->root); 2172+ ds->writeUInt16(&outputHeaderV1->root, root); 2173+ offsetPos = offsetof(CompactTrieHeaderV1,offsets); 2174+ } else { 2175+ nodeCount = ds->readUInt32(header->nodeCount); 2176+ ds->writeUInt32(&outputHeader->nodeCount, nodeCount); 2177+ uint32_t root = ds->readUInt32(header->root); 2178+ ds->writeUInt32(&outputHeader->root, root); 2179+ offsetPos 
= offsetof(CompactTrieHeader,offsets); 2180+ } 2181+ 2182+ // All the data in all the nodes consist of 16 bit items. Swap them all at once. 2183+ uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t)); 2184+ ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); 2185+ 2186+ //swap offsets 2187+ ds->swapArray32(ds, inBytes+offsetPos, 2188+ sizeof(uint32_t)*(uint32_t)nodeCount, 2189+ outBytes+offsetPos, status); 2190 2191 return sizeWithUData; 2192 } 2193--- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700 2194+++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800 2195@@ -47,7 +47,6 @@ 2196 U_NAMESPACE_BEGIN 2197 2198 class StringEnumeration; 2199-struct CompactTrieHeader; 2200 2201 /******************************************************************* 2202 * TrieWordDictionary 2203@@ -72,23 +71,29 @@ 2204 */ 2205 virtual ~TrieWordDictionary(); 2206 2207+ /** 2208+ * <p>Returns true if the dictionary contains values associated with each word.</p> 2209+ */ 2210+ virtual UBool getValued() const = 0; 2211+ 2212 /** 2213 * <p>Find dictionary words that match the text.</p> 2214 * 2215 * @param text A UText representing the text. The 2216 * iterator is left after the longest prefix match in the dictionary. 2217- * @param start The current position in text. 2218 * @param maxLength The maximum number of code units to match. 2219 * @param lengths An array that is filled with the lengths of words that matched. 2220 * @param count Filled with the number of elements output in lengths. 2221 * @param limit The size of the lengths array; this limits the number of words output. 2222+ * @param values An array that is filled with the values associated with the matched words. 2223 * @return The number of characters in text that were matched. 
2224 */ 2225 virtual int32_t matches( UText *text, 2226 int32_t maxLength, 2227 int32_t *lengths, 2228 int &count, 2229- int limit ) const = 0; 2230+ int limit, 2231+ uint16_t *values = NULL) const = 0; 2232 2233 /** 2234 * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p> 2235@@ -128,6 +133,12 @@ 2236 2237 UText *fIter; 2238 2239+ /** 2240+ * A UBool indicating whether the dictionary stores a value with each word 2241+ * @internal 2242+ */ 2243+ UBool fValued; 2244+ 2245 friend class CompactTrieDictionary; // For fast conversion 2246 2247 public: 2248@@ -138,14 +149,29 @@ 2249 * @param median A UChar around which to balance the trie. Ideally, it should 2250 * begin at least one word that is near the median of the set in the dictionary 2251 * @param status A status code recording the success of the call. 2252+ * @param containsValue True if the dictionary stores values associated with each word. 2253 */ 2254- MutableTrieDictionary( UChar median, UErrorCode &status ); 2255+ MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE ); 2256 2257 /** 2258 * <p>Virtual destructor.</p> 2259 */ 2260 virtual ~MutableTrieDictionary(); 2261 2262+ /** 2263+ * Indicate whether the MutableTrieDictionary stores values associated with each word 2264+ */ 2265+ void setValued(UBool valued){ 2266+ fValued = valued; 2267+ } 2268+ 2269+ /** 2270+ * <p>Returns true if the dictionary contains values associated with each word.</p> 2271+ */ 2272+ virtual UBool getValued() const { 2273+ return fValued; 2274+ } 2275+ 2276 /** 2277 * <p>Find dictionary words that match the text.</p> 2278 * 2279@@ -155,13 +181,15 @@ 2280 * @param lengths An array that is filled with the lengths of words that matched. 2281 * @param count Filled with the number of elements output in lengths. 2282 * @param limit The size of the lengths array; this limits the number of words output. 2283+ * @param values An array that is filled with the values associated with the matched words. 
2284 * @return The number of characters in text that were matched. 2285 */ 2286 virtual int32_t matches( UText *text, 2287 int32_t maxLength, 2288 int32_t *lengths, 2289 int &count, 2290- int limit ) const; 2291+ int limit, 2292+ uint16_t *values = NULL) const; 2293 2294 /** 2295 * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p> 2296@@ -173,15 +201,17 @@ 2297 virtual StringEnumeration *openWords( UErrorCode &status ) const; 2298 2299 /** 2300- * <p>Add one word to the dictionary.</p> 2301+ * <p>Add one word to the dictionary with an optional associated value.</p> 2302 * 2303 * @param word A UChar buffer containing the word. 2304 * @param length The length of the word. 2305- * @param status The resultant status 2306+ * @param status The resultant status. 2307+ * @param value The nonzero value associated with this word. 2308 */ 2309 virtual void addWord( const UChar *word, 2310 int32_t length, 2311- UErrorCode &status); 2312+ UErrorCode &status, 2313+ uint16_t value = 0); 2314 2315 #if 0 2316 /** 2317@@ -203,8 +233,9 @@ 2318 * @param lengths An array that is filled with the lengths of words that matched. 2319 * @param count Filled with the number of elements output in lengths. 2320 * @param limit The size of the lengths array; this limits the number of words output. 2321- * @param parent The parent of the current node 2322- * @param pMatched The returned parent node matched the input 2323+ * @param parent The parent of the current node. 2324+ * @param pMatched The returned parent node matched the input. 2325+ * @param values An array that is filled with the values associated with the matched words. 2326 * @return The number of characters in text that were matched. 
2327 */ 2328 virtual int32_t search( UText *text, 2329@@ -213,40 +244,46 @@ 2330 int &count, 2331 int limit, 2332 TernaryNode *&parent, 2333- UBool &pMatched ) const; 2334+ UBool &pMatched, 2335+ uint16_t *values = NULL) const; 2336 2337 private: 2338 /** 2339 * <p>Private constructor. The root node it not allocated.</p> 2340 * 2341 * @param status A status code recording the success of the call. 2342+ * @param containsValues True if the dictionary will store a value associated 2343+ * with each word added. 2344 */ 2345- MutableTrieDictionary( UErrorCode &status ); 2346+ MutableTrieDictionary( UErrorCode &status, UBool containsValues = false ); 2347 }; 2348 2349 /******************************************************************* 2350 * CompactTrieDictionary 2351 */ 2352 2353+//forward declarations 2354+struct CompactTrieHeader; 2355+struct CompactTrieInfo; 2356+ 2357 /** 2358 * <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted 2359 * to save space.</p> 2360 */ 2361 class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary { 2362 private: 2363- /** 2364- * The root node of the trie 2365- */ 2366+ /** 2367+ * The header of the CompactTrieDictionary which contains all info 2368+ */ 2369 2370- const CompactTrieHeader *fData; 2371- 2372- /** 2373- * A UBool indicating whether or not we own the fData. 2374- */ 2375+ CompactTrieInfo *fInfo; 2376 2377+ /** 2378+ * A UBool indicating whether or not we own the fData. 
2379+ */ 2380 UBool fOwnData; 2381 2382- UDataMemory *fUData; 2383+ UDataMemory *fUData; 2384 public: 2385 /** 2386 * <p>Construct a dictionary from a UDataMemory.</p> 2387@@ -277,6 +314,11 @@ 2388 */ 2389 virtual ~CompactTrieDictionary(); 2390 2391+ /** 2392+ * <p>Returns true if the dictionary contains values associated with each word.</p> 2393+ */ 2394+ virtual UBool getValued() const; 2395+ 2396 /** 2397 * <p>Find dictionary words that match the text.</p> 2398 * 2399@@ -286,13 +328,15 @@ 2400 * @param lengths An array that is filled with the lengths of words that matched. 2401 * @param count Filled with the number of elements output in lengths. 2402 * @param limit The size of the lengths array; this limits the number of words output. 2403+ * @param values An array that is filled with the values associated with the matched words. 2404 * @return The number of characters in text that were matched. 2405 */ 2406 virtual int32_t matches( UText *text, 2407- int32_t rangeEnd, 2408+ int32_t maxLength, 2409 int32_t *lengths, 2410 int &count, 2411- int limit ) const; 2412+ int limit, 2413+ uint16_t *values = NULL) const; 2414 2415 /** 2416 * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p> 2417@@ -311,7 +355,7 @@ 2418 virtual uint32_t dataSize() const; 2419 2420 /** 2421- * <p>Return a void * pointer to the compact data, platform-endian.</p> 2422+ * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian.</p> 2423 * 2424 * @return The data for the compact dictionary, suitable for passing to the 2425 * constructor. 
2426@@ -342,5 +386,5 @@ 2427 2428 U_NAMESPACE_END 2429 2430- /* TRIEDICT_H */ 2431+/* TRIEDICT_H */ 2432 #endif 2433--- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700 2434+++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800 2435@@ -509,8 +520,9 @@ 2436 #################################################### CTD 2437 # CTD FILES 2438 2439-$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) 2440- $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $< 2441+# .ctd file now generated regardless of whether dictionary file exists 2442+$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) 2443+ $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt 2444 2445 #################################################### CFU 2446 # CFU FILES 2447--- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700 2448+++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800 2449@@ -17,5 +17,8 @@ 2450 } 2451 dictionaries{ 2452 Thai:process(dependency){"thaidict.ctd"} 2453+ Hani:process(dependency){"cjdict.ctd"} 2454+ Hira:process(dependency){"cjdict.ctd"} 2455+ Kata:process(dependency){"cjdict.ctd"} 2456 } 2457 } 2458--- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800 2459+++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800 2460@@ -25,6 +25,9 @@ 2461 </icu:boundaries> 2462 <icu:dictionaries> 2463 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/> 2464+ <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/> 2465+ <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/> 2466+ <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/> 2467 </icu:dictionaries> 2468 </icu:breakIteratorData> 2469 </special> 2470--- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700 2471+++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800 2472@@ -2188,21 +2188,21 @@ 2473 2474 2475 { 2476- UResourceBundle* ja = 
ures_open(U_ICUDATA_BRKITR,"ja", &status); 2477+ UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status); 2478 const UChar *got = NULL, *exp=NULL; 2479 int32_t gotLen = 0, expLen=0; 2480- ja = ures_getByKey(ja, "boundaries", ja, &status); 2481- exp = tres_getString(ja, -1, "word", &expLen, &status); 2482+ th = ures_getByKey(th, "boundaries", th, &status); 2483+ exp = tres_getString(th, -1, "grapheme", &expLen, &status); 2484 2485 tb = ures_getByKey(aliasB, "boundaries", tb, &status); 2486- got = tres_getString(tb, -1, "word", &gotLen, &status); 2487+ got = tres_getString(tb, -1, "grapheme", &gotLen, &status); 2488 2489 if(U_FAILURE(status)) { 2490 log_err("%s trying to read str boundaries\n", u_errorName(status)); 2491 } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) { 2492 log_err("Referencing alias didn't get the right data\n"); 2493 } 2494- ures_close(ja); 2495+ ures_close(th); 2496 status = U_ZERO_ERROR; 2497 } 2498 /* simple alias */ 2499--- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700 2500+++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800 2501@@ -156,9 +156,13 @@ 2502 if(*a!=*b){ 2503 errln("Failed: boilerplate method operator!= does not return correct results"); 2504 } 2505- BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); 2506- if(a && c){ 2507- if(*c==*a){ 2508+ // Japanese word break iterator is identical to root with 2509+ // a dictionary-based break iterator, but Thai character break iterator 2510+ // is still different from Root. 
2511+ BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status); 2512+ BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),status); 2513+ if(c && d){ 2514+ if(*c==*d){ 2515 errln("Failed: boilerplate method opertator== does not return correct results"); 2516 } 2517 }else{ 2518@@ -167,6 +171,7 @@ 2519 delete a; 2520 delete b; 2521 delete c; 2522+ delete d; 2523 } 2524 2525 void RBBIAPITest::TestgetRules() 2526@@ -635,21 +640,21 @@ 2527 // 2528 void RBBIAPITest::TestRuleStatus() { 2529 UChar str[30]; 2530- u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094", 2531- // 012345678901234567 8 9 0 1 2 3 4 5 6 2532- // Ideographic Katakana Hiragana 2533+ //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing 2534+ // changed UBRK_WORD_KANA to UBRK_WORD_IDEO 2535+ u_unescape("plain word 123.45 \\u30a1\\u30a2 ", 2536+ // 012345678901234567 8 9 0 2537+ // Katakana 2538 str, 30); 2539 UnicodeString testString1(str); 2540- int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; 2541+ int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; 2542 int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, 2543 UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, 2544- UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, 2545- UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA}; 2546+ UBRK_WORD_IDEO, UBRK_WORD_NONE}; 2547 2548 int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, 2549 UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, 2550- UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT, 2551- UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; 2552+ UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; 2553 2554 UErrorCode status=U_ZERO_ERROR; 2555 2556@@ -888,9 +893,11 @@ 2557 2558 URegistryKey key = 
BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status); 2559 { 2560+#if 0 // With a dictionary based word breaking, ja_word is identical to root. 2561 if (ja_word && *ja_word == *root_word) { 2562 errln("japan not different from root"); 2563 } 2564+#endif 2565 } 2566 2567 { 2568--- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700 2569+++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800 2570@@ -35,6 +35,8 @@ 2571 #include <string.h> 2572 #include <stdio.h> 2573 #include <stdlib.h> 2574+#include "unicode/numfmt.h" 2575+#include "unicode/uscript.h" 2576 2577 #define TEST_ASSERT(x) {if (!(x)) { \ 2578 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 2579@@ -138,11 +140,13 @@ 2580 if (exec) TestThaiBreaks(); break; 2581 case 23: name = "TestTailoredBreaks"; 2582 if (exec) TestTailoredBreaks(); break; 2583+ case 24: name = "TestTrieDictWithValue"; 2584+ if(exec) TestTrieDictWithValue(); break; 2585 #else 2586- case 21: case 22: case 23: name = "skip"; 2587+ case 21: case 22: case 23: case 24: name = "skip"; 2588 break; 2589 #endif 2590- case 24: name = "TestDictRules"; 2591+ case 25: name = "TestDictRules"; 2592 if (exec) TestDictRules(); break; 2593 case 25: name = "TestBug5532"; 2594 if (exec) TestBug5532(); break; 2595@@ -607,6 +611,8 @@ 2596 2597 2598 void RBBITest::TestJapaneseWordBreak() { 2599+// TODO: Rewrite this test for a dictionary-based word breaking. 
2600+#if 0 2601 UErrorCode status = U_ZERO_ERROR; 2602 BITestData japaneseWordSelection(status); 2603 2604@@ -628,6 +634,7 @@ 2605 2606 generalIteratorTest(*e, japaneseWordSelection); 2607 delete e; 2608+#endif 2609 } 2610 2611 void RBBITest::TestTrieDict() { 2612@@ -849,6 +856,372 @@ 2613 delete compact2; 2614 } 2615 2616+/* Debug helper: writes each word of enumer to filename as UTF-8, one word per line. TODO: delete later */ 2617+inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ 2618+ UErrorCode status = U_ZERO_ERROR; 2619+ UConverter *cvt = ucnv_open("UTF-8", &status); 2620+ if (U_FAILURE(status)) 2621+ return; // bail out before the file is opened so no handle is leaked 2622+ FILE *outfile = fopen(filename,"w"); 2623+ if(outfile != NULL){ 2624+ status = U_ZERO_ERROR; 2625+ const UnicodeString *word = enumer->snext(status); 2626+ while (word != NULL && U_SUCCESS(status)) { 2627+ char u8word[500] = {0}; // zero-filled; the last byte is reserved as the terminator 2628+ status = U_ZERO_ERROR; 2629+ ucnv_fromUChars(cvt, u8word, (int32_t)sizeof(u8word) - 1, word->getBuffer(), word->length(), 2630+ &status); 2631+ fprintf(outfile,"%s\n", u8word); 2632+ status = U_ZERO_ERROR; 2633+ word = enumer->snext(status); 2634+ } 2635+ fclose(outfile); 2636+ } 2637+ ucnv_close(cvt); 2638+} 2639+ 2640+// A very simple helper class to streamline the buffer handling in 2641+// TestTrieDictWithValue 2642+template<class T, size_t N> 2643+class AutoBuffer { 2644+ public: 2645+ AutoBuffer(size_t size) : buffer(stackBuffer) { 2646+ if (size > N) 2647+ buffer = new T[size]; 2648+ } 2649+ ~AutoBuffer() { 2650+ if (buffer != stackBuffer) 2651+ delete [] buffer; 2652+ } 2653+ T* elems() { 2654+ return buffer; 2655+ } 2656+ const T& operator[] (size_t i) const { 2657+ return buffer[i]; 2658+ } 2659+ T& operator[] (size_t i) { 2660+ return buffer[i]; 2661+ } 2662+ private: 2663+ T stackBuffer[N]; 2664+ T* buffer; 2665+ AutoBuffer(); 2666+}; 2667+ 2668+//---------------------------------------------------------------------------- 2669+// 2670+// TestTrieDictWithValue Test trie dictionaries with logprob values and 2671+// more than 2^16 nodes after compaction. 
2672+// 2673+//---------------------------------------------------------------------------- 2674+void RBBITest::TestTrieDictWithValue() { 2675+ UErrorCode status = U_ZERO_ERROR; 2676+ 2677+ // 2678+ // Open and read the test data file. 2679+ // 2680+ const char *testDataDirectory = IntlTest::getSourceTestData(status); 2681+ const char *filename = "cjdict-truncated.txt"; 2682+ char testFileName[1000]; 2683+ if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) { 2684+ errln("Can't open test data. Path too long."); 2685+ return; 2686+ } 2687+ strcpy(testFileName, testDataDirectory); 2688+ strcat(testFileName, filename); 2689+ 2690+ // Items needing deleting at the end 2691+ MutableTrieDictionary *mutableDict = NULL; 2692+ CompactTrieDictionary *compactDict = NULL; 2693+ UnicodeSet *breaks = NULL; 2694+ UChar *testFile = NULL; 2695+ StringEnumeration *enumer1 = NULL; 2696+ StringEnumeration *enumer2 = NULL; 2697+ MutableTrieDictionary *mutable2 = NULL; 2698+ StringEnumeration *cloneEnum = NULL; 2699+ CompactTrieDictionary *compact2 = NULL; 2700+ NumberFormat *nf = NULL; 2701+ UText *originalText = NULL, *cloneText = NULL; 2702+ 2703+ const UnicodeString *originalWord = NULL; 2704+ const UnicodeString *cloneWord = NULL; 2705+ UChar *current; 2706+ UChar *word; 2707+ UChar uc; 2708+ int32_t wordLen; 2709+ int32_t wordCount; 2710+ int32_t testCount; 2711+ int32_t valueLen; 2712+ int counter = 0; 2713+ 2714+ int len; 2715+ testFile = ReadAndConvertFile(testFileName, len, NULL, status); 2716+ if (U_FAILURE(status)) { 2717+ goto cleanup; /* something went wrong, error already output */ 2718+ } 2719+ 2720+ mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE); 2721+ if (U_FAILURE(status)) { 2722+ errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); 2723+ goto cleanup; 2724+ } 2725+ 2726+ breaks = new UnicodeSet; 2727+ breaks->add(0x000A); // Line Feed 2728+ breaks->add(0x000D); // Carriage 
Return 2729+ breaks->add(0x2028); // Line Separator 2730+ breaks->add(0x2029); // Paragraph Separator 2731+ breaks->add(0x0009); // Tab character 2732+ 2733+ // Now add each non-comment line of the file as a word. 2734+ current = testFile; 2735+ word = current; 2736+ uc = *current++; 2737+ wordLen = 0; 2738+ wordCount = 0; 2739+ nf = NumberFormat::createInstance(status); 2740+ 2741+ while (uc) { 2742+ UnicodeString ucharValue; 2743+ valueLen = 0; 2744+ 2745+ if (uc == 0x0023) { // #comment line, skip 2746+ while (uc && !breaks->contains(uc)) { 2747+ uc = *current++; 2748+ } 2749+ } 2750+ else{ 2751+ while (uc && !breaks->contains(uc)) { 2752+ ++wordLen; 2753+ uc = *current++; 2754+ } 2755+ if(uc == 0x0009){ //separator is a tab char, read in num after tab 2756+ uc = *current++; 2757+ while (uc && !breaks->contains(uc)) { 2758+ ucharValue.append(uc); 2759+ uc = *current++; 2760+ } 2761+ } 2762+ } 2763+ if (wordLen > 0) { 2764+ Formattable value((int32_t)0); 2765+ nf->parse(ucharValue.getTerminatedBuffer(), value, status); 2766+ 2767+ if(U_FAILURE(status)){ 2768+ errln("parsing of value failed when reading in dictionary\n"); 2769+ goto cleanup; 2770+ } 2771+ mutableDict->addWord(word, wordLen, status, value.getLong()); 2772+ if (U_FAILURE(status)) { 2773+ errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); 2774+ goto cleanup; 2775+ } 2776+ wordCount += 1; 2777+ } 2778+ 2779+ // Find beginning of next line 2780+ while (uc && breaks->contains(uc)) { 2781+ uc = *current++; 2782+ } 2783+ word = current-1; 2784+ wordLen = 0; 2785+ } 2786+ 2787+ if (wordCount < 50) { 2788+ errln("Word count (%d) unreasonably small\n", wordCount); 2789+ goto cleanup; 2790+ } 2791+ 2792+ enumer1 = mutableDict->openWords(status); 2793+ if (U_FAILURE(status)) { 2794+ errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); 2795+ goto cleanup; 2796+ } 2797+ 2798+ testCount = 0; 2799+ if (wordCount != (testCount = enumer1->count(status))) 
{ 2800+ errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 2801+ testCount, wordCount, u_errorName(status)); 2802+ goto cleanup; 2803+ } 2804+ 2805+ // Now compact it 2806+ compactDict = new CompactTrieDictionary(*mutableDict, status); 2807+ if (U_FAILURE(status)) { 2808+ errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); 2809+ goto cleanup; 2810+ } 2811+ 2812+ enumer2 = compactDict->openWords(status); 2813+ if (U_FAILURE(status)) { 2814+ errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); 2815+ goto cleanup; 2816+ } 2817+ 2818+ 2819+ //delete later 2820+// writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt"); 2821+// writeEnumerationToFile(enumer2, "/home/jchye/compact.txt"); 2822+ 2823+ enumer1->reset(status); 2824+ enumer2->reset(status); 2825+ 2826+ originalWord = enumer1->snext(status); 2827+ cloneWord = enumer2->snext(status); 2828+ while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 2829+ if (*originalWord != *cloneWord) { 2830+ errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n", 2831+ counter, originalWord->length(), cloneWord->length()); 2832+ goto cleanup; 2833+ } 2834+ 2835+ // check if attached values of the same word in both dictionaries tally 2836+#if 0 2837+ int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()]; 2838+ uint16_t values1[originalWord->length()], values2[cloneWord->length()]; 2839+#endif 2840+ AutoBuffer<int32_t, 20> lengths1(originalWord->length()); 2841+ AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); 2842+ AutoBuffer<uint16_t, 20> values1(originalWord->length()); 2843+ AutoBuffer<uint16_t, 20> values2(cloneWord->length()); 2844+ 2845+ originalText = utext_openConstUnicodeString(originalText, originalWord, &status); 2846+ cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); 2847+ 2848+ int count1, count2; 
2849+ mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); 2850+ compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); 2851+ 2852+ if(values1[count1-1] != values2[count2-1]){ 2853+ errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n", 2854+ counter, values1[count1-1], values2[count2-1]); 2855+ goto cleanup; 2856+ } 2857+ 2858+ counter++; 2859+ originalWord = enumer1->snext(status); 2860+ cloneWord = enumer2->snext(status); 2861+ } 2862+ if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) { 2863+ errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); 2864+ } 2865+ 2866+ delete enumer1; 2867+ enumer1 = NULL; 2868+ delete enumer2; 2869+ enumer2 = NULL; 2870+ 2871+ // Now un-compact it 2872+ mutable2 = compactDict->cloneMutable(status); 2873+ if (U_FAILURE(status)) { 2874+ errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); 2875+ goto cleanup; 2876+ } 2877+ 2878+ cloneEnum = mutable2->openWords(status); 2879+ if (U_FAILURE(status)) { 2880+ errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); 2881+ goto cleanup; 2882+ } 2883+ 2884+ if (wordCount != (testCount = cloneEnum->count(status))) { 2885+ errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", 2886+ testCount, wordCount, u_errorName(status)); 2887+ goto cleanup; 2888+ } 2889+ 2890+ // Compare original dictionary to clone. 
Note that we can only compare the same kind of 2891+ // dictionary as the order of the enumerators is not guaranteed to be the same between 2892+ // different kinds 2893+ enumer1 = mutableDict->openWords(status); 2894+ if (U_FAILURE(status)) { 2895+ errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); 2896+ goto cleanup; 2897+ } 2898+ 2899+ counter = 0; 2900+ originalWord = enumer1->snext(status); 2901+ cloneWord = cloneEnum->snext(status); 2902+ while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { 2903+ if (*originalWord != *cloneWord) { 2904+ errln("Original and cloned MutableTrieDictionary word mismatch\n"); 2905+ goto cleanup; 2906+ } 2907+ 2908+ // check if attached values of the same word in both dictionaries tally 2909+ AutoBuffer<int32_t, 20> lengths1(originalWord->length()); 2910+ AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); 2911+ AutoBuffer<uint16_t, 20> values1(originalWord->length()); 2912+ AutoBuffer<uint16_t, 20> values2(cloneWord->length()); 2913+ originalText = utext_openConstUnicodeString(originalText, originalWord, &status); 2914+ cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); 2915+ 2916+ int count1, count2; 2917+ mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); 2918+ mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); 2919+ 2920+ if(values1[count1-1] != values2[count2-1]){ 2921+ errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n", 2922+ counter, values1[count1-1], values2[count2-1]); 2923+ goto cleanup; 2924+ } 2925+ 2926+ counter++; 2927+ 2928+ originalWord = enumer1->snext(status); 2929+ cloneWord = cloneEnum->snext(status); 2930+ } 2931+ 2932+ if (U_FAILURE(status)) { 2933+ errln("Enumeration failed: %s\n", u_errorName(status)); 2934+ goto cleanup; 2935+ } 2936+ 
2937+ if (originalWord != cloneWord) { 2938+ errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); 2939+ goto cleanup; 2940+ } 2941+ 2942+ // Test the data copying constructor for CompactTrieDict, and the data access APIs. 2943+ compact2 = new CompactTrieDictionary(compactDict->data(), status); 2944+ if (U_FAILURE(status)) { 2945+ errln("CompactTrieDictionary(const void *,...) failed\n"); 2946+ goto cleanup; 2947+ } 2948+ 2949+ if (compact2->dataSize() == 0) { 2950+ errln("CompactTrieDictionary->dataSize() == 0\n"); 2951+ goto cleanup; 2952+ } 2953+ 2954+ // Now count the words via the second dictionary 2955+ delete enumer1; 2956+ enumer1 = compact2->openWords(status); 2957+ if (U_FAILURE(status)) { 2958+ errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); 2959+ goto cleanup; 2960+ } 2961+ 2962+ if (wordCount != (testCount = enumer1->count(status))) { 2963+ errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", 2964+ testCount, wordCount, u_errorName(status)); 2965+ goto cleanup; 2966+ } 2967+ 2968+ cleanup: 2969+ delete compactDict; 2970+ delete mutableDict; 2971+ delete breaks; 2972+ delete[] testFile; 2973+ delete enumer1; 2974+ delete mutable2; 2975+ delete cloneEnum; 2976+ delete compact2; 2977+ utext_close(originalText); 2978+ utext_close(cloneText); 2979+ 2980+ 2981+} 2982 2983 //---------------------------------------------------------------------------- 2984 // 2985@@ -1870,8 +2243,15 @@ 2986 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
2987 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" 2988 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; 2989+#if 0 2990 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 }; 2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; 2992+#endif 2993+// There's no separate Japanese word break iterator. Root is the same as Japanese. 2994+// Our dictionary-based iterator has to be tweaked to better handle U+3005, 2995+// U+3007, U+303B and some other cases. 2996+static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; 2997+static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; 2998 2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el" 3000 // Add break after Greek question mark (cldrbug #2069). 3001@@ -2672,6 +3052,8 @@ 3002 UnicodeSet *fNewlineSet; 3003 UnicodeSet *fKatakanaSet; 3004 UnicodeSet *fALetterSet; 3005+ // TODO(jungshik): Do we still need this change? 
3006+ // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt 3007 UnicodeSet *fMidNumLetSet; 3008 UnicodeSet *fMidLetterSet; 3009 UnicodeSet *fMidNumSet; 3010@@ -2680,6 +3062,7 @@ 3011 UnicodeSet *fOtherSet; 3012 UnicodeSet *fExtendSet; 3013 UnicodeSet *fExtendNumLetSet; 3014+ UnicodeSet *fDictionaryCjkSet; 3015 3016 RegexMatcher *fMatcher; 3017 3018@@ -2696,12 +3079,24 @@ 3019 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 3020 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 3021 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 3022- fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 3023+ fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); 3024+ // Exclude Hangul syllables from ALetterSet during testing. 3025+ // Leave CJK dictionary characters out from the monkey tests! 3026+#if 0 3027+ fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" 3028+ "[\\p{Line_Break = Complex_Context}" 3029+ "-\\p{Grapheme_Cluster_Break = Extend}" 3030+ "-\\p{Grapheme_Cluster_Break = Control}" 3031+ "]]", 3032+ status); 3033+#endif 3034+ fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 3035+ fALetterSet->removeAll(*fDictionaryCjkSet); 3036 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 3037 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 3038 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 3039 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 3040- fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 3041+ fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status); 3042 
fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 3043 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 3044 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 3045@@ -2725,13 +3120,14 @@ 3046 fOtherSet->removeAll(*fFormatSet); 3047 fOtherSet->removeAll(*fExtendSet); 3048 // Inhibit dictionary characters from being tested at all. 3049+ fOtherSet->removeAll(*fDictionaryCjkSet); 3050 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 3051 3052 fSets->addElement(fCRSet, status); 3053 fSets->addElement(fLFSet, status); 3054 fSets->addElement(fNewlineSet, status); 3055 fSets->addElement(fALetterSet, status); 3056- fSets->addElement(fKatakanaSet, status); 3057+ //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana 3058 fSets->addElement(fMidLetterSet, status); 3059 fSets->addElement(fMidNumLetSet, status); 3060 fSets->addElement(fMidNumSet, status); 3061@@ -3978,6 +4374,7 @@ 3062 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3063 count --; 3064 if (forward[count] != i) { 3065+ printStringBreaks(ustr, expected, expectedcount); 3066 test->errln("happy break test previous() failed: expected %d but got %d", 3067 forward[count], i); 3068 break; 3069@@ -4011,23 +4408,25 @@ 3070 UErrorCode status = U_ZERO_ERROR; 3071 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3072 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3073+ // Replaced any C+J characters in a row with a random sequence of characters 3074+ // of the same length to make our C+J segmentation not get in the way. 
3075 static const char *strlist[] = 3076 { 3077 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3078- "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", 3079+ "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3080 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3081 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3082- "\\u90ca\\u3588\\u009c\\u0953\\u194b", 3083+ "\\uac00\\u3588\\u009c\\u0953\\u194b", 3084 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3085 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3086- "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", 3087+ "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3088 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3089 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3090 "\\u2027\\U000e0067\\u0a47\\u00b7", 3091 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3092 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3093 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3094- "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3095+ "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3096 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3097 "\\u0027\\u11af\\U000e0057\\u0602", 3098 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3099@@ -4039,7 +4438,7 @@ 3100 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3101 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3102 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3103- "\\u58f4\\U000e0049\\u20e7\\u2027", 3104+ "\\u18f4\\U000e0049\\u20e7\\u2027", 3105 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3106 "\\ua183\\u102d\\u0bec\\u003a", 3107 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3108@@ -4049,7 +4448,7 @@ 3109 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3110 "\\u003a\\u0664\\u00b7\\u1fba", 3111 "\\u003b\\u0027\\u00b7\\u47a3", 3112- 
"\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", 3113+ "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3114 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3115 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3116 }; 3117@@ -4104,12 +4503,12 @@ 3118 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3119 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3120 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3121- "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3122+ "\\U000e0065\\u302c\\u09ee\\U000e0068", 3123 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3124 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3125 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3126 "\\u58f4\\U000e0049\\u20e7\\u2027", 3127- "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3128+ "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3129 "\\ua183\\u102d\\u0bec\\u003a", 3130 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3131 "\\u003a\\u0e57\\u0fad\\u002e", 3132--- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700 3133+++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800 3134@@ -70,6 +70,7 @@ 3135 void TestBug5775(); 3136 void TestThaiBreaks(); 3137 void TestTailoredBreaks(); 3138+ void TestTrieDictWithValue(); 3139 void TestDictRules(); 3140 void TestBug5532(); 3141 3142--- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700 3143+++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800 3144@@ -161,7 +161,23 @@ 3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> 3146 3147 # Hiragana & Katakana stay together, but separates from each other and Latin. 
3148-<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data> 3149+# *** what to do about theoretical combos of chars? i.e. hiragana + accent 3150+#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data> 3151+ 3152+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth 3153+<data>•芽キャベツ<400>芽キャベツ<400></data> 3154+ 3155+# more Japanese tests 3156+# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana 3157+# and the Katakana block are not treated correctly. Enable this later. 3158+#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> 3159+<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> 3160+ 3161+# Testing of word boundary for dictionary word containing both kanji and kana 3162+<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> 3163+ 3164+# Testing of Chinese segmentation (taken from a Chinese news article) 3165+<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data> 3166 3167 # Words with interior formatting characters 3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data> 3169@@ -169,6 
+185,8 @@ 3170 # to test for bug #4097779 3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> 3172 3173+# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts 3174+<data>•ISN'T<200> •19<100>日<400></data> 3175 3176 # to test for bug #4098467 3177 # What follows is a string of Korean characters (I found it in the Yellow Pages 3178@@ -178,9 +196,15 @@ 3179 # precomposed syllables... 3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> 3181 3182-<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data> 3183+# more Korean tests (Jamo not tested here, not counted as dictionary characters) 3184+# Disable them now because we don't include a Korean dictionary. 3185+#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> 3186+#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> 3187+ 3188+<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data> 3189+ 3190+<data>•\u06c9<200>\uc799<200>\ufffa•</data> 3191 3192-<data>•\u06c9\uc799\ufffa<200></data> 3193 3194 # 3195 # Try some words from other scripts. 3196@@ -491,8 +515,7 @@ 3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data> 3198 3199 # conjoining jamo... 
3200-# TODO: rules update needed 3201-#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data> 3202+<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data> 3203 3204 # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd 3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> 3206--- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0800 3207+++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0800 3208@@ -28,7 +28,7 @@ 3209 LocaleScript:alias { "/ICUDATA/ja/LocaleScript" } 3210 3211 // aliasing using position 3212- boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle 3213+ boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle 3214 3215 // aliasing arrays 3216 zoneTests { 3217--- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700 3218+++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800 3219@@ -1,6 +1,6 @@ 3220 /* 3221 ********************************************************************** 3222-* Copyright (C) 2002-2009, International Business Machines 3223+* Copyright (C) 2002-2010, International Business Machines 3224 * Corporation and others. All Rights Reserved. 
3225 ********************************************************************** 3226 * 3227@@ -34,12 +34,15 @@ 3228 #include "unicode/udata.h" 3229 #include "unicode/putil.h" 3230 3231+//#include "unicode/ustdio.h" 3232+ 3233 #include "uoptions.h" 3234 #include "unewdata.h" 3235 #include "ucmndata.h" 3236 #include "rbbidata.h" 3237 #include "triedict.h" 3238 #include "cmemory.h" 3239+#include "uassert.h" 3240 3241 #include <stdio.h> 3242 #include <stdlib.h> 3243@@ -199,147 +202,191 @@ 3244 long wordFileSize; 3245 FILE *file; 3246 char *wordBufferC; 3247- 3248+ MutableTrieDictionary *mtd = NULL; 3249+ 3250 file = fopen(wordFileName, "rb"); 3251- if( file == 0 ) { 3252- fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); 3253- exit(-1); 3254- } 3255- fseek(file, 0, SEEK_END); 3256- wordFileSize = ftell(file); 3257- fseek(file, 0, SEEK_SET); 3258- wordBufferC = new char[wordFileSize+10]; 3259- 3260- result = (long)fread(wordBufferC, 1, wordFileSize, file); 3261- if (result != wordFileSize) { 3262- fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); 3263- exit (-1); 3264- } 3265- wordBufferC[wordFileSize]=0; 3266- fclose(file); 3267- 3268- // 3269- // Look for a Unicode Signature (BOM) on the word file 3270- // 3271- int32_t signatureLength; 3272- const char * wordSourceC = wordBufferC; 3273- const char* encoding = ucnv_detectUnicodeSignature( 3274- wordSourceC, wordFileSize, &signatureLength, &status); 3275- if (U_FAILURE(status)) { 3276- exit(status); 3277- } 3278- if(encoding!=NULL ){ 3279- wordSourceC += signatureLength; 3280- wordFileSize -= signatureLength; 3281- } 3282- 3283- // 3284- // Open a converter to take the rule file to UTF-16 3285- // 3286- UConverter* conv; 3287- conv = ucnv_open(encoding, &status); 3288- if (U_FAILURE(status)) { 3289- fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 3290- exit(status); 3291- } 3292- 3293- // 3294- // Convert the words to UChar. 
3295- // Preflight first to determine required buffer size. 3296- // 3297- uint32_t destCap = ucnv_toUChars(conv, 3298- NULL, // dest, 3299- 0, // destCapacity, 3300- wordSourceC, 3301- wordFileSize, 3302- &status); 3303- if (status != U_BUFFER_OVERFLOW_ERROR) { 3304- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3305- exit(status); 3306- }; 3307- 3308- status = U_ZERO_ERROR; 3309- UChar *wordSourceU = new UChar[destCap+1]; 3310- ucnv_toUChars(conv, 3311- wordSourceU, // dest, 3312- destCap+1, 3313- wordSourceC, 3314- wordFileSize, 3315- &status); 3316- if (U_FAILURE(status)) { 3317- fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3318- exit(status); 3319- }; 3320- ucnv_close(conv); 3321- 3322- // Get rid of the original file buffer 3323- delete[] wordBufferC; 3324- 3325- // Create a MutableTrieDictionary, and loop through all the lines, inserting 3326- // words. 3327- 3328- // First, pick a median character. 3329- UChar *current = wordSourceU + (destCap/2); 3330- UChar uc = *current++; 3331- UnicodeSet breaks; 3332- breaks.add(0x000A); // Line Feed 3333- breaks.add(0x000D); // Carriage Return 3334- breaks.add(0x2028); // Line Separator 3335- breaks.add(0x2029); // Paragraph Separator 3336- 3337- do { 3338- // Look for line break 3339- while (uc && !breaks.contains(uc)) { 3340- uc = *current++; 3341- } 3342- // Now skip to first non-line-break 3343- while (uc && breaks.contains(uc)) { 3344- uc = *current++; 3345+ if( file == 0 ) { //cannot find file 3346+ //create 1-line dummy file: ie 1 char, 1 value 3347+ UNewDataMemory *pData; 3348+ char msg[1024]; 3349+ 3350+ /* write message with just the name */ 3351+ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); 3352+ fprintf(stderr, "%s\n", msg); 3353+ 3354+ UChar c = 0x0020; 3355+ mtd = new MutableTrieDictionary(c, status, TRUE); 3356+ mtd->addWord(&c, 1, status, 1); 3357+ 3358+ } else { //read words in from input file 3359+ 
fseek(file, 0, SEEK_END); 3360+ wordFileSize = ftell(file); 3361+ fseek(file, 0, SEEK_SET); 3362+ wordBufferC = new char[wordFileSize+10]; 3363+ 3364+ result = (long)fread(wordBufferC, 1, wordFileSize, file); 3365+ if (result != wordFileSize) { 3366+ fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); 3367+ exit (-1); 3368 } 3369- } 3370- while (uc && (breaks.contains(uc) || u_isspace(uc))); 3371- 3372- MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); 3373+ wordBufferC[wordFileSize]=0; 3374+ fclose(file); 3375 3376- if (U_FAILURE(status)) { 3377- fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 3378- exit(status); 3379- } 3380+ // 3381+ // Look for a Unicode Signature (BOM) on the word file 3382+ // 3383+ int32_t signatureLength; 3384+ const char * wordSourceC = wordBufferC; 3385+ const char* encoding = ucnv_detectUnicodeSignature( 3386+ wordSourceC, wordFileSize, &signatureLength, &status); 3387+ if (U_FAILURE(status)) { 3388+ exit(status); 3389+ } 3390+ if(encoding!=NULL ){ 3391+ wordSourceC += signatureLength; 3392+ wordFileSize -= signatureLength; 3393+ } 3394 3395- // Now add the words. Words are non-space characters at the beginning of 3396- // lines, and must be at least one UChar. 
3397- current = wordSourceU; 3398- UChar *candidate = current; 3399- uc = *current++; 3400- int32_t length = 0; 3401- 3402- while (uc) { 3403- while (uc && !u_isspace(uc)) { 3404- ++length; 3405- uc = *current++; 3406+ // 3407+ // Open a converter to take the rule file to UTF-16 3408+ // 3409+ UConverter* conv; 3410+ conv = ucnv_open(encoding, &status); 3411+ if (U_FAILURE(status)) { 3412+ fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 3413+ exit(status); 3414 } 3415- if (length > 0) { 3416- mtd->addWord(candidate, length, status); 3417- if (U_FAILURE(status)) { 3418- fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n", 3419- u_errorName(status)); 3420- exit(status); 3421+ 3422+ // 3423+ // Convert the words to UChar. 3424+ // Preflight first to determine required buffer size. 3425+ // 3426+ uint32_t destCap = ucnv_toUChars(conv, 3427+ NULL, // dest, 3428+ 0, // destCapacity, 3429+ wordSourceC, 3430+ wordFileSize, 3431+ &status); 3432+ if (status != U_BUFFER_OVERFLOW_ERROR) { 3433+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3434+ exit(status); 3435+ }; 3436+ 3437+ status = U_ZERO_ERROR; 3438+ UChar *wordSourceU = new UChar[destCap+1]; 3439+ ucnv_toUChars(conv, 3440+ wordSourceU, // dest, 3441+ destCap+1, 3442+ wordSourceC, 3443+ wordFileSize, 3444+ &status); 3445+ if (U_FAILURE(status)) { 3446+ fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3447+ exit(status); 3448+ }; 3449+ ucnv_close(conv); 3450+ 3451+ // Get rid of the original file buffer 3452+ delete[] wordBufferC; 3453+ 3454+ // Create a MutableTrieDictionary, and loop through all the lines, inserting 3455+ // words. 3456+ 3457+ // First, pick a median character. 
3458+ UChar *current = wordSourceU + (destCap/2); 3459+ UChar uc = *current++; 3460+ UnicodeSet breaks; 3461+ breaks.add(0x000A); // Line Feed 3462+ breaks.add(0x000D); // Carriage Return 3463+ breaks.add(0x2028); // Line Separator 3464+ breaks.add(0x2029); // Paragraph Separator 3465+ 3466+ do { 3467+ // Look for line break 3468+ while (uc && !breaks.contains(uc)) { 3469+ uc = *current++; 3470+ } 3471+ // Now skip to first non-line-break 3472+ while (uc && breaks.contains(uc)) { 3473+ uc = *current++; 3474 } 3475 } 3476- // Find beginning of next line 3477- while (uc && !breaks.contains(uc)) { 3478- uc = *current++; 3479+ while (uc && (breaks.contains(uc) || u_isspace(uc))); 3480+ 3481+ mtd = new MutableTrieDictionary(uc, status); 3482+ 3483+ if (U_FAILURE(status)) { 3484+ fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); 3485+ exit(status); 3486 } 3487- while (uc && breaks.contains(uc)) { 3488- uc = *current++; 3489+ 3490+ // Now add the words. Words are non-space characters at the beginning of 3491+ // lines, and must be at least one UChar. If a word has an associated value, 3492+ // the value should follow the word on the same line after a tab character. 
3493+ current = wordSourceU; 3494+ UChar *candidate = current; 3495+ uc = *current++; 3496+ int32_t length = 0; 3497+ int count = 0; 3498+ 3499+ while (uc) { 3500+ while (uc && !u_isspace(uc)) { 3501+ ++length; 3502+ uc = *current++; 3503+ } 3504+ 3505+ UnicodeString valueString; 3506+ UChar candidateValue; 3507+ if(uc == 0x0009){ //separator is a tab char, read in number after space 3508+ while (uc && u_isspace(uc)) { 3509+ uc = *current++; 3510+ } 3511+ while (uc && !u_isspace(uc)) { 3512+ valueString.append(uc); 3513+ uc = *current++; 3514+ } 3515+ } 3516+ 3517+ if (length > 0) { 3518+ count++; 3519+ if(valueString.length() > 0){ 3520+ mtd->setValued(TRUE); 3521+ 3522+ uint32_t value = 0; 3523+ char* s = new char[valueString.length()]; 3524+ valueString.extract(0,valueString.length(), s, valueString.length()); 3525+ int n = sscanf(s, "%ud", &value); 3526+ U_ASSERT(n == 1); 3527+ U_ASSERT(value >= 0); 3528+ mtd->addWord(candidate, length, status, (uint16_t)value); 3529+ delete[] s; 3530+ } else { 3531+ mtd->addWord(candidate, length, status); 3532+ } 3533+ 3534+ if (U_FAILURE(status)) { 3535+ fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", 3536+ u_errorName(status), count); 3537+ exit(status); 3538+ } 3539+ } 3540+ 3541+ // Find beginning of next line 3542+ while (uc && !breaks.contains(uc)) { 3543+ uc = *current++; 3544+ } 3545+ // Find next non-line-breaking character 3546+ while (uc && breaks.contains(uc)) { 3547+ uc = *current++; 3548+ } 3549+ candidate = current-1; 3550+ length = 0; 3551 } 3552- candidate = current-1; 3553- length = 0; 3554+ 3555+ // Get rid of the Unicode text buffer 3556+ delete[] wordSourceU; 3557 } 3558 3559- // Get rid of the Unicode text buffer 3560- delete[] wordSourceU; 3561- 3562 // Now, create a CompactTrieDictionary from the mutable dictionary 3563 CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); 3564 if (U_FAILURE(status)) { 3565@@ -393,4 +440,3 @@ 3566 3567 
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 3568 } 3569- 3570--- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800 3571+++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800 3572@@ -23,13 +23,13 @@ 3573 ## Extra files to remove for 'make clean' 3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES) 3575 3576-## Target information 3577+## Target information 3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) 3579 3580 ifneq ($(top_builddir),$(top_srcdir)) 3581 CPPFLAGS += -I$(top_builddir)/common 3582 endif 3583-CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil 3584+CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n 3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) 3586 3587 OBJECTS = genctd.o 3588