• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1--- source/common/brkeng.cpp	2009-11-11 07:47:22.000000000 -0800
2+++ source/common/brkeng.cpp	2011-01-21 14:12:45.479922000 -0800
3@@ -226,6 +226,30 @@
4             case USCRIPT_THAI:
5                 engine = new ThaiBreakEngine(dict, status);
6                 break;
7+
8+            case USCRIPT_HANGUL:
9+                engine = new CjkBreakEngine(dict, kKorean, status);
10+                break;
11+
12+            // use same BreakEngine and dictionary for both Chinese and Japanese
13+            case USCRIPT_HIRAGANA:
14+            case USCRIPT_KATAKANA:
15+            case USCRIPT_HAN:
16+                engine = new CjkBreakEngine(dict, kChineseJapanese, status);
17+                break;
18+#if 0
19+            // TODO: Have to get some characters with script=common handled
20+            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
21+            // them to CjkBreakEngine does not work. The engine has to
22+            // special-case them.
23+            case USCRIPT_COMMON:
24+            {
25+                UBlockCode block = ublock_getCode(code);
26+                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
27+                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
28+                break;
29+            }
30+#endif
31             default:
32                 break;
33             }
34@@ -281,6 +305,13 @@
35             dict = NULL;
36         }
37         return dict;
38+    } else if (dictfname != NULL){
39+        //create dummy dict if dictionary filename not valid
40+        UChar c = 0x0020;
41+        status = U_ZERO_ERROR;
42+        MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);
43+        mtd->addWord(&c, 1, status, 1);
44+        return new CompactTrieDictionary(*mtd, status);
45     }
46     return NULL;
47 }
48--- source/common/dictbe.cpp	2008-06-13 12:21:12.000000000 -0700
49+++ source/common/dictbe.cpp	2011-01-21 14:12:45.468928000 -0800
50@@ -16,6 +16,9 @@
51 #include "unicode/ubrk.h"
52 #include "uvector.h"
53 #include "triedict.h"
54+#include "uassert.h"
55+#include "unicode/normlzr.h"
56+#include "cmemory.h"
57
58 U_NAMESPACE_BEGIN
59
60@@ -422,6 +425,294 @@
61     return wordsFound;
62 }
63
64+/*
65+ ******************************************************************
66+ * CjkBreakEngine
67+ */
68+static const uint32_t kuint32max = 0xFFFFFFFF;
69+CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)
70+: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
71+    // A missing or value-less dictionary cannot supply the per-word costs.
72+    if (adoptDictionary == NULL || !adoptDictionary->getValued()) {
73+        status = U_ILLEGAL_ARGUMENT_ERROR;
74+        return;
75+    }
76+    // Korean dictionary only includes Hangul syllables
77+    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
78+    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
79+    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
80+    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
81+    if (U_SUCCESS(status)) {
82+        // handle Korean and Japanese/Chinese using different dictionaries
83+        if (type == kKorean) {
84+            setCharacters(fHangulWordSet);
85+        } else { //Chinese and Japanese
86+            UnicodeSet cjSet;
87+            cjSet.addAll(fHanWordSet);
88+            cjSet.addAll(fKatakanaWordSet);
89+            cjSet.addAll(fHiraganaWordSet);
90+            cjSet.add(0xFF70);  // added as code points, not as a string: add(UnicodeString)
91+            cjSet.add(0x30FC);  // would insert one literal multi-character string instead
92+            setCharacters(cjSet);
93+        }
94+    }
95+}
96+
97+CjkBreakEngine::~CjkBreakEngine(){
98+    delete fDictionary;  // the dictionary passed to the constructor as adoptDictionary
99+}
100+
101+// The katakanaCost values below are based on the length frequencies of all
102+// katakana phrases in the dictionary
103+static const int kMaxKatakanaLength = 8;
104+static const int kMaxKatakanaGroupLength = 20;
105+static const uint32_t maxSnlp = 255;
106+// Cost of taking a run of wordLength Katakana characters as one word. Lengths
107+// outside 0..kMaxKatakanaLength (negative ones included) get the maximum cost, 8192.
108+static inline uint32_t getKatakanaCost(int wordLength){
109+    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]  //TODO: fill array with actual values from dictionary!
110+                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
111+    return (wordLength < 0 || wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
112+}
113+// True for U+30A1..U+30FE except U+30FB, and for U+FF66..U+FF9F.
114+static inline bool isKatakana(uint32_t value) {  // 32 bits: callers pass a UChar32, which must not be truncated
115+    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
116+            (value >= 0xFF66u && value <= 0xFF9fu);
117+}
118+
119+// A very simple helper class to streamline the buffer handling in
120+// divideUpDictionaryRange: N elements live inline, larger sizes use uprv_malloc.
121+// NOTE(review): a failed uprv_malloc is not reported; callers index buffer unchecked.
122+template<class T, size_t N>
123+class AutoBuffer {
124+ public:
125+  // Provides room for at least size elements.
126+  AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
127+    if (size > N) {
128+      buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
129+      capacity = size;
130+    }
131+  }
132+  ~AutoBuffer() {
133+    if (buffer != stackBuffer)
134+      uprv_free(buffer);
135+  }
136+  T* elems() {
137+    return buffer;
138+  }
139+  const T& operator[] (size_t i) const {
140+    return buffer[i];
141+  }
142+  T& operator[] (size_t i) {
143+    return buffer[i];
144+  }
145+
146+  // resize without copy
147+  void resize(size_t size) {
148+    if (size <= capacity)
149+      return;
150+    if (buffer != stackBuffer)
151+      uprv_free(buffer);
152+    buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
153+    capacity = size;
154+  }
155+ private:
156+  T stackBuffer[N];
157+  T* buffer;
158+  size_t capacity;
159+  AutoBuffer();
160+  // Not copyable: a copy would share (and free twice) the heap block, or point into the other object's stackBuffer.
161+  AutoBuffer(const AutoBuffer&);
162+  AutoBuffer& operator=(const AutoBuffer&);
163+};
164+
165+
166+/* Find the lowest-cost segmentation of a range of dictionary characters.
167+ * @param text A UText representing the text
168+ * @param rangeStart The start of the range of dictionary characters
169+ * @param rangeEnd The end of the range of dictionary characters
170+ * @param foundBreaks Stack that receives the break positions (offsets in text)
171+ * @return The number of break positions pushed onto foundBreaks
172+ */
173+int32_t
174+CjkBreakEngine::divideUpDictionaryRange( UText *text,
175+        int32_t rangeStart,
176+        int32_t rangeEnd,
177+        UStack &foundBreaks ) const {
178+    if (rangeStart >= rangeEnd) {
179+        return 0;
180+    }
181+
182+    const size_t defaultInputLength = 80;
183+    size_t inputLength = rangeEnd - rangeStart;
184+    AutoBuffer<UChar, defaultInputLength> charString(inputLength);
185+
186+    // Normalize the input string and put it in normalizedText.
187+    // The map from the indices of the normalized input to the raw
188+    // input is kept in charPositions.
189+    UErrorCode status = U_ZERO_ERROR;
190+    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
191+    if (U_FAILURE(status))
192+        return 0;
193+
194+    UnicodeString inputString(charString.elems(), inputLength);
195+    UNormalizationMode norm_mode = UNORM_NFKC;
196+    UBool isNormalized =
197+        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
198+        Normalizer::isNormalized(inputString, norm_mode, status);
199+
200+    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
201+    int numChars = 0;
202+    UText normalizedText = UTEXT_INITIALIZER;
203+    // Needs to be declared here because normalizedText holds onto its buffer.
204+    UnicodeString normalizedString;
205+    if (isNormalized) {
206+        int32_t index = 0;
207+        charPositions[0] = 0;
208+        while(index < inputString.length()) {
209+            index = inputString.moveIndex32(index, 1);
210+            charPositions[++numChars] = index;
211+        }
212+        utext_openUnicodeString(&normalizedText, &inputString, &status);
213+    }
214+    else {
215+        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
216+        if (U_FAILURE(status))
217+            return 0;
218+        charPositions.resize(normalizedString.length() + 1);
219+        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
220+        int32_t index = 0;
221+        charPositions[0] = 0;
222+        while(index < normalizer.endIndex()){
223+            UChar32 uc = normalizer.next();  // value unused; only the advanced index is recorded
224+            charPositions[++numChars] = index = normalizer.getIndex();
225+        }
226+        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
227+    }
228+
229+    if (U_FAILURE(status))
230+        return 0;
231+
232+    // From this point on, all the indices refer to the indices of
233+    // the normalized input string.
234+
235+    // bestSnlp[i] is the snlp of the best segmentation of the first i
236+    // characters in the range to be matched.
237+    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
238+    bestSnlp[0] = 0;
239+    for(int i=1; i<=numChars; i++){
240+        bestSnlp[i] = kuint32max;
241+    }
242+
243+    // prev[i] is the index of the last CJK character in the previous word in
244+    // the best segmentation of the first i characters.
245+    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
246+    for(int i=0; i<=numChars; i++){
247+        prev[i] = -1;
248+    }
249+
250+    const size_t maxWordSize = 20;
251+    AutoBuffer<uint16_t, maxWordSize> values(numChars);
252+    AutoBuffer<int32_t, maxWordSize> lengths(numChars);
253+
254+    // Dynamic programming to find the best segmentation.
255+    bool is_prev_katakana = false;
256+    for (int i = 0; i < numChars; ++i) {
257+        // NOTE(review): i counts code points but is used as a native (UTF-16) index; confirm for non-BMP text.
258+        utext_setNativeIndex(&normalizedText, i);
259+        if (bestSnlp[i] == kuint32max)
260+            continue;
261+
262+        int count;
263+        // limit maximum word length matched to size of current substring
264+        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i;
265+
266+        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
267+
268+        // if there are no single character matches found in the dictionary
269+        // starting with this character, treat character as a 1-character word
270+        // with the highest value possible, i.e. the least likely to occur.
271+        // Exclude Korean characters from this treatment, as they should be left
272+        // together by default.
273+        if((count == 0 || lengths[0] != 1) &&
274+                !fHangulWordSet.contains(utext_current32(&normalizedText))){
275+            values[count] = maxSnlp;
276+            lengths[count++] = 1;
277+        }
278+
279+        for (int j = 0; j < count; j++){
280+            //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
281+            uint32_t newSnlp = bestSnlp[i] + values[j];
282+            if (newSnlp < bestSnlp[lengths[j] + i]) {
283+                bestSnlp[lengths[j] + i] = newSnlp;
284+                prev[lengths[j] + i] = i;
285+            }
286+        }
287+
288+        // In Japanese,
289+        // Katakana word in single character is pretty rare. So we apply
290+        // the following heuristic to Katakana: any continuous run of Katakana
291+        // characters is considered a candidate word with a default cost
292+        // specified in the katakanaCost table according to its length.
293+        // Go back to position i before scanning the Katakana run.
294+        utext_setNativeIndex(&normalizedText, i);
295+        bool is_katakana = isKatakana(utext_current32(&normalizedText));
296+        if (!is_prev_katakana && is_katakana) {
297+            int j = i + 1;
298+            utext_next32(&normalizedText);
299+            // Find the end of the continuous run of Katakana characters
300+            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
301+                    isKatakana(utext_current32(&normalizedText))) {
302+                utext_next32(&normalizedText);
303+                ++j;
304+            }
305+            if ((j - i) < kMaxKatakanaGroupLength) {
306+                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
307+                if (newSnlp < bestSnlp[j]) {
308+                    bestSnlp[j] = newSnlp;
309+                    prev[j] = i;
310+                }
311+            }
312+        }
313+        is_prev_katakana = is_katakana;
314+    }
315+
316+    // Collect the boundaries of the best segmentation in t_boundary (t for
317+    // tentative) by walking prev[] back from numChars, so they are stored in
318+    // reverse order: t_boundary[0] = numChars. The push loop at the end of
319+    // this function reads them back to front to emit them in text order.
320+    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
321+
322+    int numBreaks = 0;
323+    // No segmentation found, set boundary to end of range
324+    if (bestSnlp[numChars] == kuint32max) {
325+        t_boundary[numBreaks++] = numChars;
326+    } else {
327+        for (int i = numChars; i > 0; i = prev[i]){
328+            t_boundary[numBreaks++] = i;
329+
330+        }
331+        U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
332+    }
333+
334+    // Also record the start of the range (index 0) as a boundary.
335+    // Don't add a break for the start of the dictionary range if there is one
336+    // there already.
337+    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
338+        t_boundary[numBreaks++] = 0;
339+    }
340+
341+    // Now that we're done, convert positions in t_boundary[] (indices in
342+    // the normalized input string) back to indices in the raw input string
343+    // while reversing t_boundary and pushing values to foundBreaks.
344+    for (int i = numBreaks-1; i >= 0; i--) {
345+        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
346+    }
347+
348+    utext_close(&normalizedText);
349+    return numBreaks;
350+}
351+
352 U_NAMESPACE_END
353
354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
355--- source/common/dictbe.h	2006-09-29 17:37:45.000000000 -0700
356+++ source/common/dictbe.h	2011-01-21 14:12:45.492920000 -0800
357@@ -1,8 +1,8 @@
358 /**
359- *******************************************************************************
360- * Copyright (C) 2006, International Business Machines Corporation and others. *
361- * All Rights Reserved.                                                        *
362- *******************************************************************************
363+ **********************************************************************************
364+ * Copyright (C) 2006-2010, International Business Machines Corporation and others.
365+ * All Rights Reserved.
366+ **********************************************************************************
367  */
368
369 #ifndef DICTBE_H
370@@ -65,31 +65,31 @@
371    */
372   virtual ~DictionaryBreakEngine();
373
374- /**
375-  * <p>Indicate whether this engine handles a particular character for
376-  * a particular kind of break.</p>
377-  *
378-  * @param c A character which begins a run that the engine might handle
379-  * @param breakType The type of text break which the caller wants to determine
380-  * @return TRUE if this engine handles the particular character and break
381-  * type.
382-  */
383+  /**
384+   * <p>Indicate whether this engine handles a particular character for
385+   * a particular kind of break.</p>
386+   *
387+   * @param c A character which begins a run that the engine might handle
388+   * @param breakType The type of text break which the caller wants to determine
389+   * @return TRUE if this engine handles the particular character and break
390+   * type.
391+   */
392   virtual UBool handles( UChar32 c, int32_t breakType ) const;
393
394- /**
395-  * <p>Find any breaks within a run in the supplied text.</p>
396-  *
397-  * @param text A UText representing the text. The
398-  * iterator is left at the end of the run of characters which the engine
399-  * is capable of handling.
400-  * @param startPos The start of the run within the supplied text.
401-  * @param endPos The end of the run within the supplied text.
402-  * @param reverse Whether the caller is looking for breaks in a reverse
403-  * direction.
404-  * @param breakType The type of break desired, or -1.
405-  * @param foundBreaks An allocated C array of the breaks found, if any
406-  * @return The number of breaks found.
407-  */
408+  /**
409+   * <p>Find any breaks within a run in the supplied text.</p>
410+   *
411+   * @param text A UText representing the text. The iterator is left at
412+   * the end of the run of characters which the engine is capable of handling
413+   * that starts from the first (or last) character in the range.
414+   * @param startPos The start of the run within the supplied text.
415+   * @param endPos The end of the run within the supplied text.
416+   * @param reverse Whether the caller is looking for breaks in a reverse
417+   * direction.
418+   * @param breakType The type of break desired, or -1.
419+   * @param foundBreaks An allocated C array of the breaks found, if any
420+   * @return The number of breaks found.
421+   */
422   virtual int32_t findBreaks( UText *text,
423                               int32_t startPos,
424                               int32_t endPos,
425@@ -114,7 +114,7 @@
426 //  virtual void setBreakTypes( uint32_t breakTypes );
427
428  /**
429-  * <p>Divide up a range of known dictionary characters.</p>
430+  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
431   *
432   * @param text A UText representing the text
433   * @param rangeStart The start of the range of dictionary characters
434@@ -171,7 +171,7 @@
435
436  protected:
437  /**
438-  * <p>Divide up a range of known dictionary characters.</p>
439+  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
440   *
441   * @param text A UText representing the text
442   * @param rangeStart The start of the range of dictionary characters
443@@ -186,6 +186,66 @@
444
445 };
446
447+/*******************************************************************
448+ * CjkBreakEngine
449+ */
450+
451+//indicates language/script that the CjkBreakEngine will handle
452+enum LanguageType {
453+    kKorean,            // engine handles the Hangul word set only
454+    kChineseJapanese    // engine handles the Han, Katakana and Hiragana word sets
455+};
456+
457+/**
458+ * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
459+ * TrieWordDictionary with costs associated with each word and
460+ * Viterbi decoding to determine CJK-specific breaks.</p>
461+ */
462+class CjkBreakEngine : public DictionaryBreakEngine {
463+ protected:
464+    /**
465+     * Character sets per script; the constructor picks which ones this engine handles
466+     * @internal
467+     */
468+  UnicodeSet                fHangulWordSet;
469+  UnicodeSet                fHanWordSet;
470+  UnicodeSet                fKatakanaWordSet;
471+  UnicodeSet                fHiraganaWordSet;
472+
473+  const TrieWordDictionary  *fDictionary;  // adopted; deleted in the destructor
474+
475+ public:
476+
477+    /**
478+     * <p>Constructor.</p>
479+     * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
480+     * engine is deleted. It must contain a cost (value) for each word.
481+     * @param type Selects the Korean or the Chinese/Japanese character set.
482+     * @param status Set to U_ILLEGAL_ARGUMENT_ERROR if the dictionary carries no values.
483+     */
484+  CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status);
485+
486+    /**
487+     * <p>Virtual destructor.</p>
488+     */
489+  virtual ~CjkBreakEngine();
490+
491+ protected:
492+    /**
493+     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
494+     *
495+     * @param text A UText representing the text
496+     * @param rangeStart The start of the range of dictionary characters
497+     * @param rangeEnd The end of the range of dictionary characters
498+     * @param foundBreaks Stack that receives the break positions found
499+     * @return The number of breaks found
500+     */
501+  virtual int32_t divideUpDictionaryRange( UText *text,
502+          int32_t rangeStart,
503+          int32_t rangeEnd,
504+          UStack &foundBreaks ) const;
505+
506+};
507
508 U_NAMESPACE_END
509
510--- source/common/rbbi.cpp	2010-07-22 17:15:37.000000000 -0700
511+++ source/common/rbbi.cpp	2011-01-21 14:12:45.457938000 -0800
512@@ -1555,10 +1555,12 @@
513                             int32_t endPos,
514                             UBool reverse) {
515     // Reset the old break cache first.
516-    uint32_t dictionaryCount = fDictionaryCharCount;
517     reset();
518
519-    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
520+    // note: code segment below assumes that dictionary chars are in the
521+    // startPos-endPos range
522+    // value returned should be next character in sequence
523+    if ((endPos - startPos) <= 1) {
524         return (reverse ? startPos : endPos);
525     }
526
527@@ -1711,7 +1713,7 @@
528             // proposed break by one of the breaks we found. Use following() and
529             // preceding() to do the work. They should never recurse in this case.
530             if (reverse) {
531-                return preceding(endPos - 1);
532+                return preceding(endPos);
533             }
534             else {
535                 return following(startPos);
536--- source/common/triedict.cpp	2008-02-13 01:35:50.000000000 -0800
537+++ source/common/triedict.cpp	2011-01-21 14:12:45.271006000 -0800
538@@ -20,6 +20,7 @@
539 #include "uvector.h"
540 #include "uvectr32.h"
541 #include "uarrsort.h"
542+#include "hash.h"
543
544 //#define DEBUG_TRIE_DICT 1
545
546@@ -27,6 +28,11 @@
547 #include <sys/times.h>
548 #include <limits.h>
549 #include <stdio.h>
550+#include <time.h>
551+#ifndef CLK_TCK
552+#define CLK_TCK      CLOCKS_PER_SEC
553+#endif
554+
555 #endif
556
557 U_NAMESPACE_BEGIN
558@@ -45,6 +51,11 @@
559  * MutableTrieDictionary
560  */
561
562+//#define MAX_VALUE 65535
563+
564+// forward declaration
565+inline uint16_t scaleLogProbabilities(double logprob);
566+
567 // Node structure for the ternary, uncompressed trie
568 struct TernaryNode : public UMemory {
569     UChar       ch;         // UTF-16 code unit
570@@ -77,7 +88,8 @@
571     delete high;
572 }
573
574-MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
575+MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
576+                                              UBool containsValue /* = FALSE */  ) {
577     // Start the trie off with something. Having the root node already present
578     // cuts a special case out of the search/insertion functions.
579     // Making it a median character cuts the worse case for searches from
580@@ -91,14 +103,19 @@
581     if (U_SUCCESS(status) && fIter == NULL) {
582         status = U_MEMORY_ALLOCATION_ERROR;
583     }
584+
585+    fValued = containsValue;
586 }
587
588-MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
589+MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
590+                                              UBool containsValue /* = false */ ) {
591     fTrie = NULL;
592     fIter = utext_openUChars(NULL, NULL, 0, &status);
593     if (U_SUCCESS(status) && fIter == NULL) {
594         status = U_MEMORY_ALLOCATION_ERROR;
595     }
596+
597+    fValued = containsValue;
598 }
599
600 MutableTrieDictionary::~MutableTrieDictionary() {
601@@ -108,12 +125,13 @@
602
603 int32_t
604 MutableTrieDictionary::search( UText *text,
605-                                   int32_t maxLength,
606-                                   int32_t *lengths,
607-                                   int &count,
608-                                   int limit,
609-                                   TernaryNode *&parent,
610-                                   UBool &pMatched ) const {
611+                               int32_t maxLength,
612+                               int32_t *lengths,
613+                               int &count,
614+                               int limit,
615+                               TernaryNode *&parent,
616+                               UBool &pMatched,
617+                               uint16_t *values /*=NULL*/) const {
618     // TODO: current implementation works in UTF-16 space
619     const TernaryNode *up = NULL;
620     const TernaryNode *p = fTrie;
621@@ -121,6 +139,10 @@
622     pMatched = TRUE;
623     int i;
624
625+    if (!fValued) {
626+        values = NULL;
627+    }
628+
629     UChar uc = utext_current32(text);
630     for (i = 0; i < maxLength && p != NULL; ++i) {
631         while (p != NULL) {
632@@ -141,7 +163,11 @@
633             break;
634         }
635         // Must be equal to get here
636-        if (limit > 0 && (p->flags & kEndsWord)) {
637+        if (limit > 0 && (p->flags > 0)) {
638+            //is there a more efficient way to add values? ie. remove if stmt
639+            if(values != NULL) {
640+                values[mycount] = p->flags;
641+            }
642             lengths[mycount++] = i+1;
643             --limit;
644         }
645@@ -161,13 +187,14 @@
646 void
647 MutableTrieDictionary::addWord( const UChar *word,
648                                 int32_t length,
649-                                UErrorCode &status ) {
650-#if 0
651-    if (length <= 0) {
652+                                UErrorCode &status,
653+                                uint16_t value /* = 0 */ ) {
654+    // dictionary cannot store zero values, would interfere with flags
655+    if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) {
656         status = U_ILLEGAL_ARGUMENT_ERROR;
657         return;
658     }
659-#endif
660+
661     TernaryNode *parent;
662     UBool pMatched;
663     int count;
664@@ -177,7 +204,7 @@
665     matched = search(fIter, length, NULL, count, 0, parent, pMatched);
666
667     while (matched++ < length) {
668-        UChar32 uc = utext_next32(fIter);  // TODO:  supplemetary support?
669+        UChar32 uc = utext_next32(fIter);  // TODO:  supplementary support?
670         U_ASSERT(uc != U_SENTINEL);
671         TernaryNode *newNode = new TernaryNode(uc);
672         if (newNode == NULL) {
673@@ -199,30 +226,23 @@
674         parent = newNode;
675     }
676
677-    parent->flags |= kEndsWord;
678-}
679-
680-#if 0
681-void
682-MutableTrieDictionary::addWords( UEnumeration *words,
683-                                  UErrorCode &status ) {
684-    int32_t length;
685-    const UChar *word;
686-    while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
687-        addWord(word, length, status);
688+    if(fValued && value > 0){
689+        parent->flags = value;
690+    } else {
691+        parent->flags |= kEndsWord;
692     }
693 }
694-#endif
695
696 int32_t
697 MutableTrieDictionary::matches( UText *text,
698                                 int32_t maxLength,
699                                 int32_t *lengths,
700                                 int &count,
701-                                int limit ) const {
702+                                int limit,
703+                                uint16_t *values /*=NULL*/) const {
704     TernaryNode *parent;
705     UBool pMatched;
706-    return search(text, maxLength, lengths, count, limit, parent, pMatched);
707+    return search(text, maxLength, lengths, count, limit, parent, pMatched, values);
708 }
709
710 // Implementation of iteration for MutableTrieDictionary
711@@ -277,7 +297,7 @@
712                     break;
713                 }
714             case kEqual:
715-                emit = (node->flags & kEndsWord) != 0;
716+                emit = node->flags > 0;
717                 equal = (node->equal != NULL);
718                 // If this node should be part of the next emitted string, append
719                 // the UChar to the string, and make sure we pop it when we come
720@@ -299,7 +319,7 @@
721                 }
722             case kGreaterThan:
723                 // If this node's character is in the string, remove it.
724-                if (node->equal != NULL || (node->flags & kEndsWord)) {
725+                if (node->equal != NULL || node->flags > 0) {
726                     unistr.truncate(unistr.length()-1);
727                 }
728                 if (node->high != NULL) {
729@@ -354,12 +374,75 @@
730  * CompactTrieDictionary
731  */
732
733+//TODO further optimization:
734+// minimise size of trie with logprobs by storing values
735+// for terminal nodes directly in offsets[]
736+// --> calculating from next offset *might* be simpler, but would have to add
737+// one last offset for logprob of last node
738+// --> if calculate from current offset, need to factor in possible overflow
739+// as well.
740+// idea: store in offset, set first bit to indicate logprob storage-->won't
741+// have to access additional node
742+
743+// {'Dic', 1}, version 1: uses old header, no values
744+#define COMPACT_TRIE_MAGIC_1 0x44696301
745+// version 2: uses new header (more than 2^16 nodes), no values
746+#define COMPACT_TRIE_MAGIC_2 0x44696302
747+// version 3: uses new header, includes values
748+#define COMPACT_TRIE_MAGIC_3 0x44696303
749+
750 struct CompactTrieHeader {
751     uint32_t        size;           // Size of the data in bytes
752     uint32_t        magic;          // Magic number (including version)
753+    uint32_t        nodeCount;      // Number of entries in offsets[]
754+    uint32_t        root;           // Node number of the root node
755+    uint32_t        offsets[1];     // Offsets to nodes from start of data
756+};
757+
758+// old version of CompactTrieHeader kept for backwards compatibility
759+struct CompactTrieHeaderV1 {
760+    uint32_t        size;           // Size of the data in bytes
761+    uint32_t        magic;          // Magic number (including version)
762     uint16_t        nodeCount;      // Number of entries in offsets[]
763     uint16_t        root;           // Node number of the root node
764-    uint32_t        offsets[1];      // Offsets to nodes from start of data
765+    uint32_t        offsets[1];     // Offsets to nodes from start of data
766+};
767+
768+// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1
769+struct CompactTrieInfo {
770+    uint32_t        size;           // Size of the data in bytes
771+    uint32_t        magic;          // Magic number (including version)
772+    uint32_t        nodeCount;      // Number of entries in offsets[]
773+    uint32_t        root;           // Node number of the root node
774+    uint32_t        *offsets;       // Offsets to nodes from start of data
775+    uint8_t         *address;       // pointer to header bytes in memory
776+
777+    // Reads either header layout. If data is NULL or its magic is unknown, status is set and every field stays zero/NULL.
778+    CompactTrieInfo(const void *data, UErrorCode &status)
779+            : size(0), magic(0), nodeCount(0), root(0), offsets(NULL), address(NULL) {
780+        CompactTrieHeader *header = (CompactTrieHeader *) data;
781+        if (header == NULL || (header->magic != COMPACT_TRIE_MAGIC_1 &&
782+                header->magic != COMPACT_TRIE_MAGIC_2 &&
783+                header->magic != COMPACT_TRIE_MAGIC_3)) {
784+            status = U_ILLEGAL_ARGUMENT_ERROR;
785+            return;
786+        }
787+        size = header->size;
788+        magic = header->magic;
789+        address = (uint8_t *)header;  // same start address for both layouts
790+        if (magic == COMPACT_TRIE_MAGIC_1) {  // old header: 16-bit nodeCount and root
791+            CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header;
792+            nodeCount = headerV1->nodeCount;
793+            root = headerV1->root;
794+            offsets = &(headerV1->offsets[0]);
795+        } else {
796+            nodeCount = header->nodeCount;
797+            root = header->root;
798+            offsets = &(header->offsets[0]);
799+        }
800+    }
801+
802+    ~CompactTrieInfo(){}
803 };
804
805 // Note that to avoid platform-specific alignment issues, all members of the node
806@@ -375,10 +458,14 @@
807 enum CompactTrieNodeFlags {
808     kVerticalNode   = 0x1000,       // This is a vertical node
809     kParentEndsWord = 0x2000,       // The node whose equal link points to this ends a word
810-    kReservedFlag1  = 0x4000,
811-    kReservedFlag2  = 0x8000,
812+    kExceedsCount   = 0x4000,       // new MSB for count >= 4096, originally kReservedFlag1
813+    kEqualOverflows = 0x8000,       // Links to nodeIDs > 2^16, orig. kReservedFlag2
814     kCountMask      = 0x0FFF,       // The count portion of flagscount
815-    kFlagMask       = 0xF000        // The flags portion of flagscount
816+    kFlagMask       = 0xF000,       // The flags portion of flagscount
817+    kRootCountMask  = 0x7FFF        // The count portion of flagscount in the root node
818+
819+    //offset flags:
820+    //kOffsetContainsValue = 0x80000000       // Offset contains value for parent node
821 };
822
823 // The two node types are distinguished by the kVerticalNode flag.
824@@ -402,63 +489,177 @@
825     uint16_t        chars[1];       // Code units
826 };
827
828-// {'Dic', 1}, version 1
829-#define COMPACT_TRIE_MAGIC_1 0x44696301
830-
831 CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
832                                                 UErrorCode &status )
833 : fUData(dataObj)
834 {
835-    fData = (const CompactTrieHeader *) udata_getMemory(dataObj);
836+    fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); // NOTE(review): the result is not checked for NULL before it is dereferenced below
837+    *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); // status becomes U_ILLEGAL_ARGUMENT_ERROR for an unknown magic; fInfo stays allocated
838     fOwnData = FALSE;
839-    if (fData->magic != COMPACT_TRIE_MAGIC_1) {
840-        status = U_ILLEGAL_ARGUMENT_ERROR;
841-        fData = NULL;
842-    }
843 }
844+
845 CompactTrieDictionary::CompactTrieDictionary( const void *data,
846                                                 UErrorCode &status )
847 : fUData(NULL)
848 {
849-    fData = (const CompactTrieHeader *) data;
850+    fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); // NOTE(review): the result is not checked for NULL before it is dereferenced below
851+    *fInfo = CompactTrieInfo(data, status); // status becomes U_ILLEGAL_ARGUMENT_ERROR for an unknown magic; fInfo stays allocated
852     fOwnData = FALSE;
853-    if (fData->magic != COMPACT_TRIE_MAGIC_1) {
854-        status = U_ILLEGAL_ARGUMENT_ERROR;
855-        fData = NULL;
856-    }
857 }
858
859 CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
860                                                 UErrorCode &status )
861 : fUData(NULL)
862 {
863-    fData = compactMutableTrieDictionary(dict, status);
864+    const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status);
865+    fInfo = NULL;   // the destructor frees fInfo unconditionally, so it must not stay indeterminate when compaction fails
866+    if (U_SUCCESS(status)) {
867+        fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); // NOTE(review): the result is not checked for NULL
868+        *fInfo = CompactTrieInfo(header, status); // wraps the freshly compacted data; its address is what the destructor frees
869+    }
870     fOwnData = !U_FAILURE(status);
871 }
872
873 CompactTrieDictionary::~CompactTrieDictionary() {
874     if (fOwnData) {
875-        uprv_free((void *)fData);
876+        uprv_free((void *)(fInfo->address)); // the data block itself, freed only when fOwnData is set
877     }
878+    uprv_free((void *)fInfo); // the CompactTrieInfo wrapper obtained with uprv_malloc in the constructors
879+
880     if (fUData) {
881         udata_close(fUData);
882     }
883 }
884
885+UBool CompactTrieDictionary::getValued() const{ // reports whether the loaded data carries the COMPACT_TRIE_MAGIC_3 magic number
886+    return fInfo->magic == COMPACT_TRIE_MAGIC_3;
887+}
888+
889 uint32_t
890 CompactTrieDictionary::dataSize() const {
891-    return fData->size;
892+    return fInfo->size;
893 }
894
895 const void *
896 CompactTrieDictionary::data() const {
897-    return fData;
898+    return fInfo->address;
899+}
900+
901+//This function finds the address of a node for us, given its node ID and the trie's CompactTrieInfo
902+static inline const CompactTrieNode *
903+getCompactNode(const CompactTrieInfo *info, uint32_t node) {
904+    if(node < info->root-1) { // IDs below root-1: presumably the node payload is kept in the offsets[] slot itself -- TODO confirm
905+        return (const CompactTrieNode *)(&info->offsets[node]); // address of the slot, not of the data it would point to
906+    } else {
907+        return (const CompactTrieNode *)(info->address + info->offsets[node]); // offsets[node] is a byte offset from the start of the data
908+    }
909 }
910
911-// This function finds the address of a node for us, given its node ID
912+//this version of getCompactNode reads root/offsets directly from a CompactTrieHeader; it is currently only used in compactMutableTrieDictionary()
913 static inline const CompactTrieNode *
914-getCompactNode(const CompactTrieHeader *header, uint16_t node) {
915-    return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
916+getCompactNode(const CompactTrieHeader *header, uint32_t node) {
917+    if(node < header->root-1) { // IDs below root-1 resolve to the offsets[] slot itself
918+        return (const CompactTrieNode *)(&header->offsets[node]);
919+    } else {
920+        return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); // byte offset from the start of the header
921+    }
922+}
923+
924+
925+/**
926+ * Returns the count portion (flagscount & kCountMask) of a node's flagscount.
927+ * @param node The specified node
928+ */
929+static inline const uint16_t
930+getCount(const CompactTrieNode *node){
931+    return (node->flagscount & kCountMask);
932+    //switch to the line below if a count ever reaches 4096; it folds the kExceedsCount flag in as bit 12
933+    //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2);
934+}
935+
936+/**
937+ * Calculates the equal link node ID of a vertical node.
938+ * @param vnode The vertical node containing the equal link
939+ * @return vnode->equal; when kEqualOverflows is set, the uint16_t stored right
940+ *         after chars[] is added as bits 16 and up of the node ID
941+ */
942+static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){
943+    if(vnode->flagscount & kEqualOverflows){
944+        // treat overflow bits as an extension of chars[]
945+        uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)];
946+        return vnode->equal + (((uint32_t)*overflow) << 16);
947+    }else{
948+        return vnode->equal;
949+    }
950+}
951+
952+/**
953+ * Calculates the equal link node ID of one entry of a horizontal node.
954+ * @param hnode The horizontal node containing the equal link
955+ * @param index The index into hnode->entries[]
956+ * @param nodeCount The length of hnode->entries[]
957+ */
958+static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){
959+    if(hnode->flagscount & kEqualOverflows){
960+        //set overflow to point to the uint16_t containing the overflow bits
961+        uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount];
962+        overflow += index/4; // each uint16_t holds the 4-bit overflow fields of four consecutive entries
963+        uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; // entry with index % 4 == 0 sits in the top nibble; keep 4 bits
964+        return hnode->entries[index].equal + (((uint32_t)extraBits) << 16);
965+    } else {
966+        return hnode->entries[index].equal;
967+    }
968+}
969+
970+/**
971+ * Returns the value stored in the specified horizontal node which is associated
972+ * with its parent node.
973+ * TODO: how to tell that value is stored in node or in offset? check whether
974+ * node ID < fInfo->root!
975+ */
976+static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){
977+    uint16_t count = getCount((CompactTrieNode *)hnode);
978+    uint16_t overflowSize = 0; //size of node ID overflow storage in bytes
979+
980+    if(hnode->flagscount & kEqualOverflows)
981+        overflowSize = (count + 3) / 4 * sizeof(uint16_t); // one uint16_t per four entries, rounded up
982+    return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); // the value is read from just past entries[] and the overflow words
983+}
984+
985+static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){
986+    // the value is read from just past chars[]; skip the single uint16_t of node ID overflow storage when kEqualOverflows is set
987+    uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0;
988+    return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize));
989+}
990+
991+static inline uint16_t getValue(const CompactTrieNode *node){ // dispatches on kVerticalNode to the overload for that node layout
992+    if(node->flagscount & kVerticalNode)
993+        return getValue((const CompactTrieVerticalNode *)node);
994+    else
995+        return getValue((const CompactTrieHorizontalNode *)node);
996+}
997+
998+//returns the index in entries[] whose ch equals uc, or -1 if there is none; binary search, so it assumes entries[] is sorted ascending by ch
999+inline int16_t
1000+searchHorizontalEntries(const CompactTrieHorizontalEntry *entries,
1001+        UChar uc, uint16_t nodeCount){
1002+    int low = 0;
1003+    int high = nodeCount-1;
1004+    int middle;
1005+    while (high >= low) {
1006+        middle = (high+low)/2;
1007+        if (uc == entries[middle].ch) {
1008+            return middle;
1009+        }
1010+        else if (uc < entries[middle].ch) {
1011+            high = middle-1;
1012+        }
1013+        else {
1014+            low = middle+1;
1015+        }
1016+    }
1017+
1018+    return -1;
1019 }
1020
1021 int32_t
1022@@ -466,17 +667,38 @@
1023                                 int32_t maxLength,
1024                                 int32_t *lengths,
1025                                 int &count,
1026-                                int limit ) const {
1027+                                int limit,
1028+                                uint16_t *values /*= NULL*/) const {
1029+    if (fInfo->magic == COMPACT_TRIE_MAGIC_2)
1030+        values = NULL;
1031+
1032     // TODO: current implementation works in UTF-16 space
1033-    const CompactTrieNode *node = getCompactNode(fData, fData->root);
1034+    const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root);
1035     int mycount = 0;
1036
1037     UChar uc = utext_current32(text);
1038     int i = 0;
1039
1040+    // handle root node with only kEqualOverflows flag: assume horizontal node without parent
1041+    if(node != NULL){
1042+        const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node;
1043+        int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask);
1044+        if(index > -1){
1045+            node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask));
1046+            utext_next32(text);
1047+            uc = utext_current32(text);
1048+            ++i;
1049+        }else{
1050+            node = NULL;
1051+        }
1052+    }
1053+
1054     while (node != NULL) {
1055         // Check if the node we just exited ends a word
1056         if (limit > 0 && (node->flagscount & kParentEndsWord)) {
1057+            if(values != NULL){
1058+                values[mycount] = getValue(node);
1059+            }
1060             lengths[mycount++] = i;
1061             --limit;
1062         }
1063@@ -487,7 +709,7 @@
1064             break;
1065         }
1066
1067-        int nodeCount = (node->flagscount & kCountMask);
1068+        int nodeCount = getCount(node);
1069         if (nodeCount == 0) {
1070             // Special terminal node; return now
1071             break;
1072@@ -507,35 +729,27 @@
1073             // To get here we must have come through the whole list successfully;
1074             // go on to the next node. Note that a word cannot end in the middle
1075             // of a vertical node.
1076-            node = getCompactNode(fData, vnode->equal);
1077+            node = getCompactNode(fInfo, calcEqualLink(vnode));
1078         }
1079         else {
1080             // Horizontal node; do binary search
1081             const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
1082-            int low = 0;
1083-            int high = nodeCount-1;
1084-            int middle;
1085-            node = NULL;    // If we don't find a match, we'll fall out of the loop
1086-            while (high >= low) {
1087-                middle = (high+low)/2;
1088-                if (uc == hnode->entries[middle].ch) {
1089-                    // We hit a match; get the next node and next character
1090-                    node = getCompactNode(fData, hnode->entries[middle].equal);
1091-                    utext_next32(text);
1092-                    uc = utext_current32(text);
1093-                    ++i;
1094-                    break;
1095-                }
1096-                else if (uc < hnode->entries[middle].ch) {
1097-                    high = middle-1;
1098-                }
1099-                else {
1100-                    low = middle+1;
1101-                }
1102+            const CompactTrieHorizontalEntry *entries;
1103+            entries = hnode->entries;
1104+
1105+            int index = searchHorizontalEntries(entries, uc, nodeCount);
1106+            if(index > -1){  //
1107+                // We hit a match; get the next node and next character
1108+                node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount));
1109+                utext_next32(text);
1110+                uc = utext_current32(text);
1111+                ++i;
1112+            }else{
1113+                node = NULL;    // If we don't find a match, we'll fall out of the loop
1114             }
1115         }
1116     }
1117-exit:
1118+    exit:
1119     count = mycount;
1120     return i;
1121 }
1122@@ -545,16 +759,16 @@
1123 private:
1124     UVector32               fNodeStack;     // Stack of nodes to process
1125     UVector32               fIndexStack;    // Stack of where in node we are
1126-    const CompactTrieHeader *fHeader;       // Trie data
1127+    const CompactTrieInfo   *fInfo;         // Trie data
1128
1129 public:
1130     static UClassID U_EXPORT2 getStaticClassID(void);
1131     virtual UClassID getDynamicClassID(void) const;
1132 public:
1133-    CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
1134+    CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status)
1135         : fNodeStack(status), fIndexStack(status) {
1136-        fHeader = header;
1137-        fNodeStack.push(header->root, status);
1138+        fInfo = info;
1139+        fNodeStack.push(info->root, status);
1140         fIndexStack.push(0, status);
1141         unistr.remove();
1142     }
1143@@ -564,14 +778,14 @@
1144
1145     virtual StringEnumeration *clone() const {
1146         UErrorCode status = U_ZERO_ERROR;
1147-        return new CompactTrieEnumeration(fHeader, status);
1148+        return new CompactTrieEnumeration(fInfo, status);
1149     }
1150
1151     virtual const UnicodeString * snext(UErrorCode &status);
1152
1153     // Very expensive, but this should never be used.
1154     virtual int32_t count(UErrorCode &status) const {
1155-        CompactTrieEnumeration counter(fHeader, status);
1156+        CompactTrieEnumeration counter(fInfo, status);
1157         int32_t result = 0;
1158         while (counter.snext(status) != NULL && U_SUCCESS(status)) {
1159             ++result;
1160@@ -582,7 +796,7 @@
1161     virtual void reset(UErrorCode &status) {
1162         fNodeStack.removeAllElements();
1163         fIndexStack.removeAllElements();
1164-        fNodeStack.push(fHeader->root, status);
1165+        fNodeStack.push(fInfo->root, status);
1166         fIndexStack.push(0, status);
1167         unistr.remove();
1168     }
1169@@ -595,26 +809,34 @@
1170     if (fNodeStack.empty() || U_FAILURE(status)) {
1171         return NULL;
1172     }
1173-    const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
1174+    const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki());
1175     int where = fIndexStack.peeki();
1176     while (!fNodeStack.empty() && U_SUCCESS(status)) {
1177-        int nodeCount = (node->flagscount & kCountMask);
1178+        int nodeCount;
1179+
1180+        bool isRoot = fNodeStack.peeki() == static_cast<int32_t>(fInfo->root);
1181+        if(isRoot){
1182+            nodeCount = node->flagscount & kRootCountMask;
1183+        } else {
1184+            nodeCount = getCount(node);
1185+        }
1186+
1187         UBool goingDown = FALSE;
1188         if (nodeCount == 0) {
1189             // Terminal node; go up immediately
1190             fNodeStack.popi();
1191             fIndexStack.popi();
1192-            node = getCompactNode(fHeader, fNodeStack.peeki());
1193+            node = getCompactNode(fInfo, fNodeStack.peeki());
1194             where = fIndexStack.peeki();
1195         }
1196-        else if (node->flagscount & kVerticalNode) {
1197+        else if ((node->flagscount & kVerticalNode) && !isRoot) {
1198             // Vertical node
1199             const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
1200             if (where == 0) {
1201                 // Going down
1202-                unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount);
1203+                unistr.append((const UChar *)vnode->chars, nodeCount);
1204                 fIndexStack.setElementAt(1, fIndexStack.size()-1);
1205-                node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status));
1206+                node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status));
1207                 where = fIndexStack.push(0, status);
1208                 goingDown = TRUE;
1209             }
1210@@ -623,7 +845,7 @@
1211                 unistr.truncate(unistr.length()-nodeCount);
1212                 fNodeStack.popi();
1213                 fIndexStack.popi();
1214-                node = getCompactNode(fHeader, fNodeStack.peeki());
1215+                node = getCompactNode(fInfo, fNodeStack.peeki());
1216                 where = fIndexStack.peeki();
1217             }
1218         }
1219@@ -638,7 +860,7 @@
1220                 // Push on next node
1221                 unistr.append((UChar)hnode->entries[where].ch);
1222                 fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
1223-                node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status));
1224+                node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status));
1225                 where = fIndexStack.push(0, status);
1226                 goingDown = TRUE;
1227             }
1228@@ -646,12 +868,14 @@
1229                 // Going up
1230                 fNodeStack.popi();
1231                 fIndexStack.popi();
1232-                node = getCompactNode(fHeader, fNodeStack.peeki());
1233+                node = getCompactNode(fInfo, fNodeStack.peeki());
1234                 where = fIndexStack.peeki();
1235             }
1236         }
1237+
1238         // Check if the parent of the node we've just gone down to ends a
1239         // word. If so, return it.
1240+        // The root node should never end up here.
1241         if (goingDown && (node->flagscount & kParentEndsWord)) {
1242             return &unistr;
1243         }
1244@@ -664,7 +888,7 @@
1245     if (U_FAILURE(status)) {
1246         return NULL;
1247     }
1248-    return new CompactTrieEnumeration(fData, status);
1249+    return new CompactTrieEnumeration(fInfo, status);
1250 }
1251
1252 //
1253@@ -672,21 +896,36 @@
1254 // and back again
1255 //
1256
1257-// Helper classes to construct the compact trie
1258+enum CompactTrieNodeType {
1259+    kHorizontalType = 0,
1260+    kVerticalType = 1,
1261+    kValueType = 2
1262+};
1263+
1264+/**
1265+ * The following classes (i.e. BuildCompactTrie*Node) are helper classes to
1266+ * construct the compact trie by storing information for each node and later
1267+ * writing the node to memory in a sequential format.
1268+ */
1269 class BuildCompactTrieNode: public UMemory {
1270- public:
1271+public:
1272     UBool           fParentEndsWord;
1273-    UBool           fVertical;
1274+    CompactTrieNodeType fNodeType;
1275     UBool           fHasDuplicate;
1276+    UBool           fEqualOverflows;
1277     int32_t         fNodeID;
1278     UnicodeString   fChars;
1279+    uint16_t        fValue;
1280
1281- public:
1282-    BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) {
1283+public:
1284+    BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType,
1285+            UStack &nodes, UErrorCode &status, uint16_t value = 0) {
1286         fParentEndsWord = parentEndsWord;
1287         fHasDuplicate = FALSE;
1288-        fVertical = vertical;
1289+        fNodeType = nodeType;
1290+        fEqualOverflows = FALSE;
1291         fNodeID = nodes.size();
1292+        fValue = parentEndsWord? value : 0;
1293         nodes.push(this, status);
1294     }
1295
1296@@ -694,87 +933,225 @@
1297     }
1298
1299     virtual uint32_t size() {
1300-        return sizeof(uint16_t);
1301+        if(fValue > 0)
1302+            return sizeof(uint16_t) * 2;
1303+        else
1304+            return sizeof(uint16_t);
1305     }
1306
1307     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
1308         // Write flag/count
1309-        *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)
1310-            | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 );
1311+
1312+        // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be
1313+        // used as a 5th MSB.
1314+        U_ASSERT(fChars.length() < 4096 || fNodeID == 2);
1315+
1316+        *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) |
1317+        ((fNodeID == 2)? (fChars.length() & kRootCountMask):
1318+            (
1319+                    (fChars.length() & kCountMask) |
1320+                    //((fChars.length() << 2) & kExceedsCount) |
1321+                    (fNodeType == kVerticalType ? kVerticalNode : 0) |
1322+                    (fParentEndsWord ? kParentEndsWord : 0 )
1323+            )
1324+        );
1325         offset += sizeof(uint16_t);
1326     }
1327+
1328+    virtual void writeValue(uint8_t *bytes, uint32_t &offset) {
1329+        if(fValue > 0){
1330+            *((uint16_t *)(bytes+offset)) = fValue;
1331+            offset += sizeof(uint16_t);
1332+        }
1333+    }
1334+
1335+};
1336+
1337+/**
1338+ * Stores value of parent terminating nodes that have no more subtries.
1339+ */
1340+class BuildCompactTrieValueNode: public BuildCompactTrieNode {
1341+public:
1342+    BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value)
1343+        : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){
1344+    }
1345+
1346+    virtual ~BuildCompactTrieValueNode(){
1347+    }
1348+
1349+    virtual uint32_t size() {
1350+        return sizeof(uint16_t) * 2; // flagscount plus value; NOTE(review): writeValue() emits nothing when fValue == 0, so write() then produces only one uint16_t
1351+    }
1352+
1353+    virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
1354+        // disabled alternative (next line): keep the value in the offset instead of in node memory; the active code writes flagscount and then the value to memory
1355+        //offset = fValue & kOffsetContainsValue;
1356+        BuildCompactTrieNode::write(bytes, offset, translate);
1357+        BuildCompactTrieNode::writeValue(bytes, offset);
1358+    }
1359 };
1360
1361 class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
1362  public:
1363     UStack          fLinks;
1364+    UBool           fMayOverflow; //intermediate value for fEqualOverflows
1365
1366  public:
1367-    BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
1368-        : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) {
1369+    BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
1370+    : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) {
1371+        fMayOverflow = FALSE;
1372     }
1373
1374     virtual ~BuildCompactTrieHorizontalNode() {
1375     }
1376
1377+    // It is impossible to know beforehand exactly how much space the node will
1378+    // need in memory before being written, because the node IDs in the equal
1379+    // links may or may not overflow after node coalescing. Therefore, this method
1380+    // returns the maximum size possible for the node.
1381     virtual uint32_t size() {
1382-        return offsetof(CompactTrieHorizontalNode,entries) +
1383-                (fChars.length()*sizeof(CompactTrieHorizontalEntry));
1384+        uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) +
1385+        (fChars.length()*sizeof(CompactTrieHorizontalEntry));
1386+
1387+        if(fValue > 0)
1388+            estimatedSize += sizeof(uint16_t);
1389+
1390+        //estimate extra space needed to store overflow for node ID links
1391+        //may be more than what is actually needed
1392+        for(int i=0; i < fChars.length(); i++){
1393+            if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){
1394+                fMayOverflow = TRUE;
1395+                break;
1396+            }
1397+        }
1398+        if(fMayOverflow) // write() appends one uint16_t per four links, rounded up: ceil(fChars.length()/4) * sizeof(uint16_t)
1399+            estimatedSize += ((fChars.length() + 3) / 4) * sizeof(uint16_t);
1400+
1401+        return estimatedSize;
1402     }
1403
1404     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
1405-        BuildCompactTrieNode::write(bytes, offset, translate);
1406         int32_t count = fChars.length();
1407+
1408+        //if largest nodeID > 2^16, set flag
1409+        //large node IDs are more likely to be at the back of the array
1410+        for (int32_t i = count-1; i >= 0; --i) {
1411+            if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){
1412+                fEqualOverflows = TRUE;
1413+                break;
1414+            }
1415+        }
1416+
1417+        BuildCompactTrieNode::write(bytes, offset, translate);
1418+
1419+        // write entries[] to memory
1420         for (int32_t i = 0; i < count; ++i) {
1421             CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
1422             entry->ch = fChars[i];
1423             entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1424 #ifdef DEBUG_TRIE_DICT
1425-            if (entry->equal == 0) {
1426+
1427+            if ((entry->equal == 0) && !fEqualOverflows) {
1428                 fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
1429                         i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1430             }
1431 #endif
1432             offset += sizeof(CompactTrieHorizontalEntry);
1433         }
1434+
1435+        // append extra bits of equal nodes to end if fEqualOverflows
1436+        if (fEqualOverflows) {
1437+            uint16_t leftmostBits = 0;
1438+            for (int16_t i = 0; i < count; i++) {
1439+                leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i);
1440+
1441+                // write filled uint16_t to memory
1442+                if(i % 4 == 3){
1443+                    *((uint16_t *)(bytes+offset)) = leftmostBits;
1444+                    leftmostBits = 0;
1445+                    offset += sizeof(uint16_t);
1446+                }
1447+            }
1448+
1449+            // pad last uint16_t with zeroes if necessary
1450+            int remainder = count % 4;
1451+            if (remainder > 0) {
1452+                *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder));
1453+                offset += sizeof(uint16_t);
1454+            }
1455+        }
1456+
1457+        BuildCompactTrieNode::writeValue(bytes, offset);
1458+    }
1459+
1460+    // returns bits 16 and up of the translated (physical) node ID of link i; the result is not masked to 4 bits, values above 0xF are only reported under DEBUG_TRIE_DICT
1461+    uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){
1462+        uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16);
1463+#ifdef DEBUG_TRIE_DICT
1464+        if (leftmostBits > 0xF) {
1465+            fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n",
1466+                    i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
1467+        }
1468+#endif
1469+        return leftmostBits;
1470     }
1471
1472     void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
1473         fChars.append(ch);
1474         fLinks.push(link, status);
1475     }
1476+
1477 };
1478
1479 class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
1480- public:
1481+public:
1482     BuildCompactTrieNode    *fEqual;
1483
1484- public:
1485-    BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
1486-        : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {
1487+public:
1488+    BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0)
1489+    : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) {
1490         fEqual = NULL;
1491     }
1492
1493     virtual ~BuildCompactTrieVerticalNode() {
1494     }
1495
1496+    // Returns the maximum possible size of this node. See comment in
1497+    // BuildCompactTrieHorizontal node for more information.
1498     virtual uint32_t size() {
1499-        return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
1500+        uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
1501+        if(fValue > 0){
1502+            estimatedSize += sizeof(uint16_t);
1503+        }
1504+
1505+        if(fEqual->fNodeID > 0xFFFF){
1506+            estimatedSize += sizeof(uint16_t);
1507+        }
1508+        return estimatedSize;
1509     }
1510
1511     virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
1512         CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
1513+        fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF);
1514         BuildCompactTrieNode::write(bytes, offset, translate);
1515         node->equal = translate.elementAti(fEqual->fNodeID);
1516         offset += sizeof(node->equal);
1517 #ifdef DEBUG_TRIE_DICT
1518-        if (node->equal == 0) {
1519+        if ((node->equal == 0) && !fEqualOverflows) {
1520             fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
1521                     fEqual->fNodeID);
1522         }
1523 #endif
1524         fChars.extract(0, fChars.length(), (UChar *)node->chars);
1525-        offset += sizeof(uint16_t)*fChars.length();
1526+        offset += sizeof(UChar)*fChars.length();
1527+
1528+        // append 16 bits of to end for equal node if fEqualOverflows
1529+        if (fEqualOverflows) {
1530+            *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16);
1531+            offset += sizeof(uint16_t);
1532+        }
1533+
1534+        BuildCompactTrieNode::writeValue(bytes, offset);
1535     }
1536
1537     void addChar(UChar ch) {
1538@@ -784,60 +1161,85 @@
1539     void setLink(BuildCompactTrieNode *node) {
1540         fEqual = node;
1541     }
1542+
1543 };
1544
1545 // Forward declaration
1546 static void walkHorizontal(const TernaryNode *node,
1547                             BuildCompactTrieHorizontalNode *building,
1548                             UStack &nodes,
1549-                            UErrorCode &status);
1550+                            UErrorCode &status,
1551+                            Hashtable *values);
1552
1553-// Convert one node. Uses recursion.
1554+// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion.
1555
1556 static BuildCompactTrieNode *
1557-compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) {
1558+compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes,
1559+        UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) {
1560     if (U_FAILURE(status)) {
1561         return NULL;
1562     }
1563     BuildCompactTrieNode *result = NULL;
1564     UBool horizontal = (node->low != NULL || node->high != NULL);
1565     if (horizontal) {
1566-        BuildCompactTrieHorizontalNode *hResult =
1567-                new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
1568+        BuildCompactTrieHorizontalNode *hResult;
1569+        if(values != NULL){
1570+            hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue);
1571+        } else {
1572+            hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
1573+        }
1574+
1575         if (hResult == NULL) {
1576             status = U_MEMORY_ALLOCATION_ERROR;
1577             return NULL;
1578         }
1579         if (U_SUCCESS(status)) {
1580-            walkHorizontal(node, hResult, nodes, status);
1581+            walkHorizontal(node, hResult, nodes, status, values);
1582             result = hResult;
1583         }
1584     }
1585     else {
1586-        BuildCompactTrieVerticalNode *vResult =
1587-                new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
1588+        BuildCompactTrieVerticalNode *vResult;
1589+        if(values != NULL){
1590+            vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue);
1591+        } else {
1592+            vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
1593+        }
1594+
1595         if (vResult == NULL) {
1596             status = U_MEMORY_ALLOCATION_ERROR;
1597+            return NULL;
1598         }
1599         else if (U_SUCCESS(status)) {
1600-            UBool   endsWord = FALSE;
1601+            uint16_t   value = 0;
1602+            UBool endsWord = FALSE;
1603             // Take up nodes until we end a word, or hit a node with < or > links
1604             do {
1605                 vResult->addChar(node->ch);
1606-                endsWord = (node->flags & kEndsWord) != 0;
1607+                value = node->flags;
1608+                endsWord = value > 0;
1609                 node = node->equal;
1610             }
1611             while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
1612+
1613             if (node == NULL) {
1614                 if (!endsWord) {
1615                     status = U_ILLEGAL_ARGUMENT_ERROR;  // Corrupt input trie
1616                 }
1617-                else {
1618+                else if(values != NULL){
1619+                    UnicodeString key(value); //store value as a single-char UnicodeString
1620+                    BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key);
1621+                    if(link == NULL){
1622+                        link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes?
1623+                        values->put(key, link, status);
1624+                    }
1625+                    vResult->setLink(link);
1626+                } else {
1627                     vResult->setLink((BuildCompactTrieNode *)nodes[1]);
1628                 }
1629             }
1630             else {
1631-                vResult->setLink(compactOneNode(node, endsWord, nodes, status));
1632+                vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value));
1633             }
1634             result = vResult;
1635         }
1636@@ -849,19 +1251,28 @@
1637 // Uses recursion.
1638
1639 static void walkHorizontal(const TernaryNode *node,
1640-                            BuildCompactTrieHorizontalNode *building,
1641-                            UStack &nodes,
1642-                            UErrorCode &status) {
1643+                           BuildCompactTrieHorizontalNode *building,
1644+                           UStack &nodes,
1645+                           UErrorCode &status, Hashtable *values = NULL) {
1646     while (U_SUCCESS(status) && node != NULL) {
1647         if (node->low != NULL) {
1648-            walkHorizontal(node->low, building, nodes, status);
1649+            walkHorizontal(node->low, building, nodes, status, values);
1650         }
1651         BuildCompactTrieNode *link = NULL;
1652         if (node->equal != NULL) {
1653-            link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
1654+            link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags);
1655         }
1656-        else if (node->flags & kEndsWord) {
1657-            link = (BuildCompactTrieNode *)nodes[1];
1658+        else if (node->flags > 0) {
1659+            if(values != NULL) {
1660+                UnicodeString key(node->flags); //store value as a single-char UnicodeString
1661+                link = (BuildCompactTrieValueNode *) values->get(key);
1662+                if(link == NULL) {
1663+                    link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes?
1664+                    values->put(key, link, status);
1665+                }
1666+            } else {
1667+                link = (BuildCompactTrieNode *)nodes[1];
1668+            }
1669         }
1670         if (U_SUCCESS(status) && link != NULL) {
1671             building->addNode(node->ch, link, status);
1672@@ -881,13 +1292,15 @@
1673 _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
1674     BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
1675     BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
1676+
1677     // Check for comparing a node to itself, to avoid spurious duplicates
1678     if (left == right) {
1679         return 0;
1680     }
1681+
1682     // Most significant is type of node. Can never coalesce.
1683-    if (left->fVertical != right->fVertical) {
1684-        return left->fVertical - right->fVertical;
1685+    if (left->fNodeType != right->fNodeType) {
1686+        return left->fNodeType - right->fNodeType;
1687     }
1688     // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
1689     if (left->fParentEndsWord != right->fParentEndsWord) {
1690@@ -898,12 +1311,19 @@
1691     if (result != 0) {
1692         return result;
1693     }
1694+
1695+    // If the node value differs, we should not coalesce.
1696+    // If values aren't stored, all fValues should be 0.
1697+    if (left->fValue != right->fValue) {
1698+        return left->fValue - right->fValue;
1699+    }
1700+
1701     // We know they're both the same node type, so branch for the two cases.
1702-    if (left->fVertical) {
1703+    if (left->fNodeType == kVerticalType) {
1704         result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
1705-                            - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
1706+        - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
1707     }
1708-    else {
1709+    else if(left->fChars.length() > 0 && right->fChars.length() > 0){
1710         // We need to compare the links vectors. They should be the
1711         // same size because the strings were equal.
1712         // We compare the node IDs instead of the pointers, to handle
1713@@ -914,9 +1334,10 @@
1714         int32_t count = hleft->fLinks.size();
1715         for (int32_t i = 0; i < count && result == 0; ++i) {
1716             result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
1717-                     ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1718+            ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
1719         }
1720     }
1721+
1722     // If they are equal to each other, mark them (speeds coalescing)
1723     if (result == 0) {
1724         left->fHasDuplicate = TRUE;
1725@@ -1031,20 +1452,25 @@
1726     // Add node 0, used as the NULL pointer/sentinel.
1727     nodes.addElement((int32_t)0, status);
1728
1729+    Hashtable *values = NULL;                           // Index of (unique) values
1730+    if (dict.fValued) {
1731+        values = new Hashtable(status);
1732+    }
1733+
1734     // Start by creating the special empty node we use to indicate that the parent
1735     // terminates a word. This must be node 1, because the builder assumes
1736-    // that.
1737+    // that. This node will never be used for tries storing numerical values.
1738     if (U_FAILURE(status)) {
1739         return NULL;
1740     }
1741-    BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status);
1742+    BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status);
1743     if (terminal == NULL) {
1744         status = U_MEMORY_ALLOCATION_ERROR;
1745     }
1746
1747     // This call does all the work of building the new trie structure. The root
1748-    // will be node 2.
1749-    BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status);
1750+    // will have node ID 2 before writing to memory.
1751+    BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values);
1752 #ifdef DEBUG_TRIE_DICT
1753     (void) ::times(&timing);
1754     fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1755@@ -1077,21 +1503,37 @@
1756         return NULL;
1757     }
1758
1759+    // Count the unique terminal value nodes; valueNodes collects them in the loop below
1760+    int valueCount = 0;
1761+    UVector valueNodes(status);
1762+    if(values != NULL) {
1763+        valueCount = values->count(); //number of unique terminal value nodes
1764+    }
1765+
1766+    // Assign final node IDs: value nodes take 1..valueCount, all other nodes follow
1767+    int valuePos = 1;//, nodePos = valueCount + valuePos;
1768+    nodeCount = valueCount + valuePos;
1769     for (i = 1; i < count; ++i) {
1770         node = (BuildCompactTrieNode *)nodes[i];
1771         if (node->fNodeID == i) {
1772             // Only one node out of each duplicate set is used
1773-            if (i >= translate.size()) {
1774+            if (node->fNodeID >= translate.size()) {
1775                 // Logically extend the mapping table
1776-                translate.setSize(i+1);
1777+                translate.setSize(i + 1);
1778+            }
1779+            // Note the argument order: translate.setElementAt(value, index)
1780+            if(node->fNodeType == kValueType) {
1781+                valueNodes.addElement(node, status);
1782+               translate.setElementAt(valuePos++, i);
1783+             } else {
1784+                translate.setElementAt(nodeCount++, i);
1785             }
1786-            translate.setElementAt(nodeCount++, i);
1787             totalSize += node->size();
1788         }
1789     }
1790-
1791-    // Check for overflowing 16 bits worth of nodes.
1792-    if (nodeCount > 0x10000) {
1793+
1794+    // Check for overflowing 20 bits worth of nodes.
1795+    if (nodeCount > 0x100000) {
1796         status = U_ILLEGAL_ARGUMENT_ERROR;
1797         return NULL;
1798     }
1799@@ -1111,9 +1553,14 @@
1800         status = U_MEMORY_ALLOCATION_ERROR;
1801         return NULL;
1802     }
1803-
1804+
1805     CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1806-    header->size = totalSize;
1807+    //header->size = totalSize;
1808+    if(dict.fValued){
1809+        header->magic = COMPACT_TRIE_MAGIC_3;
1810+    } else {
1811+        header->magic = COMPACT_TRIE_MAGIC_2;
1812+    }
1813     header->nodeCount = nodeCount;
1814     header->offsets[0] = 0;                     // Sentinel
1815     header->root = translate.elementAti(root->fNodeID);
1816@@ -1123,23 +1570,40 @@
1817     }
1818 #endif
1819     uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
1820-    nodeCount = 1;
1821+    nodeCount = valueCount + 1;
1822+
1823+    // Write terminal value nodes to memory
1824+    for (i=0; i < valueNodes.size(); i++) {
1825+        //header->offsets[i + 1] = offset;
1826+        uint32_t tmpOffset = 0;
1827+        node = (BuildCompactTrieNode *) valueNodes.elementAt(i);
1828+        //header->offsets[i + 1] = (uint32_t)node->fValue;
1829+        node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate);
1830+    }
1831+
1832     // Now write the data
1833     for (i = 1; i < count; ++i) {
1834         node = (BuildCompactTrieNode *)nodes[i];
1835-        if (node->fNodeID == i) {
1836+        if (node->fNodeID == i && node->fNodeType != kValueType) {
1837             header->offsets[nodeCount++] = offset;
1838             node->write(bytes, offset, translate);
1839         }
1840     }
1841+
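1842+    // Shrink the buffer to the bytes actually written. NOTE(review): the uprv_realloc() result is discarded; if the block moves, bytes and header would dangle - confirm.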
1843+    uprv_realloc(bytes, offset);
1844+    header->size = offset;
1845+
1846 #ifdef DEBUG_TRIE_DICT
1847+    fprintf(stdout, "Space freed: %d\n", totalSize-offset);
1848+
1849     (void) ::times(&timing);
1850     fprintf(stderr, "Trie built, time user %f system %f\n",
1851         (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1852         (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1853     previous = timing;
1854     fprintf(stderr, "Final offset is %d\n", offset);
1855-
1856+
1857     // Collect statistics on node types and sizes
1858     int hCount = 0;
1859     int vCount = 0;
1860@@ -1148,68 +1612,85 @@
1861     size_t hItemCount = 0;
1862     size_t vItemCount = 0;
1863     uint32_t previousOff = offset;
1864-    for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1865+    uint32_t numOverflow = 0;
1866+    uint32_t valueSpace = 0;
1867+    for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1868         const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1869-        if (node->flagscount & kVerticalNode) {
1870+        int itemCount;
1871+        if(nodeIdx == header->root)
1872+            itemCount = node->flagscount & kRootCountMask;
1873+        else
1874+            itemCount = getCount(node);
1875+        if(node->flagscount & kEqualOverflows){
1876+            numOverflow++;
1877+        }
1878+        if (node->flagscount & kVerticalNode && nodeIdx != header->root) {
1879             vCount += 1;
1880-            vItemCount += (node->flagscount & kCountMask);
1881+            vItemCount += itemCount;
1882             vSize += previousOff-header->offsets[nodeIdx];
1883         }
1884         else {
1885             hCount += 1;
1886-            hItemCount += (node->flagscount & kCountMask);
1887-            hSize += previousOff-header->offsets[nodeIdx];
1888+            hItemCount += itemCount;
1889+            if(nodeIdx >= header->root) {
1890+                hSize += previousOff-header->offsets[nodeIdx];
1891+            }
1892         }
1893+
1894+        if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord)
1895+            valueSpace += sizeof(uint16_t);
1896         previousOff = header->offsets[nodeIdx];
1897     }
1898     fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
1899                 (double)hSize/hCount, (double)hItemCount/hCount);
1900     fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
1901                 (double)vSize/vCount, (double)vItemCount/vCount);
1902+    fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow);
1903+    fprintf(stderr, "Space taken up by values: %d \n", valueSpace);
1904 #endif
1905
1906     if (U_FAILURE(status)) {
1907         uprv_free(bytes);
1908         header = NULL;
1909     }
1910-    else {
1911-        header->magic = COMPACT_TRIE_MAGIC_1;
1912-    }
1913     return header;
1914 }
1915
1916 // Forward declaration
1917 static TernaryNode *
1918-unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status );
1919-
1920+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status );
1921
1922 // Convert a horizontal node (or subarray thereof) into a ternary subtrie
1923 static TernaryNode *
1924-unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array,
1925-                            int low, int high, UErrorCode &status ) {
1926+unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode,
1927+        int low, int high, int nodeCount, UErrorCode &status) {
1928     if (U_FAILURE(status) || low > high) {
1929         return NULL;
1930     }
1931     int middle = (low+high)/2;
1932-    TernaryNode *result = new TernaryNode(array[middle].ch);
1933+    TernaryNode *result = new TernaryNode(hnode->entries[middle].ch);
1934     if (result == NULL) {
1935         status = U_MEMORY_ALLOCATION_ERROR;
1936         return NULL;
1937     }
1938-    const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);
1939+    const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount));
1940     if (equal->flagscount & kParentEndsWord) {
1941-        result->flags |= kEndsWord;
1942+        if(info->magic == COMPACT_TRIE_MAGIC_3){
1943+            result->flags = getValue(equal);
1944+        }else{
1945+            result->flags |= kEndsWord;
1946+        }
1947     }
1948-    result->low = unpackHorizontalArray(header, array, low, middle-1, status);
1949-    result->high = unpackHorizontalArray(header, array, middle+1, high, status);
1950-    result->equal = unpackOneNode(header, equal, status);
1951+    result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status);
1952+    result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status);
1953+    result->equal = unpackOneNode(info, equal, status);
1954     return result;
1955 }
1956
1957 // Convert one compact trie node into a ternary subtrie
1958 static TernaryNode *
1959-unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) {
1960-    int nodeCount = (node->flagscount & kCountMask);
1961+unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) {
1962+    int nodeCount = getCount(node);
1963     if (nodeCount == 0 || U_FAILURE(status)) {
1964         // Failure, or terminal node
1965         return NULL;
1966@@ -1234,29 +1715,41 @@
1967             previous = latest;
1968         }
1969         if (latest != NULL) {
1970-            const CompactTrieNode *equal = getCompactNode(header, vnode->equal);
1971+            const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode));
1972             if (equal->flagscount & kParentEndsWord) {
1973-                latest->flags |= kEndsWord;
1974+                if(info->magic == COMPACT_TRIE_MAGIC_3){
1975+                    latest->flags = getValue(equal);
1976+                } else {
1977+                    latest->flags |= kEndsWord;
1978+                }
1979             }
1980-            latest->equal = unpackOneNode(header, equal, status);
1981+            latest->equal = unpackOneNode(info, equal, status);
1982         }
1983         return head;
1984     }
1985     else {
1986         // Horizontal node
1987         const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
1988-        return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status);
1989+        return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status);
1990     }
1991 }
1992
1993+// returns a MutableTrieDictionary generated from the CompactTrieDictionary
1994 MutableTrieDictionary *
1995 CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
1996-    MutableTrieDictionary *result = new MutableTrieDictionary( status );
1997+    MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 );
1998     if (result == NULL) {
1999         status = U_MEMORY_ALLOCATION_ERROR;
2000         return NULL;
2001     }
2002-    TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status);
2003+    // treat root node as special case: don't call unpackOneNode() or unpackHorizontalArray() directly
2004+    // because only kEqualOverflows flag should be checked in root's flagscount
2005+    const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)
2006+    getCompactNode(fInfo, fInfo->root);
2007+    uint16_t nodeCount = hnode->flagscount & kRootCountMask;
2008+    TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1,
2009+            nodeCount, status);
2010+
2011     if (U_FAILURE(status)) {
2012         delete root;    // Clean up
2013         delete result;
2014@@ -1270,8 +1763,8 @@
2015
2016 U_CAPI int32_t U_EXPORT2
2017 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
2018-           UErrorCode *status) {
2019-
2020+        UErrorCode *status) {
2021+
2022     if (status == NULL || U_FAILURE(*status)) {
2023         return 0;
2024     }
2025@@ -1286,14 +1779,14 @@
2026     //
2027     const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
2028     if(!(  pInfo->dataFormat[0]==0x54 &&   /* dataFormat="TrDc" */
2029-           pInfo->dataFormat[1]==0x72 &&
2030-           pInfo->dataFormat[2]==0x44 &&
2031-           pInfo->dataFormat[3]==0x63 &&
2032-           pInfo->formatVersion[0]==1  )) {
2033+            pInfo->dataFormat[1]==0x72 &&
2034+            pInfo->dataFormat[2]==0x44 &&
2035+            pInfo->dataFormat[3]==0x63 &&
2036+            pInfo->formatVersion[0]==1  )) {
2037         udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
2038-                         pInfo->dataFormat[0], pInfo->dataFormat[1],
2039-                         pInfo->dataFormat[2], pInfo->dataFormat[3],
2040-                         pInfo->formatVersion[0]);
2041+                pInfo->dataFormat[0], pInfo->dataFormat[1],
2042+                pInfo->dataFormat[2], pInfo->dataFormat[3],
2043+                pInfo->formatVersion[0]);
2044         *status=U_UNSUPPORTED_ERROR;
2045         return 0;
2046     }
2047@@ -1311,8 +1804,10 @@
2048     //
2049     const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
2050     const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
2051-    if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
2052-            || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
2053+    uint32_t magic = ds->readUInt32(header->magic);
2054+    if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3
2055+            || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1)
2056+            || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
2057     {
2058         udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
2059         *status=U_UNSUPPORTED_ERROR;
2060@@ -1333,10 +1828,10 @@
2061     //
2062     if (length < sizeWithUData) {
2063         udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
2064-                            totalSize);
2065+                totalSize);
2066         *status=U_INDEX_OUTOFBOUNDS_ERROR;
2067         return 0;
2068-        }
2069+    }
2070
2071     //
2072     // Swap the Data.  Do the data itself first, then the CompactTrieHeader, because
2073@@ -1355,20 +1850,38 @@
2074     }
2075
2076     // We need to loop through all the nodes in the offset table, and swap each one.
2077-    uint16_t nodeCount = ds->readUInt16(header->nodeCount);
2078+    uint32_t nodeCount, rootId;
2079+    if(header->magic == COMPACT_TRIE_MAGIC_1) {
2080+        nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount);
2081+        rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root);
2082+    } else {
2083+        nodeCount = ds->readUInt32(header->nodeCount);
2084+        rootId = ds->readUInt32(header->root);
2085+    }
2086+
2087     // Skip node 0, which should always be 0.
2088-    for (int i = 1; i < nodeCount; ++i) {
2089+    for (uint32_t i = 1; i < nodeCount; ++i) {
2090         uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
2091         const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
2092         CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
2093         uint16_t flagscount = ds->readUInt16(inNode->flagscount);
2094-        uint16_t itemCount = flagscount & kCountMask;
2095+        uint16_t itemCount = getCount(inNode);
2096+        //uint16_t itemCount = flagscount & kCountMask;
2097         ds->writeUInt16(&outNode->flagscount, flagscount);
2098         if (itemCount > 0) {
2099-            if (flagscount & kVerticalNode) {
2100+            uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped
2101+            if (flagscount & kVerticalNode && i != rootId) {
2102+                if(flagscount & kEqualOverflows){
2103+                    // include overflow bits
2104+                    overflow += 1;
2105+                }
2106+                if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) {
2107+                    //include values
2108+                    overflow += 1;
2109+                }
2110                 ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
2111-                                    itemCount*sizeof(uint16_t),
2112-                                    outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
2113+                        (itemCount + overflow)*sizeof(uint16_t),
2114+                        outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
2115                 uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
2116                 ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
2117             }
2118@@ -1381,26 +1894,62 @@
2119                     word = ds->readUInt16(inHNode->entries[j].equal);
2120                     ds->writeUInt16(&outHNode->entries[j].equal, word);
2121                 }
2122+
2123+                // swap overflow/value information
2124+                if(flagscount & kEqualOverflows){
2125+                    overflow += (itemCount + 3) / 4;
2126+                }
2127+
2128+                if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) {
2129+                    //include values
2130+                    overflow += 1;
2131+                }
2132+
2133+                uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount];
2134+                uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount];
2135+                for(int j = 0; j<overflow; j++){
2136+                    uint16_t extraInfo = ds->readUInt16(*inOverflow);
2137+                    ds->writeUInt16(outOverflow, extraInfo);
2138+
2139+                    inOverflow++;
2140+                    outOverflow++;
2141+                }
2142             }
2143         }
2144     }
2145 #endif
2146
2147-    // All the data in all the nodes consist of 16 bit items. Swap them all at once.
2148-    uint16_t nodeCount = ds->readUInt16(header->nodeCount);
2149-    uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t));
2150-    ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
2151-
2152     // Swap the header
2153     ds->writeUInt32(&outputHeader->size, totalSize);
2154-    uint32_t magic = ds->readUInt32(header->magic);
2155     ds->writeUInt32(&outputHeader->magic, magic);
2156-    ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
2157-    uint16_t root = ds->readUInt16(header->root);
2158-    ds->writeUInt16(&outputHeader->root, root);
2159-    ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
2160-            sizeof(uint32_t)*(int32_t)nodeCount,
2161-            outBytes+offsetof(CompactTrieHeader,offsets), status);
2162+
2163+    uint32_t nodeCount;
2164+    uint32_t offsetPos;
2165+    if (header->magic == COMPACT_TRIE_MAGIC_1) {
2166+        CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header;
2167+        CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader;
2168+
2169+        nodeCount = ds->readUInt16(headerV1->nodeCount);
2170+        ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount);
2171+        uint16_t root = ds->readUInt16(headerV1->root);
2172+        ds->writeUInt16(&outputHeaderV1->root, root);
2173+        offsetPos = offsetof(CompactTrieHeaderV1,offsets);
2174+    } else {
2175+        nodeCount = ds->readUInt32(header->nodeCount);
2176+        ds->writeUInt32(&outputHeader->nodeCount, nodeCount);
2177+        uint32_t root = ds->readUInt32(header->root);
2178+        ds->writeUInt32(&outputHeader->root, root);
2179+        offsetPos = offsetof(CompactTrieHeader,offsets);
2180+    }
2181+
2182+    // All the data in all the nodes consist of 16 bit items. Swap them all at once.
2183+    uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t));
2184+    ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
2185+
2186+    //swap offsets
2187+    ds->swapArray32(ds, inBytes+offsetPos,
2188+            sizeof(uint32_t)*(uint32_t)nodeCount,
2189+            outBytes+offsetPos, status);
2190
2191     return sizeWithUData;
2192 }
2193--- source/common/triedict.h	2006-06-06 15:38:49.000000000 -0700
2194+++ source/common/triedict.h	2011-01-21 14:12:45.496927000 -0800
2195@@ -47,7 +47,6 @@
2196 U_NAMESPACE_BEGIN
2197
2198 class StringEnumeration;
2199-struct CompactTrieHeader;
2200
2201 /*******************************************************************
2202  * TrieWordDictionary
2203@@ -72,23 +71,29 @@
2204    */
2205   virtual ~TrieWordDictionary();
2206
2207+  /**
2208+   * <p>Returns true if the dictionary contains values associated with each word.</p>
2209+   */
2210+  virtual UBool getValued() const = 0;
2211+
2212  /**
2213   * <p>Find dictionary words that match the text.</p>
2214   *
2215   * @param text A UText representing the text. The
2216   * iterator is left after the longest prefix match in the dictionary.
2217-  * @param start The current position in text.
2218   * @param maxLength The maximum number of code units to match.
2219   * @param lengths An array that is filled with the lengths of words that matched.
2220   * @param count Filled with the number of elements output in lengths.
2221   * @param limit The size of the lengths array; this limits the number of words output.
2222+  * @param values An array that is filled with the values associated with the matched words.
2223   * @return The number of characters in text that were matched.
2224   */
2225   virtual int32_t matches( UText *text,
2226                               int32_t maxLength,
2227                               int32_t *lengths,
2228                               int &count,
2229-                              int limit ) const = 0;
2230+                              int limit,
2231+                              uint16_t *values = NULL) const = 0;
2232
2233   /**
2234    * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
2235@@ -128,6 +133,12 @@
2236
2237   UText    *fIter;
2238
2239+    /**
2240+     * TRUE if this dictionary stores a value associated with each word
2241+     * @internal
2242+     */
2243+  UBool fValued;
2244+
2245   friend class CompactTrieDictionary;   // For fast conversion
2246
2247  public:
2248@@ -138,14 +149,29 @@
2249   * @param median A UChar around which to balance the trie. Ideally, it should
2250   * begin at least one word that is near the median of the set in the dictionary
2251   * @param status A status code recording the success of the call.
2252+  * @param containsValue True if the dictionary stores values associated with each word.
2253   */
2254-  MutableTrieDictionary( UChar median, UErrorCode &status );
2255+  MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE );
2256
2257   /**
2258    * <p>Virtual destructor.</p>
2259    */
2260   virtual ~MutableTrieDictionary();
2261
2262+  /**
2263+   * Indicate whether the MutableTrieDictionary stores values associated with each word
2264+   */
2265+  void setValued(UBool valued){
2266+      fValued = valued;
2267+  }
2268+
2269+  /**
2270+   * <p>Returns true if the dictionary contains values associated with each word.</p>
2271+   */
2272+  virtual UBool getValued() const {
2273+      return fValued;
2274+  }
2275+
2276  /**
2277   * <p>Find dictionary words that match the text.</p>
2278   *
2279@@ -155,13 +181,15 @@
2280   * @param lengths An array that is filled with the lengths of words that matched.
2281   * @param count Filled with the number of elements output in lengths.
2282   * @param limit The size of the lengths array; this limits the number of words output.
2283+  * @param values An array that is filled with the values associated with the matched words.
2284   * @return The number of characters in text that were matched.
2285   */
2286   virtual int32_t matches( UText *text,
2287                               int32_t maxLength,
2288                               int32_t *lengths,
2289                               int &count,
2290-                              int limit ) const;
2291+                              int limit,
2292+                              uint16_t *values = NULL) const;
2293
2294   /**
2295    * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
2296@@ -173,15 +201,17 @@
2297   virtual StringEnumeration *openWords( UErrorCode &status ) const;
2298
2299  /**
2300-  * <p>Add one word to the dictionary.</p>
2301+  * <p>Add one word to the dictionary with an optional associated value.</p>
2302   *
2303   * @param word A UChar buffer containing the word.
2304   * @param length The length of the word.
2305-  * @param status The resultant status
2306+  * @param status The resultant status.
2307+  * @param value The nonzero value associated with this word.
2308   */
2309   virtual void addWord( const UChar *word,
2310                         int32_t length,
2311-                        UErrorCode &status);
2312+                        UErrorCode &status,
2313+                        uint16_t value = 0);
2314
2315 #if 0
2316  /**
2317@@ -203,8 +233,9 @@
2318   * @param lengths An array that is filled with the lengths of words that matched.
2319   * @param count Filled with the number of elements output in lengths.
2320   * @param limit The size of the lengths array; this limits the number of words output.
2321-  * @param parent The parent of the current node
2322-  * @param pMatched The returned parent node matched the input
2323+  * @param parent The parent of the current node.
2324+  * @param pMatched The returned parent node matched the input.
2325+  * @param values An array that is filled with the values associated with the matched words.
2326   * @return The number of characters in text that were matched.
2327   */
2328   virtual int32_t search( UText *text,
2329@@ -213,40 +244,46 @@
2330                               int &count,
2331                               int limit,
2332                               TernaryNode *&parent,
2333-                              UBool &pMatched ) const;
2334+                              UBool &pMatched,
2335+                              uint16_t *values = NULL) const;
2336
2337 private:
2338  /**
2339   * <p>Private constructor. The root node it not allocated.</p>
2340   *
2341   * @param status A status code recording the success of the call.
2342+  * @param containsValues True if the dictionary will store a value associated
2343+  * with each word added.
2344   */
2345-  MutableTrieDictionary( UErrorCode &status );
2346+  MutableTrieDictionary( UErrorCode &status, UBool containsValues = false );
2347 };
2348
2349 /*******************************************************************
2350  * CompactTrieDictionary
2351  */
2352
2353+//forward declarations
2354+struct CompactTrieHeader;
2355+struct CompactTrieInfo;
2356+
2357 /**
2358  * <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
2359  * to save space.</p>
2360  */
2361 class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
2362  private:
2363-    /**
2364-     * The root node of the trie
2365-     */
2366+  /**
2367+   * The header of the CompactTrieDictionary which contains all info
2368+   */
2369
2370-  const CompactTrieHeader   *fData;
2371-
2372-    /**
2373-     * A UBool indicating whether or not we own the fData.
2374-     */
2375+  CompactTrieInfo                 *fInfo;
2376
2377+  /**
2378+   * A UBool indicating whether or not we own the fData.
2379+   */
2380   UBool                     fOwnData;
2381
2382-    UDataMemory              *fUData;
2383+  UDataMemory              *fUData;
2384  public:
2385   /**
2386    * <p>Construct a dictionary from a UDataMemory.</p>
2387@@ -277,6 +314,11 @@
2388    */
2389   virtual ~CompactTrieDictionary();
2390
2391+  /**
2392+   * <p>Returns true if the dictionary contains values associated with each word.</p>
2393+   */
2394+  virtual UBool getValued() const;
2395+
2396  /**
2397   * <p>Find dictionary words that match the text.</p>
2398   *
2399@@ -286,13 +328,15 @@
2400   * @param lengths An array that is filled with the lengths of words that matched.
2401   * @param count Filled with the number of elements output in lengths.
2402   * @param limit The size of the lengths array; this limits the number of words output.
2403+  * @param values An array that is filled with the values associated with the matched words.
2404   * @return The number of characters in text that were matched.
2405   */
2406   virtual int32_t matches( UText *text,
2407-                              int32_t rangeEnd,
2408+                              int32_t maxLength,
2409                               int32_t *lengths,
2410                               int &count,
2411-                              int limit ) const;
2412+                              int limit,
2413+                              uint16_t *values = NULL) const;
2414
2415   /**
2416    * <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
2417@@ -311,7 +355,7 @@
2418   virtual uint32_t dataSize() const;
2419
2420  /**
2421-  * <p>Return a void * pointer to the compact data, platform-endian.</p>
2422+  * <p>Return a void * pointer to the (unmanaged) compact data, platform-endian.</p>
2423   *
2424   * @return The data for the compact dictionary, suitable for passing to the
2425   * constructor.
2426@@ -342,5 +386,5 @@
2427
2428 U_NAMESPACE_END
2429
2430-    /* TRIEDICT_H */
2431+/* TRIEDICT_H */
2432 #endif
2433--- source/data/Makefile.in	2010-10-29 13:21:33.000000000 -0700
2434+++ source/data/Makefile.in	2011-01-26 16:24:24.856798000 -0800
2435@@ -509,8 +520,9 @@
2436 ####################################################    CTD
2437 # CTD FILES
2438
2439-$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
2440-	$(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
2441+# .ctd file now generated regardless of whether dictionary file exists
2442+$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
2443+	$(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt
2444
2445 ####################################################    CFU
2446 # CFU FILES
2447--- source/data/brkitr/root.txt	2010-07-28 17:18:28.000000000 -0700
2448+++ source/data/brkitr/root.txt	2011-01-21 14:12:45.653922000 -0800
2449@@ -17,5 +17,8 @@
2450     }
2451     dictionaries{
2452         Thai:process(dependency){"thaidict.ctd"}
2453+        Hani:process(dependency){"cjdict.ctd"}
2454+        Hira:process(dependency){"cjdict.ctd"}
2455+        Kata:process(dependency){"cjdict.ctd"}
2456     }
2457 }
2458--- source/data/xml/brkitr/root.xml	2010-03-01 15:13:18.000000000 -0800
2459+++ source/data/xml/brkitr/root.xml	2011-01-21 14:12:45.735922000 -0800
2460@@ -25,6 +25,9 @@
2461             </icu:boundaries>
2462             <icu:dictionaries>
2463                 <icu:dictionary type="Thai" icu:dependency="thaidict.ctd"/>
2464+                <icu:dictionary type="Hani" icu:dependency="cjdict.ctd"/>
2465+                <icu:dictionary type="Hira" icu:dependency="cjdict.ctd"/>
2466+                <icu:dictionary type="Kata" icu:dependency="cjdict.ctd"/>
2467             </icu:dictionaries>
2468         </icu:breakIteratorData>
2469     </special>
2470--- source/test/cintltst/creststn.c	2010-10-28 10:44:02.000000000 -0700
2471+++ source/test/cintltst/creststn.c	2011-01-21 14:12:44.995020000 -0800
2472@@ -2188,21 +2188,21 @@
2473
2474
2475       {
2476-            UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status);
2477+            UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status);
2478             const UChar *got = NULL, *exp=NULL;
2479             int32_t gotLen = 0, expLen=0;
2480-            ja = ures_getByKey(ja, "boundaries", ja, &status);
2481-            exp = tres_getString(ja, -1, "word", &expLen, &status);
2482+            th = ures_getByKey(th, "boundaries", th, &status);
2483+            exp = tres_getString(th, -1, "grapheme", &expLen, &status);
2484
2485             tb = ures_getByKey(aliasB, "boundaries", tb, &status);
2486-            got = tres_getString(tb, -1, "word", &gotLen, &status);
2487+            got = tres_getString(tb, -1, "grapheme", &gotLen, &status);
2488
2489             if(U_FAILURE(status)) {
2490                 log_err("%s trying to read str boundaries\n", u_errorName(status));
2491             } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) {
2492                 log_err("Referencing alias didn't get the right data\n");
2493             }
2494-            ures_close(ja);
2495+            ures_close(th);
2496             status = U_ZERO_ERROR;
2497       }
2498       /* simple alias */
2499--- source/test/intltest/rbbiapts.cpp	2010-07-12 11:03:29.000000000 -0700
2500+++ source/test/intltest/rbbiapts.cpp	2011-01-21 14:12:45.033014000 -0800
2501@@ -156,9 +156,13 @@
2502     if(*a!=*b){
2503         errln("Failed: boilerplate method operator!= does not return correct results");
2504     }
2505-    BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
2506-    if(a && c){
2507-        if(*c==*a){
2508+    // Japanese word break iterator is identical to root with
2509+    // a dictionary-based break iterator, but Thai character break iterator
2510+    // is still different from Root.
2511+    BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status);
2512+    BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),status);
2513+    if(c && d){
2514+        if(*c==*d){
2515             errln("Failed: boilerplate method opertator== does not return correct results");
2516         }
2517     }else{
2518@@ -167,6 +171,7 @@
2519     delete a;
2520     delete b;
2521     delete c;
2522+    delete d;
2523 }
2524
2525 void RBBIAPITest::TestgetRules()
2526@@ -635,21 +640,21 @@
2527 //
2528 void RBBIAPITest::TestRuleStatus() {
2529      UChar str[30];
2530-     u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094",
2531-              // 012345678901234567  8      9    0  1      2    3  4      5    6
2532-              //                    Ideographic    Katakana       Hiragana
2533+     //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
2534+     // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
2535+     u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
2536+              // 012345678901234567  8      9    0
2537+              //                     Katakana
2538                 str, 30);
2539      UnicodeString testString1(str);
2540-     int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26};
2541+     int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
2542      int32_t tag_lo[]  = {UBRK_WORD_NONE,     UBRK_WORD_LETTER, UBRK_WORD_NONE,    UBRK_WORD_LETTER,
2543                           UBRK_WORD_NONE,     UBRK_WORD_NUMBER, UBRK_WORD_NONE,
2544-                          UBRK_WORD_IDEO,     UBRK_WORD_IDEO,   UBRK_WORD_NONE,
2545-                          UBRK_WORD_KANA,     UBRK_WORD_NONE,   UBRK_WORD_KANA,    UBRK_WORD_KANA};
2546+                          UBRK_WORD_IDEO,     UBRK_WORD_NONE};
2547
2548      int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
2549                           UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
2550-                          UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT,   UBRK_WORD_NONE_LIMIT,
2551-                          UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT,   UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT};
2552+                          UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};
2553
2554      UErrorCode status=U_ZERO_ERROR;
2555
2556@@ -888,9 +893,11 @@
2557
2558     URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
2559     {
2560+#if 0 // With a dictionary based word breaking, ja_word is identical to root.
2561         if (ja_word && *ja_word == *root_word) {
2562             errln("japan not different from root");
2563         }
2564+#endif
2565     }
2566
2567     {
2568--- source/test/intltest/rbbitst.cpp	2010-10-08 18:23:28.000000000 -0700
2569+++ source/test/intltest/rbbitst.cpp	2011-01-21 14:12:45.180030000 -0800
2570@@ -35,6 +35,8 @@
2571 #include <string.h>
2572 #include <stdio.h>
2573 #include <stdlib.h>
2574+#include "unicode/numfmt.h"
2575+#include "unicode/uscript.h"
2576
2577 #define TEST_ASSERT(x) {if (!(x)) { \
2578     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
2579@@ -138,11 +140,13 @@
2580             if (exec) TestThaiBreaks();                        break;
2581         case 23: name = "TestTailoredBreaks";
2582             if (exec) TestTailoredBreaks();                    break;
2583+        case 24: name = "TestTrieDictWithValue";
2584+            if(exec) TestTrieDictWithValue();                  break;
2585 #else
2586-        case 21: case 22: case 23: name = "skip";
2587+        case 21: case 22: case 23: case 24: name = "skip";
2588             break;
2589 #endif
2590-        case 24: name = "TestDictRules";
2591+        case 25: name = "TestDictRules";
2592             if (exec) TestDictRules();                         break;
2593         case 25: name = "TestBug5532";
2594             if (exec) TestBug5532();                           break;
2595@@ -607,6 +611,8 @@
2596
2597
2598 void RBBITest::TestJapaneseWordBreak() {
2599+// TODO: Rewrite this test for a dictionary-based word breaking.
2600+#if 0
2601     UErrorCode status = U_ZERO_ERROR;
2602     BITestData   japaneseWordSelection(status);
2603
2604@@ -628,6 +634,7 @@
2605
2606     generalIteratorTest(*e, japaneseWordSelection);
2607     delete e;
2608+#endif
2609 }
2610
2611 void RBBITest::TestTrieDict() {
2612@@ -849,6 +856,372 @@
2613     delete compact2;
2614 }
2615
2616+/*TODO: delete later. Debug helper: writes each word of enumer to filename as UTF-8, one per line.*/
2617+inline void writeEnumerationToFile(StringEnumeration *enumer, const char *filename){
2618+    UErrorCode      status  = U_ZERO_ERROR;
2619+    UConverter *cvt = ucnv_open("UTF-8", &status);
2620+    if (U_FAILURE(status))
2621+        return;
2622+    FILE *outfile = fopen(filename,"w");   // opened after the converter check so the early return cannot leak it
2623+    if(outfile != NULL){
2624+        status = U_ZERO_ERROR;
2625+        const UnicodeString *word = enumer->snext(status);
2626+        while (word != NULL && U_SUCCESS(status)) {
2627+            char u8word[500];
2628+            status = U_ZERO_ERROR;
2629+            int32_t u8len = ucnv_fromUChars(cvt, u8word, (int32_t)sizeof(u8word), word->getBuffer(), word->length(),
2630+                    &status);
2631+            if (U_SUCCESS(status)) fprintf(outfile,"%.*s\n", (int)u8len, u8word);  // length-bounded: u8word may be unterminated
2632+            status = U_ZERO_ERROR;   // a word that failed to convert is skipped, not fatal
2633+            word = enumer->snext(status);
2634+        }
2635+        fclose(outfile);
2636+    }
2637+    ucnv_close(cvt);
2638+}
2639+
2640+// A very simple helper class to streamline the buffer handling in
2641+// TestTrieDictWithValue: uses the inline array for up to N elements, the heap beyond that.
2642+template<class T, size_t N>
2643+class AutoBuffer {
2644+ public:
2645+  AutoBuffer(size_t size) : buffer(stackBuffer) {
2646+    if (size > N) buffer = new T[size];           // too large for the inline array
2647+  }
2648+  ~AutoBuffer() {
2649+    if (buffer != stackBuffer) delete [] buffer;  // only heap storage is freed
2650+  }
2651+  T* elems() {
2652+    return buffer;
2653+  }
2654+  const T& operator[] (size_t i) const {
2655+    return buffer[i];
2656+  }
2657+  T& operator[] (size_t i) {
2658+    return buffer[i];
2659+  }
2660+ private:
2661+  T stackBuffer[N];
2662+  T* buffer;
2663+  AutoBuffer();                              // not implemented: no default construction
2664+  AutoBuffer(const AutoBuffer&);             // not implemented: a copy would alias or double-delete buffer
2665+  AutoBuffer& operator=(const AutoBuffer&);  // not implemented: see copy constructor
2666+};
2667+
2668+//----------------------------------------------------------------------------
2669+//
2670+// TestTrieDictWithValue    Test trie dictionaries with logprob values and
2671+// more than 2^16 nodes after compaction.
2672+//
2673+//----------------------------------------------------------------------------
2674+void RBBITest::TestTrieDictWithValue() {
2675+    UErrorCode      status  = U_ZERO_ERROR;
2676+
2677+    //
2678+    //  Open and read the test data file.
2679+    //
2680+    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2681+    const char *filename = "cjdict-truncated.txt";
2682+    char testFileName[1000];
2683+    if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) {
2684+        errln("Can't open test data.  Path too long.");
2685+        return;
2686+    }
2687+    strcpy(testFileName, testDataDirectory);
2688+    strcat(testFileName, filename);
2689+
2690+    // Items needing deleting at the end
2691+    MutableTrieDictionary *mutableDict = NULL;
2692+    CompactTrieDictionary *compactDict = NULL;
2693+    UnicodeSet            *breaks      = NULL;
2694+    UChar                 *testFile    = NULL;
2695+    StringEnumeration     *enumer1     = NULL;
2696+    StringEnumeration     *enumer2     = NULL;
2697+    MutableTrieDictionary *mutable2    = NULL;
2698+    StringEnumeration     *cloneEnum   = NULL;
2699+    CompactTrieDictionary *compact2    = NULL;
2700+    NumberFormat          *nf           = NULL;
2701+    UText *originalText = NULL, *cloneText = NULL;
2702+
2703+    const UnicodeString *originalWord = NULL;
2704+    const UnicodeString *cloneWord    = NULL;
2705+    UChar *current;
2706+    UChar *word;
2707+    UChar uc;
2708+    int32_t wordLen;
2709+    int32_t wordCount;
2710+    int32_t testCount;
2711+    int32_t valueLen;
2712+    int counter = 0;
2713+    int    len;
2714+    testFile = ReadAndConvertFile(testFileName, len, NULL, status);
2715+    if (U_FAILURE(status)) {
2716+        goto cleanup; /* something went wrong, error already output */
2717+    }
2718+
2719+    mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
2720+    if (U_FAILURE(status)) {
2721+        errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
2722+        goto cleanup;
2723+    }
2724+
2725+    breaks = new UnicodeSet;
2726+    breaks->add(0x000A);     // Line Feed
2727+    breaks->add(0x000D);     // Carriage Return
2728+    breaks->add(0x2028);     // Line Separator
2729+    breaks->add(0x2029);     // Paragraph Separator
2730+    breaks->add(0x0009);     // Tab character
2731+
2732+    // Now add each non-comment line of the file as a word.
2733+    current = testFile;
2734+    word = current;
2735+    uc = *current++;
2736+    wordLen = 0;
2737+    wordCount = 0;
2738+    nf = NumberFormat::createInstance(status);
2739+    if (U_FAILURE(status)) {   // do not call nf->parse() below on a formatter that failed to open
2740+        errln("Could not create NumberFormat: %s\n", u_errorName(status));
2741+        goto cleanup;
2742+    }
2743+
2744+    while (uc) {
2745+        UnicodeString ucharValue;
2746+        valueLen = 0;
2747+
2748+        if (uc == 0x0023) {     // #comment line, skip
2749+            while (uc && !breaks->contains(uc)) {
2750+                uc = *current++;
2751+            }
2752+        }
2753+        else{
2754+            while (uc && !breaks->contains(uc)) {
2755+                ++wordLen;
2756+                uc = *current++;
2757+            }
2758+            if(uc == 0x0009){ //separator is a tab char, read in num after tab
2759+                uc = *current++;
2760+                while (uc && !breaks->contains(uc)) {
2761+                    ucharValue.append(uc);
2762+                    uc = *current++;
2763+                }
2764+            }
2765+        }
2766+        if (wordLen > 0) {
2767+            Formattable value((int32_t)0);
2768+            nf->parse(ucharValue.getTerminatedBuffer(), value, status);
2769+
2770+            if(U_FAILURE(status)){
2771+                errln("parsing of value failed when reading in dictionary\n");
2772+                goto cleanup;
2773+            }
2774+            mutableDict->addWord(word, wordLen, status, value.getLong());
2775+            if (U_FAILURE(status)) {
2776+                errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
2777+                goto cleanup;
2778+            }
2779+            wordCount += 1;
2780+        }
2781+
2782+        // Find beginning of next line
2783+        while (uc && breaks->contains(uc)) {
2784+            uc = *current++;
2785+        }
2786+        word = current-1;
2787+        wordLen = 0;
2788+    }
2789+
2790+    if (wordCount < 50) {
2791+        errln("Word count (%d) unreasonably small\n", wordCount);
2792+        goto cleanup;
2793+    }
2794+
2795+    enumer1 = mutableDict->openWords(status);
2796+    if (U_FAILURE(status)) {
2797+        errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
2798+        goto cleanup;
2799+    }
2800+
2801+    if (wordCount != (testCount = enumer1->count(status))) {
2802+        errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
2803+                testCount, wordCount, u_errorName(status));
2804+        goto cleanup;
2805+    }
2806+
2807+    // Now compact it
2808+    compactDict = new CompactTrieDictionary(*mutableDict, status);
2809+    if (U_FAILURE(status)) {
2810+        errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
2811+        goto cleanup;
2812+    }
2813+
2814+    enumer2 = compactDict->openWords(status);
2815+    if (U_FAILURE(status)) {
2816+        errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
2817+        goto cleanup;
2818+    }
2819+
2820+    enumer1->reset(status);
2821+    enumer2->reset(status);
2822+
2823+    originalWord = enumer1->snext(status);
2824+    cloneWord = enumer2->snext(status);
2825+    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
2826+        if (*originalWord != *cloneWord) {
2827+            errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
2828+                    counter, originalWord->length(), cloneWord->length());
2829+            goto cleanup;
2830+        }
2831+
2832+        // check if attached values of the same word in both dictionaries tally
2833+        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
2834+        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
2835+        AutoBuffer<uint16_t, 20> values1(originalWord->length());
2836+        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
2837+
2838+        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
2839+        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
2840+
2841+        int count1 = 0, count2 = 0;
2842+        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
2843+        compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
2844+
2845+        if (count1 < 1 || count2 < 1) {   // values[count-1] below would index out of bounds
2846+            errln("Word %d has no match in MutableTrieDictionary or CompactTrieDictionary, match counts are %d and %d\n", counter, count1, count2);
2847+            goto cleanup;
2848+        }
2849+        if(values1[count1-1] != values2[count2-1]){
2850+            errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n",
2851+                  counter, values1[count1-1], values2[count2-1]);
2852+            goto cleanup;
2853+        }
2854+
2855+        counter++;
2856+        originalWord = enumer1->snext(status);
2857+        cloneWord = enumer2->snext(status);
2858+    }
2859+    if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
2860+        errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same");
2861+    }
2862+
2863+    delete enumer1;
2864+    enumer1 = NULL;
2865+    delete enumer2;
2866+    enumer2 = NULL;
2867+
2868+    // Now un-compact it
2869+    mutable2 = compactDict->cloneMutable(status);
2870+    if (U_FAILURE(status)) {
2871+        errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
2872+        goto cleanup;
2873+    }
2874+
2875+    cloneEnum = mutable2->openWords(status);
2876+    if (U_FAILURE(status)) {
2877+        errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
2878+        goto cleanup;
2879+    }
2880+
2881+    if (wordCount != (testCount = cloneEnum->count(status))) {
2882+        errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
2883+                testCount, wordCount, u_errorName(status));
2884+        goto cleanup;
2885+    }
2886+
2887+    // Compact original dictionary to clone. Note that we can only compare the same kind of
2888+    // dictionary as the order of the enumerators is not guaranteed to be the same between
2889+    // different kinds
2890+    enumer1 = mutableDict->openWords(status);
2891+    if (U_FAILURE(status)) {
2892+        errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
2893+        goto cleanup;
2894+    }
2895+
2896+    counter = 0;
2897+    originalWord = enumer1->snext(status);
2898+    cloneWord = cloneEnum->snext(status);
2899+    while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
2900+        if (*originalWord != *cloneWord) {
2901+            errln("Original and cloned MutableTrieDictionary word mismatch\n");
2902+            goto cleanup;
2903+        }
2904+
2905+        // check if attached values of the same word in both dictionaries tally
2906+        AutoBuffer<int32_t, 20> lengths1(originalWord->length());
2907+        AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
2908+        AutoBuffer<uint16_t, 20> values1(originalWord->length());
2909+        AutoBuffer<uint16_t, 20> values2(cloneWord->length());
2910+        originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
2911+        cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
2912+
2913+        int count1 = 0, count2 = 0;
2914+        mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems());
2915+        mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems());
2916+
2917+        if (count1 < 1 || count2 < 1) {   // values[count-1] below would index out of bounds
2918+            errln("Word %d has no match in original or cloned MutableTrieDictionary, match counts are %d and %d\n", counter, count1, count2);
2919+            goto cleanup;
2920+        }
2921+        if(values1[count1-1] != values2[count2-1]){
2922+            errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n",
2923+                  counter, values1[count1-1], values2[count2-1]);
2924+            goto cleanup;
2925+        }
2926+
2927+        counter++;
2928+        originalWord = enumer1->snext(status);
2929+        cloneWord = cloneEnum->snext(status);
2930+    }
2931+
2932+    if (U_FAILURE(status)) {
2933+        errln("Enumeration failed: %s\n", u_errorName(status));
2934+        goto cleanup;
2935+    }
2936+
2937+    if (originalWord != cloneWord) {
2938+        errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
2939+        goto cleanup;
2940+    }
2941+
2942+    // Test the data copying constructor for CompactTrieDict, and the data access APIs.
2943+    compact2 = new CompactTrieDictionary(compactDict->data(), status);
2944+    if (U_FAILURE(status)) {
2945+        errln("CompactTrieDictionary(const void *,...) failed\n");
2946+        goto cleanup;
2947+    }
2948+
2949+    if (compact2->dataSize() == 0) {
2950+        errln("CompactTrieDictionary->dataSize() == 0\n");
2951+        goto cleanup;
2952+    }
2953+
2954+    // Now count the words via the second dictionary
2955+    delete enumer1;
2956+    enumer1 = compact2->openWords(status);
2957+    if (U_FAILURE(status)) {
2958+        errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
2959+        goto cleanup;
2960+    }
2961+
2962+    if (wordCount != (testCount = enumer1->count(status))) {
2963+        errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
2964+                testCount, wordCount, u_errorName(status));
2965+        goto cleanup;
2966+    }
2967+
2968+    cleanup:
2969+    delete compactDict;
2970+    delete mutableDict;
2971+    delete breaks;
2972+    delete[] testFile;
2973+    delete enumer1;
2974+    delete enumer2;   // still live if an early goto skipped the mid-function delete
2975+    delete mutable2;
2976+    delete cloneEnum;
2977+    delete compact2;
2978+    delete nf;        // previously never released
2979+    utext_close(originalText);
2980+    utext_close(cloneText);
2981+}
2982
2983 //----------------------------------------------------------------------------
2984 //
2985@@ -1870,8 +2243,15 @@
2986 // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
2987 static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
2988                                         "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
2989+#if 0
2990 static const int32_t jaWordTOffsets[] = {    2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28 };
2991 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 };
2992+#endif
2993+// There's no separate Japanese word break iterator. Root is the same as Japanese.
2994+// Our dictionary-based iterator has to be tweaked to better handle U+3005,
2995+// U+3007, U+303B and some other cases.
2996+static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
2997+static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5,    7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
2998
2999 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
3000 // Add break after Greek question mark (cldrbug #2069).
3001@@ -2672,6 +3052,8 @@
3002     UnicodeSet  *fNewlineSet;
3003     UnicodeSet  *fKatakanaSet;
3004     UnicodeSet  *fALetterSet;
3005+    // TODO(jungshik): Do we still need this change?
3006+    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
3007     UnicodeSet  *fMidNumLetSet;
3008     UnicodeSet  *fMidLetterSet;
3009     UnicodeSet  *fMidNumSet;
3010@@ -2680,6 +3062,7 @@
3011     UnicodeSet  *fOtherSet;
3012     UnicodeSet  *fExtendSet;
3013     UnicodeSet  *fExtendNumLetSet;
3014+    UnicodeSet  *fDictionaryCjkSet;
3015
3016     RegexMatcher  *fMatcher;
3017
3018@@ -2696,12 +3079,24 @@
3019     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
3020     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
3021     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
3022-    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
3023+    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
3024+    // Exclude Hangul syllables from ALetterSet during testing.
3025+    // Leave CJK dictionary characters out from the monkey tests!
3026+#if 0
3027+    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
3028+                                      "[\\p{Line_Break = Complex_Context}"
3029+                                      "-\\p{Grapheme_Cluster_Break = Extend}"
3030+                                      "-\\p{Grapheme_Cluster_Break = Control}"
3031+                                      "]]",
3032+                                      status);
3033+#endif
3034+    fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
3035+    fALetterSet->removeAll(*fDictionaryCjkSet);
3036     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
3037     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
3038     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
3039     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
3040-    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
3041+    fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"),      status);
3042     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
3043     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
3044     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
3045@@ -2725,13 +3120,14 @@
3046     fOtherSet->removeAll(*fFormatSet);
3047     fOtherSet->removeAll(*fExtendSet);
3048     // Inhibit dictionary characters from being tested at all.
3049+    fOtherSet->removeAll(*fDictionaryCjkSet);
3050     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
3051
3052     fSets->addElement(fCRSet,        status);
3053     fSets->addElement(fLFSet,        status);
3054     fSets->addElement(fNewlineSet,   status);
3055     fSets->addElement(fALetterSet,   status);
3056-    fSets->addElement(fKatakanaSet,  status);
3057+    //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
3058     fSets->addElement(fMidLetterSet, status);
3059     fSets->addElement(fMidNumLetSet, status);
3060     fSets->addElement(fMidNumSet,    status);
3061@@ -3978,6 +4374,7 @@
3062     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3063         count --;
3064         if (forward[count] != i) {
3065+            printStringBreaks(ustr, expected, expectedcount);
3066             test->errln("happy break test previous() failed: expected %d but got %d",
3067                         forward[count], i);
3068             break;
3069@@ -4011,23 +4408,25 @@
3070     UErrorCode    status = U_ZERO_ERROR;
3071     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3072     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3073+    // Replaced any C+J characters in a row with a random sequence of characters
3074+    // of the same length to make our C+J segmentation not get in the way.
3075     static const char *strlist[] =
3076     {
3077     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3078-    "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3079+    "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3080     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3081     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3082-    "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3083+    "\\uac00\\u3588\\u009c\\u0953\\u194b",
3084     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3085     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3086-    "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3087+    "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3088     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3089     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3090     "\\u2027\\U000e0067\\u0a47\\u00b7",
3091     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3092     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3093     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3094-    "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3095+    "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3096     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3097     "\\u0027\\u11af\\U000e0057\\u0602",
3098     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3099@@ -4039,7 +4438,7 @@
3100     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3101     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3102     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3103-    "\\u58f4\\U000e0049\\u20e7\\u2027",
3104+    "\\u18f4\\U000e0049\\u20e7\\u2027",
3105     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3106     "\\ua183\\u102d\\u0bec\\u003a",
3107     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3108@@ -4049,7 +4448,7 @@
3109     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3110     "\\u003a\\u0664\\u00b7\\u1fba",
3111     "\\u003b\\u0027\\u00b7\\u47a3",
3112-    "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3113+    "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3114     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3115     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3116     };
3117@@ -4104,12 +4503,12 @@
3118     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3119     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3120     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3121-    "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3122+    "\\U000e0065\\u302c\\u09ee\\U000e0068",
3123     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3124     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3125     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3126     "\\u58f4\\U000e0049\\u20e7\\u2027",
3127-    "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3128+    "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3129     "\\ua183\\u102d\\u0bec\\u003a",
3130     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3131     "\\u003a\\u0e57\\u0fad\\u002e",
3132--- source/test/intltest/rbbitst.h	2010-07-22 17:15:37.000000000 -0700
3133+++ source/test/intltest/rbbitst.h	2011-01-21 14:12:45.152007000 -0800
3134@@ -70,6 +70,7 @@
3135     void TestBug5775();
3136     void TestThaiBreaks();
3137     void TestTailoredBreaks();
3138+    void TestTrieDictWithValue();
3139     void TestDictRules();
3140     void TestBug5532();
3141
3142--- source/test/testdata/rbbitst.txt	2010-07-28 17:18:28.000000000 -0700
3143+++ source/test/testdata/rbbitst.txt	2011-01-21 14:12:45.221011000 -0800
3144@@ -161,7 +161,23 @@
3145 <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
3146
3147 # Hiragana & Katakana stay together, but separates from each other and Latin.
3148-<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
3149+# *** what to do about theoretical combos of chars? i.e. hiragana + accent
3150+#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data>
3151+
3152+# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth
3153+<data>•芽キャベツ<400>芽キャベツ<400></data>
3154+
3155+# more Japanese tests
3156+# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana
3157+# and the Katakana block are not treated correctly. Enable this later.
3158+#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
3159+<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data>
3160+
3161+# Testing of word boundary for dictionary word containing both kanji and kana
3162+<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data>
3163+
3164+# Testing of Chinese segmentation (taken from a Chinese news article)
3165+<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data>
3166
3167 # Words with interior formatting characters
3168 <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
3169@@ -169,6 +185,8 @@
3170 # to test for bug #4097779
3171 <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
3172
3173+# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts
3174+<data>•ISN'T<200> •19<100>日<400></data>
3175
3176 #      to test for bug #4098467
3177 #      What follows is a string of Korean characters (I found it in the Yellow Pages
3178@@ -178,9 +196,15 @@
3179 #      precomposed syllables...
3180 <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
3181
3182-<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data>
3183+# more Korean tests (Jamo not tested here, not counted as dictionary characters)
3184+# Disable them now because we don't include a Korean dictionary.
3185+#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data>
3186+#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data>
3187+
3188+<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data>
3189+
3190+<data>•\u06c9<200>\uc799<200>\ufffa•</data>
3191
3192-<data>•\u06c9\uc799\ufffa<200></data>
3193
3194 #
3195 #      Try some words from other scripts.
3196@@ -491,8 +515,7 @@
3197 <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
3198
3199 #      conjoining jamo...
3200-#      TODO:  rules update needed
3201-#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
3202+<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data>
3203
3204 #      to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
3205 <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
3206--- source/test/testdata/testaliases.txt	2009-11-12 13:53:42.000000000 -0800
3207+++ source/test/testdata/testaliases.txt	2011-01-21 14:12:45.204005000 -0800
3208@@ -28,7 +28,7 @@
3209     LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
3210
3211     // aliasing using position
3212-    boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle
3213+    boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle
3214
3215     // aliasing arrays
3216     zoneTests {
3217--- source/tools/genctd/genctd.cpp	2009-08-04 14:09:17.000000000 -0700
3218+++ source/tools/genctd/genctd.cpp	2011-01-21 14:12:45.564923000 -0800
3219@@ -1,6 +1,6 @@
3220 /*
3221 **********************************************************************
3222-*   Copyright (C) 2002-2009, International Business Machines
3223+*   Copyright (C) 2002-2010, International Business Machines
3224 *   Corporation and others.  All Rights Reserved.
3225 **********************************************************************
3226 *
3227@@ -34,12 +34,15 @@
3228 #include "unicode/udata.h"
3229 #include "unicode/putil.h"
3230
3231+//#include "unicode/ustdio.h"
3232+
3233 #include "uoptions.h"
3234 #include "unewdata.h"
3235 #include "ucmndata.h"
3236 #include "rbbidata.h"
3237 #include "triedict.h"
3238 #include "cmemory.h"
3239+#include "uassert.h"
3240
3241 #include <stdio.h>
3242 #include <stdlib.h>
3243@@ -199,147 +202,191 @@
3244     long        wordFileSize;
3245     FILE        *file;
3246     char        *wordBufferC;
3247-
3248+    MutableTrieDictionary *mtd = NULL;
3249+
3250     file = fopen(wordFileName, "rb");
3251-    if( file == 0 ) {
3252-        fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
3253-        exit(-1);
3254-    }
3255-    fseek(file, 0, SEEK_END);
3256-    wordFileSize = ftell(file);
3257-    fseek(file, 0, SEEK_SET);
3258-    wordBufferC = new char[wordFileSize+10];
3259-
3260-    result = (long)fread(wordBufferC, 1, wordFileSize, file);
3261-    if (result != wordFileSize)  {
3262-        fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
3263-        exit (-1);
3264-    }
3265-    wordBufferC[wordFileSize]=0;
3266-    fclose(file);
3267-
3268-    //
3269-    // Look for a Unicode Signature (BOM) on the word file
3270-    //
3271-    int32_t        signatureLength;
3272-    const char *   wordSourceC = wordBufferC;
3273-    const char*    encoding = ucnv_detectUnicodeSignature(
3274-                           wordSourceC, wordFileSize, &signatureLength, &status);
3275-    if (U_FAILURE(status)) {
3276-        exit(status);
3277-    }
3278-    if(encoding!=NULL ){
3279-        wordSourceC  += signatureLength;
3280-        wordFileSize -= signatureLength;
3281-    }
3282-
3283-    //
3284-    // Open a converter to take the rule file to UTF-16
3285-    //
3286-    UConverter* conv;
3287-    conv = ucnv_open(encoding, &status);
3288-    if (U_FAILURE(status)) {
3289-        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
3290-        exit(status);
3291-    }
3292-
3293-    //
3294-    // Convert the words to UChar.
3295-    //  Preflight first to determine required buffer size.
3296-    //
3297-    uint32_t destCap = ucnv_toUChars(conv,
3298-                       NULL,           //  dest,
3299-                       0,              //  destCapacity,
3300-                       wordSourceC,
3301-                       wordFileSize,
3302-                       &status);
3303-    if (status != U_BUFFER_OVERFLOW_ERROR) {
3304-        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3305-        exit(status);
3306-    };
3307-
3308-    status = U_ZERO_ERROR;
3309-    UChar *wordSourceU = new UChar[destCap+1];
3310-    ucnv_toUChars(conv,
3311-                  wordSourceU,     //  dest,
3312-                  destCap+1,
3313-                  wordSourceC,
3314-                  wordFileSize,
3315-                  &status);
3316-    if (U_FAILURE(status)) {
3317-        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3318-        exit(status);
3319-    };
3320-    ucnv_close(conv);
3321-
3322-    // Get rid of the original file buffer
3323-    delete[] wordBufferC;
3324-
3325-    // Create a MutableTrieDictionary, and loop through all the lines, inserting
3326-    // words.
3327-
3328-    // First, pick a median character.
3329-    UChar *current = wordSourceU + (destCap/2);
3330-    UChar uc = *current++;
3331-    UnicodeSet breaks;
3332-    breaks.add(0x000A);     // Line Feed
3333-    breaks.add(0x000D);     // Carriage Return
3334-    breaks.add(0x2028);     // Line Separator
3335-    breaks.add(0x2029);     // Paragraph Separator
3336-
3337-    do {
3338-        // Look for line break
3339-        while (uc && !breaks.contains(uc)) {
3340-            uc = *current++;
3341-        }
3342-        // Now skip to first non-line-break
3343-        while (uc && breaks.contains(uc)) {
3344-            uc = *current++;
3345+    if( file == 0 ) { //cannot find file
3346+        // The word list could not be opened: rather than aborting, build a
3347+        // one-entry placeholder dictionary (a single U+0020 with value 1) so
3348+        // that the compact-trie construction below still has something to pack.
3349+
3350+        // Print straight to stderr. Formatting into a fixed-size buffer first
3351+        // could overflow it when the two file names are long.
3352+        fprintf(stderr, "%s not found, genctd writes dummy %s\n", wordFileName, outFileName);
3353+
3354+        UChar c = 0x0020;
3355+        mtd = new MutableTrieDictionary(c, status, TRUE);
3356+        mtd->addWord(&c, 1, status, 1);
3357+
3358+    } else { //read words in from input file
3359+        fseek(file, 0, SEEK_END);
3360+        wordFileSize = ftell(file);
3361+        fseek(file, 0, SEEK_SET);
3362+        wordBufferC = new char[wordFileSize+10];
3363+
3364+        result = (long)fread(wordBufferC, 1, wordFileSize, file);
3365+        if (result != wordFileSize)  {
3366+            fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
3367+            exit (-1);
3368         }
3369-    }
3370-    while (uc && (breaks.contains(uc) || u_isspace(uc)));
3371-
3372-    MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
3373+        wordBufferC[wordFileSize]=0;
3374+        fclose(file);
3375
3376-    if (U_FAILURE(status)) {
3377-        fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
3378-        exit(status);
3379-    }
3380+        //
3381+        // Look for a Unicode Signature (BOM) on the word file
3382+        //
3383+        int32_t        signatureLength;
3384+        const char *   wordSourceC = wordBufferC;
3385+        const char*    encoding = ucnv_detectUnicodeSignature(
3386+                               wordSourceC, wordFileSize, &signatureLength, &status);
3387+        if (U_FAILURE(status)) {
3388+            exit(status);
3389+        }
3390+        if(encoding!=NULL ){
3391+            wordSourceC  += signatureLength;
3392+            wordFileSize -= signatureLength;
3393+        }
3394
3395-    // Now add the words. Words are non-space characters at the beginning of
3396-    // lines, and must be at least one UChar.
3397-    current = wordSourceU;
3398-    UChar *candidate = current;
3399-    uc = *current++;
3400-    int32_t length = 0;
3401-
3402-    while (uc) {
3403-        while (uc && !u_isspace(uc)) {
3404-            ++length;
3405-            uc = *current++;
3406+        //
3407+        // Open a converter to take the rule file to UTF-16
3408+        //
3409+        UConverter* conv;
3410+        conv = ucnv_open(encoding, &status);
3411+        if (U_FAILURE(status)) {
3412+            fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
3413+            exit(status);
3414         }
3415-        if (length > 0) {
3416-            mtd->addWord(candidate, length, status);
3417-            if (U_FAILURE(status)) {
3418-                fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
3419-                        u_errorName(status));
3420-                exit(status);
3421+
3422+        //
3423+        // Convert the words to UChar.
3424+        //  Preflight first to determine required buffer size.
3425+        //
3426+        uint32_t destCap = ucnv_toUChars(conv,
3427+                           NULL,           //  dest,
3428+                           0,              //  destCapacity,
3429+                           wordSourceC,
3430+                           wordFileSize,
3431+                           &status);
3432+        if (status != U_BUFFER_OVERFLOW_ERROR) {
3433+            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3434+            exit(status);
3435+        };
3436+
3437+        status = U_ZERO_ERROR;
3438+        UChar *wordSourceU = new UChar[destCap+1];
3439+        ucnv_toUChars(conv,
3440+                      wordSourceU,     //  dest,
3441+                      destCap+1,
3442+                      wordSourceC,
3443+                      wordFileSize,
3444+                      &status);
3445+        if (U_FAILURE(status)) {
3446+            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3447+            exit(status);
3448+        };
3449+        ucnv_close(conv);
3450+
3451+        // Get rid of the original file buffer
3452+        delete[] wordBufferC;
3453+
3454+        // Create a MutableTrieDictionary, and loop through all the lines, inserting
3455+        // words.
3456+
3457+        // First, pick a median character.
3458+        UChar *current = wordSourceU + (destCap/2);
3459+        UChar uc = *current++;
3460+        UnicodeSet breaks;
3461+        breaks.add(0x000A);     // Line Feed
3462+        breaks.add(0x000D);     // Carriage Return
3463+        breaks.add(0x2028);     // Line Separator
3464+        breaks.add(0x2029);     // Paragraph Separator
3465+
3466+        do {
3467+            // Look for line break
3468+            while (uc && !breaks.contains(uc)) {
3469+                uc = *current++;
3470+            }
3471+            // Now skip to first non-line-break
3472+            while (uc && breaks.contains(uc)) {
3473+                uc = *current++;
3474             }
3475         }
3476-        // Find beginning of next line
3477-        while (uc && !breaks.contains(uc)) {
3478-            uc = *current++;
3479+        while (uc && (breaks.contains(uc) || u_isspace(uc)));
3480+
3481+        mtd = new MutableTrieDictionary(uc, status);
3482+
3483+        if (U_FAILURE(status)) {
3484+            fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
3485+            exit(status);
3486         }
3487-        while (uc && breaks.contains(uc)) {
3488-            uc = *current++;
3489+
3490+        // Now add the words. Words are non-space characters at the beginning of
3491+        // lines, and must be at least one UChar. If a word has an associated value,
3492+        // the value should follow the word on the same line after a tab character.
3493+        current = wordSourceU;
3494+        UChar *candidate = current;
3495+        uc = *current++;
3496+        int32_t length = 0;
3497+        int count = 0;
3498+
3499+        while (uc) {
3500+            while (uc && !u_isspace(uc)) {
3501+                ++length;
3502+                uc = *current++;
3503+            }
3504+
3505+            UnicodeString valueString;
3506+            UChar candidateValue;
3507+            if(uc == 0x0009){ //separator is a tab char, read in number after space
3508+            	while (uc && u_isspace(uc)) {
3509+            		uc = *current++;
3510+            	}
3511+                while (uc && !u_isspace(uc)) {
3512+                    valueString.append(uc);
3513+                    uc = *current++;
3514+                }
3515+            }
3516+
3517+            if (length > 0) {
3518+                count++;
3519+                if(valueString.length() > 0){
3520+                    mtd->setValued(TRUE);
3521+                    uint32_t value = 0;  // unsigned decimal parsed from the text after the tab
3522+                    char* s = new char[valueString.length() + 1];  // +1: sscanf needs a NUL-terminated string
3523+                    valueString.extract(0, valueString.length(), s, valueString.length() + 1);
3524+                    s[valueString.length()] = 0;  // terminate even if extract() filled the whole buffer
3525+                    int n = sscanf(s, "%u", &value);
3526+                    U_ASSERT(n == 1);
3527+                    U_ASSERT(value <= 0xFFFF);  // must survive the uint16_t narrowing below
3528+                    mtd->addWord(candidate, length, status, (uint16_t)value);
3529+                    delete[] s;
3530+                } else {
3531+                    mtd->addWord(candidate, length, status);
3532+                }
3533+
3534+                if (U_FAILURE(status)) {
3535+                    fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
3536+                            u_errorName(status), count);
3537+                    exit(status);
3538+                }
3539+            }
3540+
3541+            // Find beginning of next line
3542+            while (uc && !breaks.contains(uc)) {
3543+                uc = *current++;
3544+            }
3545+            // Find next non-line-breaking character
3546+            while (uc && breaks.contains(uc)) {
3547+                uc = *current++;
3548+            }
3549+            candidate = current-1;
3550+            length = 0;
3551         }
3552-        candidate = current-1;
3553-        length = 0;
3554+
3555+        // Get rid of the Unicode text buffer
3556+        delete[] wordSourceU;
3557     }
3558
3559-    // Get rid of the Unicode text buffer
3560-    delete[] wordSourceU;
3561-
3562     // Now, create a CompactTrieDictionary from the mutable dictionary
3563     CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
3564     if (U_FAILURE(status)) {
3565@@ -393,4 +440,3 @@
3566
3567 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
3568 }
3569-
3570--- source/tools/genctd/Makefile.in	2006-12-16 13:07:01.000000000 -0800
3571+++ source/tools/genctd/Makefile.in	2011-01-21 14:12:45.555920000 -0800
3572@@ -23,13 +23,13 @@
3573 ## Extra files to remove for 'make clean'
3574 CLEANFILES = *~ $(DEPS) $(MAN_FILES)
3575
3576-## Target information
3577+## Target information
3578 TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
3579
3580 ifneq ($(top_builddir),$(top_srcdir))
3581 CPPFLAGS += -I$(top_builddir)/common
3582 endif
3583-CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
3584+CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n
3585 LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
3586
3587 OBJECTS = genctd.o
3588