--- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800 +++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800 @@ -226,6 +226,30 @@ case USCRIPT_THAI: engine = new ThaiBreakEngine(dict, status); break; + + case USCRIPT_HANGUL: + engine = new CjkBreakEngine(dict, kKorean, status); + break; + + // use same BreakEngine and dictionary for both Chinese and Japanese + case USCRIPT_HIRAGANA: + case USCRIPT_KATAKANA: + case USCRIPT_HAN: + engine = new CjkBreakEngine(dict, kChineseJapanese, status); + break; +#if 0 + // TODO: Have to get some characters with script=common handled + // by CjkBreakEngine (e.g. U+309B). Simply subjecting + // them to CjkBreakEngine does not work. The engine has to + // special-case them. + case USCRIPT_COMMON: + { + UBlockCode block = ublock_getCode(code); + if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) + engine = new CjkBreakEngine(dict, kChineseJapanese, status); + break; + } +#endif default: break; } @@ -281,6 +305,13 @@ dict = NULL; } return dict; + } else if (dictfname != NULL){ + //create dummy dict if dictionary filename not valid + UChar c = 0x0020; + status = U_ZERO_ERROR; + MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE); + mtd->addWord(&c, 1, status, 1); + return new CompactTrieDictionary(*mtd, status); } return NULL; } --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700 +++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800 @@ -16,6 +16,9 @@ #include "unicode/ubrk.h" #include "uvector.h" #include "triedict.h" +#include "uassert.h" +#include "unicode/normlzr.h" +#include "cmemory.h" U_NAMESPACE_BEGIN @@ -422,6 +425,294 @@ return wordsFound; } +/* + ****************************************************************** + * CjkBreakEngine + */ +static const uint32_t kuint32max = 0xFFFFFFFF; +CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status) +: DictionaryBreakEngine(1<getValued()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + // Korean dictionary only includes Hangul syllables + fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); + fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); + fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); + fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); + + if (U_SUCCESS(status)) { + // handle Korean and Japanese/Chinese using different dictionaries + if (type == kKorean) { + setCharacters(fHangulWordSet); + } else { //Chinese and Japanese + UnicodeSet cjSet; + cjSet.addAll(fHanWordSet); + cjSet.addAll(fKatakanaWordSet); + cjSet.addAll(fHiraganaWordSet); + cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); + setCharacters(cjSet); + } + } +} + +CjkBreakEngine::~CjkBreakEngine(){ + delete fDictionary; +} + +// The katakanaCost values below are based on the length frequencies of all +// katakana phrases in the dictionary +static const int kMaxKatakanaLength = 8; +static const int kMaxKatakanaGroupLength = 20; +static const uint32_t maxSnlp = 255; + +static inline uint32_t getKatakanaCost(int wordLength){ + //TODO: fill array with actual values from dictionary! + static const uint32_t katakanaCost[kMaxKatakanaLength + 1] + = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; + return (wordLength > kMaxKatakanaLength) ? 
8192 : katakanaCost[wordLength]; +} + +static inline bool isKatakana(uint16_t value) { + return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) || + (value >= 0xFF66u && value <= 0xFF9fu); +} + +// A very simple helper class to streamline the buffer handling in +// divideUpDictionaryRange. +template +class AutoBuffer { + public: + AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { + if (size > N) { + buffer = reinterpret_cast(uprv_malloc(sizeof(T)*size)); + capacity = size; + } + } + ~AutoBuffer() { + if (buffer != stackBuffer) + uprv_free(buffer); + } +#if 0 + T* operator& () { + return buffer; + } +#endif + T* elems() { + return buffer; + } + const T& operator[] (size_t i) const { + return buffer[i]; + } + T& operator[] (size_t i) { + return buffer[i]; + } + + // resize without copy + void resize(size_t size) { + if (size <= capacity) + return; + if (buffer != stackBuffer) + uprv_free(buffer); + buffer = reinterpret_cast(uprv_malloc(sizeof(T)*size)); + capacity = size; + } + private: + T stackBuffer[N]; + T* buffer; + AutoBuffer(); + size_t capacity; +}; + + +/* + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output of C array of int32_t break positions, or 0 + * @return The number of breaks found + */ +int32_t +CjkBreakEngine::divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const { + if (rangeStart >= rangeEnd) { + return 0; + } + + const size_t defaultInputLength = 80; + size_t inputLength = rangeEnd - rangeStart; + AutoBuffer charString(inputLength); + + // Normalize the input string and put it in normalizedText. + // The map from the indices of the normalized input to the raw + // input is kept in charPositions. + UErrorCode status = U_ZERO_ERROR; + utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); + if (U_FAILURE(status)) + return 0; + + UnicodeString inputString(charString.elems(), inputLength); + UNormalizationMode norm_mode = UNORM_NFKC; + UBool isNormalized = + Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || + Normalizer::isNormalized(inputString, norm_mode, status); + + AutoBuffer charPositions(inputLength + 1); + int numChars = 0; + UText normalizedText = UTEXT_INITIALIZER; + // Needs to be declared here because normalizedText holds onto its buffer. + UnicodeString normalizedString; + if (isNormalized) { + int32_t index = 0; + charPositions[0] = 0; + while(index < inputString.length()) { + index = inputString.moveIndex32(index, 1); + charPositions[++numChars] = index; + } + utext_openUnicodeString(&normalizedText, &inputString, &status); + } + else { + Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status); + if (U_FAILURE(status)) + return 0; + charPositions.resize(normalizedString.length() + 1); + Normalizer normalizer(charString.elems(), inputLength, norm_mode); + int32_t index = 0; + charPositions[0] = 0; + while(index < normalizer.endIndex()){ + UChar32 uc = normalizer.next(); + charPositions[++numChars] = index = normalizer.getIndex(); + } + utext_openUnicodeString(&normalizedText, &normalizedString, &status); + } + + if (U_FAILURE(status)) + return 0; + + // From this point on, all the indices refer to the indices of + // the normalized input string. + + // bestSnlp[i] is the snlp of the best segmentation of the first i + // characters in the range to be matched. 
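// The bestSnlp[]/prev[] arrays declared just below implement a textbook
// least-cost (Viterbi-style) segmentation: bestSnlp[i] is the cheapest way to
// cover the first i characters, and prev[i] remembers where the last word of
// that best segmentation starts, so boundaries can be recovered by walking
// prev[] backwards. The following is a self-contained sketch of the same idea
// over a plain std::string with a toy cost table; every name, word and cost in
// it is invented, and it illustrates the technique only, it is not the ICU code.

#include <climits>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    const std::string text = "abcab";
    std::map<std::string, unsigned> dict;            // word -> cost (lower = more likely)
    dict["ab"] = 10; dict["cab"] = 25; dict["c"] = 40; dict["b"] = 60;

    const unsigned kInfinity = UINT_MAX;
    const unsigned kFallbackCost = 255;               // like maxSnlp for unmatched characters
    const size_t n = text.size();
    std::vector<unsigned> bestCost(n + 1, kInfinity);
    std::vector<int> prev(n + 1, -1);
    bestCost[0] = 0;

    for (size_t i = 0; i < n; ++i) {
        if (bestCost[i] == kInfinity) continue;       // position not reachable
        for (size_t len = 1; i + len <= n; ++len) {   // try every dictionary word starting at i
            std::map<std::string, unsigned>::const_iterator it = dict.find(text.substr(i, len));
            if (it == dict.end()) continue;
            unsigned cost = bestCost[i] + it->second;
            if (cost < bestCost[i + len]) { bestCost[i + len] = cost; prev[i + len] = (int)i; }
        }
        // If the dictionary has no 1-character word here, allow a high-cost
        // 1-character fallback so the walk can always make progress.
        if (dict.find(text.substr(i, 1)) == dict.end() &&
            bestCost[i] + kFallbackCost < bestCost[i + 1]) {
            bestCost[i + 1] = bestCost[i] + kFallbackCost;
            prev[i + 1] = (int)i;
        }
    }

    // Recover the boundaries back-to-front, then print them in order,
    // the same way t_boundary[] is filled and then reversed below.
    std::vector<size_t> bounds;
    for (int i = (int)n; i > 0; i = prev[i]) bounds.push_back((size_t)i);
    for (size_t k = bounds.size(); k-- > 0; ) std::printf("break after %u\n", (unsigned)bounds[k]);
    return 0;
}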
+ AutoBuffer bestSnlp(numChars + 1); + bestSnlp[0] = 0; + for(int i=1; i<=numChars; i++){ + bestSnlp[i] = kuint32max; + } + + // prev[i] is the index of the last CJK character in the previous word in + // the best segmentation of the first i characters. + AutoBuffer prev(numChars + 1); + for(int i=0; i<=numChars; i++){ + prev[i] = -1; + } + + const size_t maxWordSize = 20; + AutoBuffer values(numChars); + AutoBuffer lengths(numChars); + + // Dynamic programming to find the best segmentation. + bool is_prev_katakana = false; + for (int i = 0; i < numChars; ++i) { + //utext_setNativeIndex(text, rangeStart + i); + utext_setNativeIndex(&normalizedText, i); + if (bestSnlp[i] == kuint32max) + continue; + + int count; + // limit maximum word length matched to size of current substring + int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i; + + fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); + + // if there are no single character matches found in the dictionary + // starting with this charcter, treat character as a 1-character word + // with the highest value possible, i.e. the least likely to occur. + // Exclude Korean characters from this treatment, as they should be left + // together by default. + if((count == 0 || lengths[0] != 1) && + !fHangulWordSet.contains(utext_current32(&normalizedText))){ + values[count] = maxSnlp; + lengths[count++] = 1; + } + + for (int j = 0; j < count; j++){ + //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp); + uint32_t newSnlp = bestSnlp[i] + values[j]; + if (newSnlp < bestSnlp[lengths[j] + i]) { + bestSnlp[lengths[j] + i] = newSnlp; + prev[lengths[j] + i] = i; + } + } + + // In Japanese, + // Katakana word in single character is pretty rare. So we apply + // the following heuristic to Katakana: any continuous run of Katakana + // characters is considered a candidate word with a default cost + // specified in the katakanaCost table according to its length. + //utext_setNativeIndex(text, rangeStart + i); + utext_setNativeIndex(&normalizedText, i); + bool is_katakana = isKatakana(utext_current32(&normalizedText)); + if (!is_prev_katakana && is_katakana) { + int j = i + 1; + utext_next32(&normalizedText); + // Find the end of the continuous run of Katakana characters + while (j < numChars && (j - i) < kMaxKatakanaGroupLength && + isKatakana(utext_current32(&normalizedText))) { + utext_next32(&normalizedText); + ++j; + } + if ((j - i) < kMaxKatakanaGroupLength) { + uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); + if (newSnlp < bestSnlp[j]) { + bestSnlp[j] = newSnlp; + prev[j] = i; + } + } + } + is_prev_katakana = is_katakana; + } + + // Start pushing the optimal offset index into t_boundary (t for tentative). + // prev[numChars] is guaranteed to be meaningful. + // We'll first push in the reverse order, i.e., + // t_boundary[0] = numChars, and afterwards do a swap. + AutoBuffer t_boundary(numChars + 1); + + int numBreaks = 0; + // No segmentation found, set boundary to end of range + if (bestSnlp[numChars] == kuint32max) { + t_boundary[numBreaks++] = numChars; + } else { + for (int i = numChars; i > 0; i = prev[i]){ + t_boundary[numBreaks++] = i; + + } + U_ASSERT(prev[t_boundary[numBreaks-1]] == 0); + } + + // Reverse offset index in t_boundary. + // Don't add a break for the start of the dictionary range if there is one + // there already. 
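// The scratch arrays used above (charString, charPositions, bestSnlp, prev,
// values, lengths, t_boundary) all rely on the AutoBuffer helper defined
// earlier in this file, which keeps short inputs in a fixed stack array and
// only heap-allocates for longer runs. Below is a minimal standalone sketch of
// that small-buffer pattern; the names are invented and plain malloc/free
// stand in for uprv_malloc/uprv_free, so it is an illustration under those
// assumptions, not the ICU class itself.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

template <class T, size_t N>
class SmallBuffer {
public:
    explicit SmallBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
        if (size > N) {                                   // too large for the stack array
            buffer = static_cast<T*>(std::malloc(sizeof(T) * size));
            capacity = size;
        }
    }
    ~SmallBuffer() {
        if (buffer != stackBuffer) std::free(buffer);     // heap case only
    }
    T* elems() { return buffer; }
    T& operator[](size_t i) { return buffer[i]; }
    size_t getCapacity() const { return capacity; }
private:
    T stackBuffer[N];                                     // used for the common short-input case
    T* buffer;
    size_t capacity;
    SmallBuffer(const SmallBuffer&);                      // non-copyable, like the original
    SmallBuffer& operator=(const SmallBuffer&);
};

int main() {
    SmallBuffer<int, 80> small(10);    // stays on the stack
    SmallBuffer<int, 80> large(500);   // spills to the heap
    small[0] = 1;
    large[499] = 2;
    std::printf("%u %u\n", (unsigned)small.getCapacity(), (unsigned)large.getCapacity());
    return 0;
}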
+ if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { + t_boundary[numBreaks++] = 0; + } + + // Now that we're done, convert positions in t_bdry[] (indices in + // the normalized input string) back to indices in the raw input string + // while reversing t_bdry and pushing values to foundBreaks. + for (int i = numBreaks-1; i >= 0; i--) { + foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); + } + + utext_close(&normalizedText); + return numBreaks; +} + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700 +++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800 @@ -1,8 +1,8 @@ /** - ******************************************************************************* - * Copyright (C) 2006, International Business Machines Corporation and others. * - * All Rights Reserved. * - ******************************************************************************* + ********************************************************************************** + * Copyright (C) 2006-2010, International Business Machines Corporation and others. + * All Rights Reserved. + ********************************************************************************** */ #ifndef DICTBE_H @@ -65,31 +65,31 @@ */ virtual ~DictionaryBreakEngine(); - /** - *
Indicate whether this engine handles a particular character for - * a particular kind of break.
- * - * @param c A character which begins a run that the engine might handle - * @param breakType The type of text break which the caller wants to determine - * @return TRUE if this engine handles the particular character and break - * type. - */ + /** + *
Indicate whether this engine handles a particular character for + * a particular kind of break.
+ * + * @param c A character which begins a run that the engine might handle + * @param breakType The type of text break which the caller wants to determine + * @return TRUE if this engine handles the particular character and break + * type. + */ virtual UBool handles( UChar32 c, int32_t breakType ) const; - /** - *
Find any breaks within a run in the supplied text.
- * - * @param text A UText representing the text. The - * iterator is left at the end of the run of characters which the engine - * is capable of handling. - * @param startPos The start of the run within the supplied text. - * @param endPos The end of the run within the supplied text. - * @param reverse Whether the caller is looking for breaks in a reverse - * direction. - * @param breakType The type of break desired, or -1. - * @param foundBreaks An allocated C array of the breaks found, if any - * @return The number of breaks found. - */ + /** + *
Find any breaks within a run in the supplied text.
+ * + * @param text A UText representing the text. The iterator is left at + * the end of the run of characters which the engine is capable of handling + * that starts from the first (or last) character in the range. + * @param startPos The start of the run within the supplied text. + * @param endPos The end of the run within the supplied text. + * @param reverse Whether the caller is looking for breaks in a reverse + * direction. + * @param breakType The type of break desired, or -1. + * @param foundBreaks An allocated C array of the breaks found, if any + * @return The number of breaks found. + */ virtual int32_t findBreaks( UText *text, int32_t startPos, int32_t endPos, @@ -114,7 +114,7 @@ // virtual void setBreakTypes( uint32_t breakTypes ); /** - *
Divide up a range of known dictionary characters.
+ *
Divide up a range of known dictionary characters handled by this break engine.
* * @param text A UText representing the text * @param rangeStart The start of the range of dictionary characters @@ -171,7 +171,7 @@ protected: /** - *
Divide up a range of known dictionary characters.
+ *
Divide up a range of known dictionary characters handled by this break engine.
* * @param text A UText representing the text * @param rangeStart The start of the range of dictionary characters @@ -186,6 +186,66 @@ }; +/******************************************************************* + * CjkBreakEngine + */ + +//indicates language/script that the CjkBreakEngine will handle +enum LanguageType { + kKorean, + kChineseJapanese +}; + +/** + *
CjkBreakEngine is a kind of DictionaryBreakEngine that uses a + * TrieWordDictionary with costs associated with each word and + * Viterbi decoding to determine CJK-specific breaks.
+ */ +class CjkBreakEngine : public DictionaryBreakEngine { + protected: + /** + * The set of characters handled by this engine + * @internal + */ + UnicodeSet fHangulWordSet; + UnicodeSet fHanWordSet; + UnicodeSet fKatakanaWordSet; + UnicodeSet fHiraganaWordSet; + + const TrieWordDictionary *fDictionary; + + public: + + /** + *
Default constructor.
+ * + * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the + * engine is deleted. The TrieWordDictionary must contain costs for each word + * in order for the dictionary to work properly. + */ + CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status); + + /** + *
Virtual destructor.
+ */ + virtual ~CjkBreakEngine(); + + protected: + /** + *
Divide up a range of known dictionary characters handled by this break engine.
+ * + * @param text A UText representing the text + * @param rangeStart The start of the range of dictionary characters + * @param rangeEnd The end of the range of dictionary characters + * @param foundBreaks Output of C array of int32_t break positions, or 0 + * @return The number of breaks found + */ + virtual int32_t divideUpDictionaryRange( UText *text, + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const; + +}; U_NAMESPACE_END --- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700 +++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800 @@ -1555,10 +1555,12 @@ int32_t endPos, UBool reverse) { // Reset the old break cache first. - uint32_t dictionaryCount = fDictionaryCharCount; reset(); - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { + // note: code segment below assumes that dictionary chars are in the + // startPos-endPos range + // value returned should be next character in sequence + if ((endPos - startPos) <= 1) { return (reverse ? startPos : endPos); } @@ -1711,7 +1713,7 @@ // proposed break by one of the breaks we found. Use following() and // preceding() to do the work. They should never recurse in this case. if (reverse) { - return preceding(endPos - 1); + return preceding(endPos); } else { return following(startPos); --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800 +++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800 @@ -20,6 +20,7 @@ #include "uvector.h" #include "uvectr32.h" #include "uarrsort.h" +#include "hash.h" //#define DEBUG_TRIE_DICT 1 @@ -27,6 +28,11 @@ #include #include #include +#include +#ifndef CLK_TCK +#define CLK_TCK CLOCKS_PER_SEC +#endif + #endif U_NAMESPACE_BEGIN @@ -45,6 +51,11 @@ * MutableTrieDictionary */ +//#define MAX_VALUE 65535 + +// forward declaration +inline uint16_t scaleLogProbabilities(double logprob); + // Node structure for the ternary, uncompressed trie struct TernaryNode : public UMemory { UChar ch; // UTF-16 code unit @@ -77,7 +88,8 @@ delete high; } -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) { +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status, + UBool containsValue /* = FALSE */ ) { // Start the trie off with something. Having the root node already present // cuts a special case out of the search/insertion functions. 
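// MutableTrieDictionary stores its words in TernaryNode links, used throughout
// the hunks below: 'low' and 'high' branch to smaller/larger code units at the
// same string position, 'equal' advances to the next code unit, and a nonzero
// 'flags' marks the end of a word (and, with this patch, can carry the word's
// value). A standalone ternary-search-trie sketch of that structure follows;
// the names are invented, char keys stand in for UChar, and nodes are never
// freed -- an illustration only, not the ICU implementation.

#include <cstdio>

struct TstNode {
    char ch;
    unsigned short flags;                  // 0 = no word ends here; >0 = word end / value
    TstNode *low, *equal, *high;
    explicit TstNode(char c) : ch(c), flags(0), low(0), equal(0), high(0) {}
};

static TstNode* insertWord(TstNode* n, const char* s, unsigned short value) {
    if (n == 0) n = new TstNode(*s);
    if (*s < n->ch)          n->low   = insertWord(n->low, s, value);
    else if (*s > n->ch)     n->high  = insertWord(n->high, s, value);
    else if (s[1] != '\0')   n->equal = insertWord(n->equal, s + 1, value);
    else                     n->flags = value;
    return n;
}

static unsigned short lookupWord(const TstNode* n, const char* s) {
    while (n != 0) {
        if (*s < n->ch)            n = n->low;
        else if (*s > n->ch)       n = n->high;
        else if (s[1] != '\0')     { n = n->equal; ++s; }
        else                       return n->flags;
    }
    return 0;
}

int main() {
    TstNode* root = 0;
    root = insertWord(root, "cab", 30);
    root = insertWord(root, "car", 20);
    root = insertWord(root, "cat", 10);
    std::printf("car -> %u, dog -> %u\n", lookupWord(root, "car"), lookupWord(root, "dog"));
    return 0;                              // nodes deliberately leaked in this sketch
}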
// Making it a median character cuts the worse case for searches from @@ -91,14 +103,19 @@ if (U_SUCCESS(status) && fIter == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } + + fValued = containsValue; } -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) { +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status, + UBool containsValue /* = false */ ) { fTrie = NULL; fIter = utext_openUChars(NULL, NULL, 0, &status); if (U_SUCCESS(status) && fIter == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } + + fValued = containsValue; } MutableTrieDictionary::~MutableTrieDictionary() { @@ -108,12 +125,13 @@ int32_t MutableTrieDictionary::search( UText *text, - int32_t maxLength, - int32_t *lengths, - int &count, - int limit, - TernaryNode *&parent, - UBool &pMatched ) const { + int32_t maxLength, + int32_t *lengths, + int &count, + int limit, + TernaryNode *&parent, + UBool &pMatched, + uint16_t *values /*=NULL*/) const { // TODO: current implementation works in UTF-16 space const TernaryNode *up = NULL; const TernaryNode *p = fTrie; @@ -121,6 +139,10 @@ pMatched = TRUE; int i; + if (!fValued) { + values = NULL; + } + UChar uc = utext_current32(text); for (i = 0; i < maxLength && p != NULL; ++i) { while (p != NULL) { @@ -141,7 +163,11 @@ break; } // Must be equal to get here - if (limit > 0 && (p->flags & kEndsWord)) { + if (limit > 0 && (p->flags > 0)) { + //is there a more efficient way to add values? ie. remove if stmt + if(values != NULL) { + values[mycount] = p->flags; + } lengths[mycount++] = i+1; --limit; } @@ -161,13 +187,14 @@ void MutableTrieDictionary::addWord( const UChar *word, int32_t length, - UErrorCode &status ) { -#if 0 - if (length <= 0) { + UErrorCode &status, + uint16_t value /* = 0 */ ) { + // dictionary cannot store zero values, would interfere with flags + if (length <= 0 || (!fValued && value > 0) || (fValued && value == 0)) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } -#endif + TernaryNode *parent; UBool pMatched; int count; @@ -177,7 +204,7 @@ matched = search(fIter, length, NULL, count, 0, parent, pMatched); while (matched++ < length) { - UChar32 uc = utext_next32(fIter); // TODO: supplemetary support? + UChar32 uc = utext_next32(fIter); // TODO: supplementary support? 
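// Usage sketch (not part of the patch) for the valued-dictionary API added in
// this file: it mirrors the dummy-dictionary code in the brkeng.cpp hunk above,
// and assumes the accompanying triedict.h changes declare the extra
// 'containsValue'/'value'/'values' parameters that these .cpp hunks use, plus
// the default ICU namespace setup. triedict.h is ICU-internal, so this only
// builds inside the ICU source tree.

#include "unicode/utypes.h"
#include "unicode/utext.h"
#include "triedict.h"

static void valuedTrieExample() {
    UErrorCode status = U_ZERO_ERROR;

    // TRUE => every word carries a nonzero 16-bit value (e.g. a scaled cost)
    // instead of the plain kEndsWord flag.
    MutableTrieDictionary mtd(0x4E2D, status, TRUE);

    static const UChar word[] = { 0x4E2D, 0x56FD };
    mtd.addWord(word, 2, status, 150);        // value must be > 0 for a valued trie

    // Freeze into the compact, memory-mapped form (MAGIC_3 layout with values).
    CompactTrieDictionary dict(mtd, status);

    // Look the word up again; values[] receives the stored cost per match.
    UText *ut = utext_openUChars(NULL, word, 2, &status);
    int32_t lengths[8];
    uint16_t values[8];
    int count = 0;
    dict.matches(ut, 2, lengths, count, 8, values);
    utext_close(ut);
}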
U_ASSERT(uc != U_SENTINEL); TernaryNode *newNode = new TernaryNode(uc); if (newNode == NULL) { @@ -199,30 +226,23 @@ parent = newNode; } - parent->flags |= kEndsWord; -} - -#if 0 -void -MutableTrieDictionary::addWords( UEnumeration *words, - UErrorCode &status ) { - int32_t length; - const UChar *word; - while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) { - addWord(word, length, status); + if(fValued && value > 0){ + parent->flags = value; + } else { + parent->flags |= kEndsWord; } } -#endif int32_t MutableTrieDictionary::matches( UText *text, int32_t maxLength, int32_t *lengths, int &count, - int limit ) const { + int limit, + uint16_t *values /*=NULL*/) const { TernaryNode *parent; UBool pMatched; - return search(text, maxLength, lengths, count, limit, parent, pMatched); + return search(text, maxLength, lengths, count, limit, parent, pMatched, values); } // Implementation of iteration for MutableTrieDictionary @@ -277,7 +297,7 @@ break; } case kEqual: - emit = (node->flags & kEndsWord) != 0; + emit = node->flags > 0; equal = (node->equal != NULL); // If this node should be part of the next emitted string, append // the UChar to the string, and make sure we pop it when we come @@ -299,7 +319,7 @@ } case kGreaterThan: // If this node's character is in the string, remove it. - if (node->equal != NULL || (node->flags & kEndsWord)) { + if (node->equal != NULL || node->flags > 0) { unistr.truncate(unistr.length()-1); } if (node->high != NULL) { @@ -354,12 +374,75 @@ * CompactTrieDictionary */ +//TODO further optimization: +// minimise size of trie with logprobs by storing values +// for terminal nodes directly in offsets[] +// --> calculating from next offset *might* be simpler, but would have to add +// one last offset for logprob of last node +// --> if calculate from current offset, need to factor in possible overflow +// as well. 
+// idea: store in offset, set first bit to indicate logprob storage-->won't +// have to access additional node + +// {'Dic', 1}, version 1: uses old header, no values +#define COMPACT_TRIE_MAGIC_1 0x44696301 +// version 2: uses new header (more than 2^16 nodes), no values +#define COMPACT_TRIE_MAGIC_2 0x44696302 +// version 3: uses new header, includes values +#define COMPACT_TRIE_MAGIC_3 0x44696303 + struct CompactTrieHeader { uint32_t size; // Size of the data in bytes uint32_t magic; // Magic number (including version) + uint32_t nodeCount; // Number of entries in offsets[] + uint32_t root; // Node number of the root node + uint32_t offsets[1]; // Offsets to nodes from start of data +}; + +// old version of CompactTrieHeader kept for backwards compatibility +struct CompactTrieHeaderV1 { + uint32_t size; // Size of the data in bytes + uint32_t magic; // Magic number (including version) uint16_t nodeCount; // Number of entries in offsets[] uint16_t root; // Node number of the root node - uint32_t offsets[1]; // Offsets to nodes from start of data + uint32_t offsets[1]; // Offsets to nodes from start of data +}; + +// Helper class for managing CompactTrieHeader and CompactTrieHeaderV1 +struct CompactTrieInfo { + uint32_t size; // Size of the data in bytes + uint32_t magic; // Magic number (including version) + uint32_t nodeCount; // Number of entries in offsets[] + uint32_t root; // Node number of the root node + uint32_t *offsets; // Offsets to nodes from start of data + uint8_t *address; // pointer to header bytes in memory + + CompactTrieInfo(const void *data, UErrorCode &status){ + CompactTrieHeader *header = (CompactTrieHeader *) data; + if (header->magic != COMPACT_TRIE_MAGIC_1 && + header->magic != COMPACT_TRIE_MAGIC_2 && + header->magic != COMPACT_TRIE_MAGIC_3) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + size = header->size; + magic = header->magic; + + if (header->magic == COMPACT_TRIE_MAGIC_1) { + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *) header; + nodeCount = headerV1->nodeCount; + root = headerV1->root; + offsets = &(headerV1->offsets[0]); + address = (uint8_t *)headerV1; + } else { + nodeCount = header->nodeCount; + root = header->root; + offsets = &(header->offsets[0]); + address = (uint8_t *)header; + } + } + } + + ~CompactTrieInfo(){} }; // Note that to avoid platform-specific alignment issues, all members of the node @@ -375,10 +458,14 @@ enum CompactTrieNodeFlags { kVerticalNode = 0x1000, // This is a vertical node kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word - kReservedFlag1 = 0x4000, - kReservedFlag2 = 0x8000, + kExceedsCount = 0x4000, // new MSB for count >= 4096, originally kReservedFlag1 + kEqualOverflows = 0x8000, // Links to nodeIDs > 2^16, orig. kReservedFlag2 kCountMask = 0x0FFF, // The count portion of flagscount - kFlagMask = 0xF000 // The flags portion of flagscount + kFlagMask = 0xF000, // The flags portion of flagscount + kRootCountMask = 0x7FFF // The count portion of flagscount in the root node + + //offset flags: + //kOffsetContainsValue = 0x80000000 // Offset contains value for parent node }; // The two node types are distinguished by the kVerticalNode flag. 
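// The enum above packs a 12-bit count and four flag bits into the 16-bit
// flagscount field, and kEqualOverflows signals that a node's "equal" links
// need more than 16 bits: the low 16 bits stay in the node, while the high
// 4 bits of each link are packed four-to-a-uint16_t after the node body,
// most-significant nibble first (see calcEqualLink() and getLeftmostBits() in
// the hunks below). The following is a standalone round-trip sketch of both
// encodings, with invented helper names and std::vector standing in for raw
// node memory -- an illustration only, not ICU code.

#include <cassert>
#include <stdint.h>
#include <vector>

static const uint16_t kVertical   = 0x1000;
static const uint16_t kParentEnds = 0x2000;
static const uint16_t kCntMask    = 0x0FFF;

// Pack the high nibbles of 20-bit node IDs after the low 16-bit halves.
static void packLinks(const std::vector<uint32_t>& links,
                      std::vector<uint16_t>& low, std::vector<uint16_t>& overflow) {
    low.clear();
    overflow.assign((links.size() + 3) / 4, 0);
    for (size_t i = 0; i < links.size(); ++i) {
        low.push_back((uint16_t)(links[i] & 0xFFFF));
        uint16_t hi = (uint16_t)((links[i] >> 16) & 0xF);           // high 4 bits
        overflow[i / 4] |= (uint16_t)(hi << ((3 - (i % 4)) * 4));   // nibble position
    }
}

// Mirror of calcEqualLink(): rebuild the 20-bit ID from the two pieces.
static uint32_t unpackLink(const std::vector<uint16_t>& low,
                           const std::vector<uint16_t>& overflow, size_t i) {
    uint16_t hi = (uint16_t)((overflow[i / 4] >> ((3 - (i % 4)) * 4)) & 0xF);
    return low[i] + ((uint32_t)hi << 16);
}

int main() {
    // flagscount: a vertical node whose parent ends a word, holding 37 units.
    uint16_t flagscount = kVertical | kParentEnds | (37 & kCntMask);
    assert((flagscount & kCntMask) == 37);
    assert((flagscount & kVertical) && (flagscount & kParentEnds));

    // equal-link overflow: node IDs above 0xFFFF survive the nibble packing.
    std::vector<uint32_t> links;
    links.push_back(0x00012); links.push_back(0x1FFFF);
    links.push_back(0x9ABCD); links.push_back(0x10000); links.push_back(0x0FFFF);
    std::vector<uint16_t> low, overflow;
    packLinks(links, low, overflow);
    for (size_t i = 0; i < links.size(); ++i)
        assert(unpackLink(low, overflow, i) == links[i]);
    return 0;
}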
@@ -402,63 +489,177 @@ uint16_t chars[1]; // Code units }; -// {'Dic', 1}, version 1 -#define COMPACT_TRIE_MAGIC_1 0x44696301 - CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj, UErrorCode &status ) : fUData(dataObj) { - fData = (const CompactTrieHeader *) udata_getMemory(dataObj); + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); + *fInfo = CompactTrieInfo(udata_getMemory(dataObj), status); fOwnData = FALSE; - if (fData->magic != COMPACT_TRIE_MAGIC_1) { - status = U_ILLEGAL_ARGUMENT_ERROR; - fData = NULL; - } } + CompactTrieDictionary::CompactTrieDictionary( const void *data, UErrorCode &status ) : fUData(NULL) { - fData = (const CompactTrieHeader *) data; + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); + *fInfo = CompactTrieInfo(data, status); fOwnData = FALSE; - if (fData->magic != COMPACT_TRIE_MAGIC_1) { - status = U_ILLEGAL_ARGUMENT_ERROR; - fData = NULL; - } } CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict, UErrorCode &status ) : fUData(NULL) { - fData = compactMutableTrieDictionary(dict, status); + const CompactTrieHeader* header = compactMutableTrieDictionary(dict, status); + if (U_SUCCESS(status)) { + fInfo = (CompactTrieInfo *)uprv_malloc(sizeof(CompactTrieInfo)); + *fInfo = CompactTrieInfo(header, status); + } + fOwnData = !U_FAILURE(status); } CompactTrieDictionary::~CompactTrieDictionary() { if (fOwnData) { - uprv_free((void *)fData); + uprv_free((void *)(fInfo->address)); } + uprv_free((void *)fInfo); + if (fUData) { udata_close(fUData); } } +UBool CompactTrieDictionary::getValued() const{ + return fInfo->magic == COMPACT_TRIE_MAGIC_3; +} + uint32_t CompactTrieDictionary::dataSize() const { - return fData->size; + return fInfo->size; } const void * CompactTrieDictionary::data() const { - return fData; + return fInfo->address; +} + +//This function finds the address of a node for us, given its node ID +static inline const CompactTrieNode * +getCompactNode(const CompactTrieInfo *info, uint32_t node) { + if(node < info->root-1) { + return (const CompactTrieNode *)(&info->offsets[node]); + } else { + return (const CompactTrieNode *)(info->address + info->offsets[node]); + } } -// This function finds the address of a node for us, given its node ID +//this version of getCompactNode is currently only used in compactMutableTrieDictionary() static inline const CompactTrieNode * -getCompactNode(const CompactTrieHeader *header, uint16_t node) { - return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); +getCompactNode(const CompactTrieHeader *header, uint32_t node) { + if(node < header->root-1) { + return (const CompactTrieNode *)(&header->offsets[node]); + } else { + return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]); + } +} + + +/** + * Calculates the number of links in a node + * @node The specified node + */ +static inline const uint16_t +getCount(const CompactTrieNode *node){ + return (node->flagscount & kCountMask); + //use the code below if number of links ever exceed 4096 + //return (node->flagscount & kCountMask) + ((node->flagscount & kExceedsCount) >> 2); +} + +/** + * calculates an equal link node ID of a horizontal node + * @hnode The horizontal node containing the equal link + * @param index The index into hnode->entries[] + * @param nodeCount The length of hnode->entries[] + */ +static inline uint32_t calcEqualLink(const CompactTrieVerticalNode *vnode){ + if(vnode->flagscount & kEqualOverflows){ + // treat overflow bits as an 
extension of chars[] + uint16_t *overflow = (uint16_t *) &vnode->chars[getCount((CompactTrieNode*)vnode)]; + return vnode->equal + (((uint32_t)*overflow) << 16); + }else{ + return vnode->equal; + } +} + +/** + * calculates an equal link node ID of a horizontal node + * @hnode The horizontal node containing the equal link + * @param index The index into hnode->entries[] + * @param nodeCount The length of hnode->entries[] + */ +static inline uint32_t calcEqualLink(const CompactTrieHorizontalNode *hnode, uint16_t index, uint16_t nodeCount){ + if(hnode->flagscount & kEqualOverflows){ + //set overflow to point to the uint16_t containing the overflow bits + uint16_t *overflow = (uint16_t *) &hnode->entries[nodeCount]; + overflow += index/4; + uint16_t extraBits = (*overflow >> (3 - (index % 4)) * 4) % 0x10; + return hnode->entries[index].equal + (((uint32_t)extraBits) << 16); + } else { + return hnode->entries[index].equal; + } +} + +/** + * Returns the value stored in the specified node which is associated with its + * parent node. + * TODO: how to tell that value is stored in node or in offset? check whether + * node ID < fInfo->root! + */ +static inline uint16_t getValue(const CompactTrieHorizontalNode *hnode){ + uint16_t count = getCount((CompactTrieNode *)hnode); + uint16_t overflowSize = 0; //size of node ID overflow storage in bytes + + if(hnode->flagscount & kEqualOverflows) + overflowSize = (count + 3) / 4 * sizeof(uint16_t); + return *((uint16_t *)((uint8_t *)&hnode->entries[count] + overflowSize)); +} + +static inline uint16_t getValue(const CompactTrieVerticalNode *vnode){ + // calculate size of total node ID overflow storage in bytes + uint16_t overflowSize = (vnode->flagscount & kEqualOverflows)? sizeof(uint16_t) : 0; + return *((uint16_t *)((uint8_t *)&vnode->chars[getCount((CompactTrieNode *)vnode)] + overflowSize)); +} + +static inline uint16_t getValue(const CompactTrieNode *node){ + if(node->flagscount & kVerticalNode) + return getValue((const CompactTrieVerticalNode *)node); + else + return getValue((const CompactTrieHorizontalNode *)node); +} + +//returns index of match in CompactTrieHorizontalNode.entries[] using binary search +inline int16_t +searchHorizontalEntries(const CompactTrieHorizontalEntry *entries, + UChar uc, uint16_t nodeCount){ + int low = 0; + int high = nodeCount-1; + int middle; + while (high >= low) { + middle = (high+low)/2; + if (uc == entries[middle].ch) { + return middle; + } + else if (uc < entries[middle].ch) { + high = middle-1; + } + else { + low = middle+1; + } + } + + return -1; } int32_t @@ -466,17 +667,38 @@ int32_t maxLength, int32_t *lengths, int &count, - int limit ) const { + int limit, + uint16_t *values /*= NULL*/) const { + if (fInfo->magic == COMPACT_TRIE_MAGIC_2) + values = NULL; + // TODO: current implementation works in UTF-16 space - const CompactTrieNode *node = getCompactNode(fData, fData->root); + const CompactTrieNode *node = getCompactNode(fInfo, fInfo->root); int mycount = 0; UChar uc = utext_current32(text); int i = 0; + // handle root node with only kEqualOverflows flag: assume horizontal node without parent + if(node != NULL){ + const CompactTrieHorizontalNode *root = (const CompactTrieHorizontalNode *) node; + int index = searchHorizontalEntries(root->entries, uc, root->flagscount & kRootCountMask); + if(index > -1){ + node = getCompactNode(fInfo, calcEqualLink(root, index, root->flagscount & kRootCountMask)); + utext_next32(text); + uc = utext_current32(text); + ++i; + }else{ + node = NULL; + } + } + while (node != NULL) 
{ // Check if the node we just exited ends a word if (limit > 0 && (node->flagscount & kParentEndsWord)) { + if(values != NULL){ + values[mycount] = getValue(node); + } lengths[mycount++] = i; --limit; } @@ -487,7 +709,7 @@ break; } - int nodeCount = (node->flagscount & kCountMask); + int nodeCount = getCount(node); if (nodeCount == 0) { // Special terminal node; return now break; @@ -507,35 +729,27 @@ // To get here we must have come through the whole list successfully; // go on to the next node. Note that a word cannot end in the middle // of a vertical node. - node = getCompactNode(fData, vnode->equal); + node = getCompactNode(fInfo, calcEqualLink(vnode)); } else { // Horizontal node; do binary search const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; - int low = 0; - int high = nodeCount-1; - int middle; - node = NULL; // If we don't find a match, we'll fall out of the loop - while (high >= low) { - middle = (high+low)/2; - if (uc == hnode->entries[middle].ch) { - // We hit a match; get the next node and next character - node = getCompactNode(fData, hnode->entries[middle].equal); - utext_next32(text); - uc = utext_current32(text); - ++i; - break; - } - else if (uc < hnode->entries[middle].ch) { - high = middle-1; - } - else { - low = middle+1; - } + const CompactTrieHorizontalEntry *entries; + entries = hnode->entries; + + int index = searchHorizontalEntries(entries, uc, nodeCount); + if(index > -1){ // + // We hit a match; get the next node and next character + node = getCompactNode(fInfo, calcEqualLink(hnode, index, nodeCount)); + utext_next32(text); + uc = utext_current32(text); + ++i; + }else{ + node = NULL; // If we don't find a match, we'll fall out of the loop } } } -exit: + exit: count = mycount; return i; } @@ -545,16 +759,16 @@ private: UVector32 fNodeStack; // Stack of nodes to process UVector32 fIndexStack; // Stack of where in node we are - const CompactTrieHeader *fHeader; // Trie data + const CompactTrieInfo *fInfo; // Trie data public: static UClassID U_EXPORT2 getStaticClassID(void); virtual UClassID getDynamicClassID(void) const; public: - CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status) + CompactTrieEnumeration(const CompactTrieInfo *info, UErrorCode &status) : fNodeStack(status), fIndexStack(status) { - fHeader = header; - fNodeStack.push(header->root, status); + fInfo = info; + fNodeStack.push(info->root, status); fIndexStack.push(0, status); unistr.remove(); } @@ -564,14 +778,14 @@ virtual StringEnumeration *clone() const { UErrorCode status = U_ZERO_ERROR; - return new CompactTrieEnumeration(fHeader, status); + return new CompactTrieEnumeration(fInfo, status); } virtual const UnicodeString * snext(UErrorCode &status); // Very expensive, but this should never be used. 
virtual int32_t count(UErrorCode &status) const { - CompactTrieEnumeration counter(fHeader, status); + CompactTrieEnumeration counter(fInfo, status); int32_t result = 0; while (counter.snext(status) != NULL && U_SUCCESS(status)) { ++result; @@ -582,7 +796,7 @@ virtual void reset(UErrorCode &status) { fNodeStack.removeAllElements(); fIndexStack.removeAllElements(); - fNodeStack.push(fHeader->root, status); + fNodeStack.push(fInfo->root, status); fIndexStack.push(0, status); unistr.remove(); } @@ -595,26 +809,34 @@ if (fNodeStack.empty() || U_FAILURE(status)) { return NULL; } - const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki()); + const CompactTrieNode *node = getCompactNode(fInfo, fNodeStack.peeki()); int where = fIndexStack.peeki(); while (!fNodeStack.empty() && U_SUCCESS(status)) { - int nodeCount = (node->flagscount & kCountMask); + int nodeCount; + + bool isRoot = fNodeStack.peeki() == static_cast(fInfo->root); + if(isRoot){ + nodeCount = node->flagscount & kRootCountMask; + } else { + nodeCount = getCount(node); + } + UBool goingDown = FALSE; if (nodeCount == 0) { // Terminal node; go up immediately fNodeStack.popi(); fIndexStack.popi(); - node = getCompactNode(fHeader, fNodeStack.peeki()); + node = getCompactNode(fInfo, fNodeStack.peeki()); where = fIndexStack.peeki(); } - else if (node->flagscount & kVerticalNode) { + else if ((node->flagscount & kVerticalNode) && !isRoot) { // Vertical node const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node; if (where == 0) { // Going down - unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount); + unistr.append((const UChar *)vnode->chars, nodeCount); fIndexStack.setElementAt(1, fIndexStack.size()-1); - node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status)); + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(vnode), status)); where = fIndexStack.push(0, status); goingDown = TRUE; } @@ -623,7 +845,7 @@ unistr.truncate(unistr.length()-nodeCount); fNodeStack.popi(); fIndexStack.popi(); - node = getCompactNode(fHeader, fNodeStack.peeki()); + node = getCompactNode(fInfo, fNodeStack.peeki()); where = fIndexStack.peeki(); } } @@ -638,7 +860,7 @@ // Push on next node unistr.append((UChar)hnode->entries[where].ch); fIndexStack.setElementAt(where+1, fIndexStack.size()-1); - node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status)); + node = getCompactNode(fInfo, fNodeStack.push(calcEqualLink(hnode, where, nodeCount), status)); where = fIndexStack.push(0, status); goingDown = TRUE; } @@ -646,12 +868,14 @@ // Going up fNodeStack.popi(); fIndexStack.popi(); - node = getCompactNode(fHeader, fNodeStack.peeki()); + node = getCompactNode(fInfo, fNodeStack.peeki()); where = fIndexStack.peeki(); } } + // Check if the parent of the node we've just gone down to ends a // word. If so, return it. + // The root node should never end up here. if (goingDown && (node->flagscount & kParentEndsWord)) { return &unistr; } @@ -664,7 +888,7 @@ if (U_FAILURE(status)) { return NULL; } - return new CompactTrieEnumeration(fData, status); + return new CompactTrieEnumeration(fInfo, status); } // @@ -672,21 +896,36 @@ // and back again // -// Helper classes to construct the compact trie +enum CompactTrieNodeType { + kHorizontalType = 0, + kVerticalType = 1, + kValueType = 2 +}; + +/** + * The following classes (i.e. 
BuildCompactTrie*Node) are helper classes to + * construct the compact trie by storing information for each node and later + * writing the node to memory in a sequential format. + */ class BuildCompactTrieNode: public UMemory { - public: +public: UBool fParentEndsWord; - UBool fVertical; + CompactTrieNodeType fNodeType; UBool fHasDuplicate; + UBool fEqualOverflows; int32_t fNodeID; UnicodeString fChars; + uint16_t fValue; - public: - BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) { +public: + BuildCompactTrieNode(UBool parentEndsWord, CompactTrieNodeType nodeType, + UStack &nodes, UErrorCode &status, uint16_t value = 0) { fParentEndsWord = parentEndsWord; fHasDuplicate = FALSE; - fVertical = vertical; + fNodeType = nodeType; + fEqualOverflows = FALSE; fNodeID = nodes.size(); + fValue = parentEndsWord? value : 0; nodes.push(this, status); } @@ -694,87 +933,225 @@ } virtual uint32_t size() { - return sizeof(uint16_t); + if(fValue > 0) + return sizeof(uint16_t) * 2; + else + return sizeof(uint16_t); } virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) { // Write flag/count - *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask) - | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 ); + + // if this ever fails, a flag bit (i.e. kExceedsCount) will need to be + // used as a 5th MSB. + U_ASSERT(fChars.length() < 4096 || fNodeID == 2); + + *((uint16_t *)(bytes+offset)) = (fEqualOverflows? kEqualOverflows : 0) | + ((fNodeID == 2)? (fChars.length() & kRootCountMask): + ( + (fChars.length() & kCountMask) | + //((fChars.length() << 2) & kExceedsCount) | + (fNodeType == kVerticalType ? kVerticalNode : 0) | + (fParentEndsWord ? kParentEndsWord : 0 ) + ) + ); offset += sizeof(uint16_t); } + + virtual void writeValue(uint8_t *bytes, uint32_t &offset) { + if(fValue > 0){ + *((uint16_t *)(bytes+offset)) = fValue; + offset += sizeof(uint16_t); + } + } + +}; + +/** + * Stores value of parent terminating nodes that have no more subtries. 
+ */ +class BuildCompactTrieValueNode: public BuildCompactTrieNode { +public: + BuildCompactTrieValueNode(UStack &nodes, UErrorCode &status, uint16_t value) + : BuildCompactTrieNode(TRUE, kValueType, nodes, status, value){ + } + + virtual ~BuildCompactTrieValueNode(){ + } + + virtual uint32_t size() { + return sizeof(uint16_t) * 2; + } + + virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { + // don't write value directly to memory but store it in offset to be written later + //offset = fValue & kOffsetContainsValue; + BuildCompactTrieNode::write(bytes, offset, translate); + BuildCompactTrieNode::writeValue(bytes, offset); + } }; class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode { public: UStack fLinks; + UBool fMayOverflow; //intermediate value for fEqualOverflows public: - BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) - : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) { + BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) + : BuildCompactTrieNode(parentEndsWord, kHorizontalType, nodes, status, value), fLinks(status) { + fMayOverflow = FALSE; } virtual ~BuildCompactTrieHorizontalNode() { } + // It is impossible to know beforehand exactly how much space the node will + // need in memory before being written, because the node IDs in the equal + // links may or may not overflow after node coalescing. Therefore, this method + // returns the maximum size possible for the node. virtual uint32_t size() { - return offsetof(CompactTrieHorizontalNode,entries) + - (fChars.length()*sizeof(CompactTrieHorizontalEntry)); + uint32_t estimatedSize = offsetof(CompactTrieHorizontalNode,entries) + + (fChars.length()*sizeof(CompactTrieHorizontalEntry)); + + if(fValue > 0) + estimatedSize += sizeof(uint16_t); + + //estimate extra space needed to store overflow for node ID links + //may be more than what is actually needed + for(int i=0; i < fChars.length(); i++){ + if(((BuildCompactTrieNode *)fLinks[i])->fNodeID > 0xFFFF){ + fMayOverflow = TRUE; + break; + } + } + if(fMayOverflow) // added space for overflow should be same as ceil(fChars.length()/4) * sizeof(uint16_t) + estimatedSize += (sizeof(uint16_t) * fChars.length() + 2)/4; + + return estimatedSize; } virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { - BuildCompactTrieNode::write(bytes, offset, translate); int32_t count = fChars.length(); + + //if largest nodeID > 2^16, set flag + //large node IDs are more likely to be at the back of the array + for (int32_t i = count-1; i >= 0; --i) { + if(translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) > 0xFFFF){ + fEqualOverflows = TRUE; + break; + } + } + + BuildCompactTrieNode::write(bytes, offset, translate); + + // write entries[] to memory for (int32_t i = 0; i < count; ++i) { CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset); entry->ch = fChars[i]; entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID); #ifdef DEBUG_TRIE_DICT - if (entry->equal == 0) { + + if ((entry->equal == 0) && !fEqualOverflows) { fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n", i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); } #endif offset += sizeof(CompactTrieHorizontalEntry); } + + // append extra bits of equal nodes to end if fEqualOverflows + if (fEqualOverflows) { + uint16_t leftmostBits = 0; + for (int16_t i = 
0; i < count; i++) { + leftmostBits = (leftmostBits << 4) | getLeftmostBits(translate, i); + + // write filled uint16_t to memory + if(i % 4 == 3){ + *((uint16_t *)(bytes+offset)) = leftmostBits; + leftmostBits = 0; + offset += sizeof(uint16_t); + } + } + + // pad last uint16_t with zeroes if necessary + int remainder = count % 4; + if (remainder > 0) { + *((uint16_t *)(bytes+offset)) = (leftmostBits << (16 - 4 * remainder)); + offset += sizeof(uint16_t); + } + } + + BuildCompactTrieNode::writeValue(bytes, offset); + } + + // returns leftmost bits of physical node link + uint16_t getLeftmostBits(const UVector32 &translate, uint32_t i){ + uint16_t leftmostBits = (uint16_t) (translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID) >> 16); +#ifdef DEBUG_TRIE_DICT + if (leftmostBits > 0xF) { + fprintf(stderr, "ERROR: horizontal link %d, logical node %d exceeds maximum possible node ID value\n", + i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID); + } +#endif + return leftmostBits; } void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) { fChars.append(ch); fLinks.push(link, status); } + }; class BuildCompactTrieVerticalNode: public BuildCompactTrieNode { - public: +public: BuildCompactTrieNode *fEqual; - public: - BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status) - : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) { +public: + BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status, uint16_t value = 0) + : BuildCompactTrieNode(parentEndsWord, kVerticalType, nodes, status, value) { fEqual = NULL; } virtual ~BuildCompactTrieVerticalNode() { } + // Returns the maximum possible size of this node. See comment in + // BuildCompactTrieHorizontal node for more information. virtual uint32_t size() { - return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); + uint32_t estimatedSize = offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t)); + if(fValue > 0){ + estimatedSize += sizeof(uint16_t); + } + + if(fEqual->fNodeID > 0xFFFF){ + estimatedSize += sizeof(uint16_t); + } + return estimatedSize; } virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) { CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset); + fEqualOverflows = (translate.elementAti(fEqual->fNodeID) > 0xFFFF); BuildCompactTrieNode::write(bytes, offset, translate); node->equal = translate.elementAti(fEqual->fNodeID); offset += sizeof(node->equal); #ifdef DEBUG_TRIE_DICT - if (node->equal == 0) { + if ((node->equal == 0) && !fEqualOverflows) { fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n", fEqual->fNodeID); } #endif fChars.extract(0, fChars.length(), (UChar *)node->chars); - offset += sizeof(uint16_t)*fChars.length(); + offset += sizeof(UChar)*fChars.length(); + + // append 16 bits of to end for equal node if fEqualOverflows + if (fEqualOverflows) { + *((uint16_t *)(bytes+offset)) = (translate.elementAti(fEqual->fNodeID) >> 16); + offset += sizeof(uint16_t); + } + + BuildCompactTrieNode::writeValue(bytes, offset); } void addChar(UChar ch) { @@ -784,60 +1161,85 @@ void setLink(BuildCompactTrieNode *node) { fEqual = node; } + }; // Forward declaration static void walkHorizontal(const TernaryNode *node, BuildCompactTrieHorizontalNode *building, UStack &nodes, - UErrorCode &status); + UErrorCode &status, + Hashtable *values); -// Convert one node. Uses recursion. 
+// Convert one TernaryNode into a BuildCompactTrieNode. Uses recursion. static BuildCompactTrieNode * -compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) { +compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, + UErrorCode &status, Hashtable *values = NULL, uint16_t parentValue = 0) { if (U_FAILURE(status)) { return NULL; } BuildCompactTrieNode *result = NULL; UBool horizontal = (node->low != NULL || node->high != NULL); if (horizontal) { - BuildCompactTrieHorizontalNode *hResult = - new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); + BuildCompactTrieHorizontalNode *hResult; + if(values != NULL){ + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status, parentValue); + } else { + hResult = new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status); + } + if (hResult == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return NULL; } if (U_SUCCESS(status)) { - walkHorizontal(node, hResult, nodes, status); + walkHorizontal(node, hResult, nodes, status, values); result = hResult; } } else { - BuildCompactTrieVerticalNode *vResult = - new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); + BuildCompactTrieVerticalNode *vResult; + if(values != NULL){ + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status, parentValue); + } else { + vResult = new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status); + } + if (vResult == NULL) { status = U_MEMORY_ALLOCATION_ERROR; + return NULL; } else if (U_SUCCESS(status)) { - UBool endsWord = FALSE; + uint16_t value = 0; + UBool endsWord = FALSE; // Take up nodes until we end a word, or hit a node with < or > links do { vResult->addChar(node->ch); - endsWord = (node->flags & kEndsWord) != 0; + value = node->flags; + endsWord = value > 0; node = node->equal; } while(node != NULL && !endsWord && node->low == NULL && node->high == NULL); + if (node == NULL) { if (!endsWord) { status = U_ILLEGAL_ARGUMENT_ERROR; // Corrupt input trie } - else { + else if(values != NULL){ + UnicodeString key(value); //store value as a single-char UnicodeString + BuildCompactTrieValueNode *link = (BuildCompactTrieValueNode *) values->get(key); + if(link == NULL){ + link = new BuildCompactTrieValueNode(nodes, status, value); //take out nodes? + values->put(key, link, status); + } + vResult->setLink(link); + } else { vResult->setLink((BuildCompactTrieNode *)nodes[1]); } } else { - vResult->setLink(compactOneNode(node, endsWord, nodes, status)); + vResult->setLink(compactOneNode(node, endsWord, nodes, status, values, value)); } result = vResult; } @@ -849,19 +1251,28 @@ // Uses recursion. 
static void walkHorizontal(const TernaryNode *node, - BuildCompactTrieHorizontalNode *building, - UStack &nodes, - UErrorCode &status) { + BuildCompactTrieHorizontalNode *building, + UStack &nodes, + UErrorCode &status, Hashtable *values = NULL) { while (U_SUCCESS(status) && node != NULL) { if (node->low != NULL) { - walkHorizontal(node->low, building, nodes, status); + walkHorizontal(node->low, building, nodes, status, values); } BuildCompactTrieNode *link = NULL; if (node->equal != NULL) { - link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status); + link = compactOneNode(node->equal, node->flags > 0, nodes, status, values, node->flags); } - else if (node->flags & kEndsWord) { - link = (BuildCompactTrieNode *)nodes[1]; + else if (node->flags > 0) { + if(values != NULL) { + UnicodeString key(node->flags); //store value as a single-char UnicodeString + link = (BuildCompactTrieValueNode *) values->get(key); + if(link == NULL) { + link = new BuildCompactTrieValueNode(nodes, status, node->flags); //take out nodes? + values->put(key, link, status); + } + } else { + link = (BuildCompactTrieNode *)nodes[1]; + } } if (U_SUCCESS(status) && link != NULL) { building->addNode(node->ch, link, status); @@ -881,13 +1292,15 @@ _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) { BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl; BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr; + // Check for comparing a node to itself, to avoid spurious duplicates if (left == right) { return 0; } + // Most significant is type of node. Can never coalesce. - if (left->fVertical != right->fVertical) { - return left->fVertical - right->fVertical; + if (left->fNodeType != right->fNodeType) { + return left->fNodeType - right->fNodeType; } // Next, the "parent ends word" flag. If that differs, we cannot coalesce. if (left->fParentEndsWord != right->fParentEndsWord) { @@ -898,12 +1311,19 @@ if (result != 0) { return result; } + + // If the node value differs, we should not coalesce. + // If values aren't stored, all fValues should be 0. + if (left->fValue != right->fValue) { + return left->fValue - right->fValue; + } + // We know they're both the same node type, so branch for the two cases. - if (left->fVertical) { + if (left->fNodeType == kVerticalType) { result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID - - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; + - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID; } - else { + else if(left->fChars.length() > 0 && right->fChars.length() > 0){ // We need to compare the links vectors. They should be the // same size because the strings were equal. // We compare the node IDs instead of the pointers, to handle @@ -914,9 +1334,10 @@ int32_t count = hleft->fLinks.size(); for (int32_t i = 0; i < count && result == 0; ++i) { result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID - - ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; + ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID; } } + // If they are equal to each other, mark them (speeds coalescing) if (result == 0) { left->fHasDuplicate = TRUE; @@ -1031,20 +1452,25 @@ // Add node 0, used as the NULL pointer/sentinel. nodes.addElement((int32_t)0, status); + Hashtable *values = NULL; // Index of (unique) values + if (dict.fValued) { + values = new Hashtable(status); + } + // Start by creating the special empty node we use to indicate that the parent // terminates a word. 
This must be node 1, because the builder assumes - // that. + // that. This node will never be used for tries storing numerical values. if (U_FAILURE(status)) { return NULL; } - BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status); + BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, kHorizontalType, nodes, status); if (terminal == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } // This call does all the work of building the new trie structure. The root - // will be node 2. - BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status); + // will have node ID 2 before writing to memory. + BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status, values); #ifdef DEBUG_TRIE_DICT (void) ::times(&timing); fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n", @@ -1077,21 +1503,37 @@ return NULL; } + //map terminal value nodes + int valueCount = 0; + UVector valueNodes(status); + if(values != NULL) { + valueCount = values->count(); //number of unique terminal value nodes + } + + // map non-terminal nodes + int valuePos = 1;//, nodePos = valueCount + valuePos; + nodeCount = valueCount + valuePos; for (i = 1; i < count; ++i) { node = (BuildCompactTrieNode *)nodes[i]; if (node->fNodeID == i) { // Only one node out of each duplicate set is used - if (i >= translate.size()) { + if (node->fNodeID >= translate.size()) { // Logically extend the mapping table - translate.setSize(i+1); + translate.setSize(i + 1); + } + //translate.setElementAt(object, index)! + if(node->fNodeType == kValueType) { + valueNodes.addElement(node, status); + translate.setElementAt(valuePos++, i); + } else { + translate.setElementAt(nodeCount++, i); } - translate.setElementAt(nodeCount++, i); totalSize += node->size(); } } - - // Check for overflowing 16 bits worth of nodes. - if (nodeCount > 0x10000) { + + // Check for overflowing 20 bits worth of nodes. 
+ if (nodeCount > 0x100000) { status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } @@ -1111,9 +1553,14 @@ status = U_MEMORY_ALLOCATION_ERROR; return NULL; } - + CompactTrieHeader *header = (CompactTrieHeader *)bytes; - header->size = totalSize; + //header->size = totalSize; + if(dict.fValued){ + header->magic = COMPACT_TRIE_MAGIC_3; + } else { + header->magic = COMPACT_TRIE_MAGIC_2; + } header->nodeCount = nodeCount; header->offsets[0] = 0; // Sentinel header->root = translate.elementAti(root->fNodeID); @@ -1123,23 +1570,40 @@ } #endif uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t)); - nodeCount = 1; + nodeCount = valueCount + 1; + + // Write terminal value nodes to memory + for (i=0; i < valueNodes.size(); i++) { + //header->offsets[i + 1] = offset; + uint32_t tmpOffset = 0; + node = (BuildCompactTrieNode *) valueNodes.elementAt(i); + //header->offsets[i + 1] = (uint32_t)node->fValue; + node->write((uint8_t *)&header->offsets[i+1], tmpOffset, translate); + } + // Now write the data for (i = 1; i < count; ++i) { node = (BuildCompactTrieNode *)nodes[i]; - if (node->fNodeID == i) { + if (node->fNodeID == i && node->fNodeType != kValueType) { header->offsets[nodeCount++] = offset; node->write(bytes, offset, translate); } } + + //free all extra space + uprv_realloc(bytes, offset); + header->size = offset; + #ifdef DEBUG_TRIE_DICT + fprintf(stdout, "Space freed: %d\n", totalSize-offset); + (void) ::times(&timing); fprintf(stderr, "Trie built, time user %f system %f\n", (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK, (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK); previous = timing; fprintf(stderr, "Final offset is %d\n", offset); - + // Collect statistics on node types and sizes int hCount = 0; int vCount = 0; @@ -1148,68 +1612,85 @@ size_t hItemCount = 0; size_t vItemCount = 0; uint32_t previousOff = offset; - for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { + uint32_t numOverflow = 0; + uint32_t valueSpace = 0; + for (uint32_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) { const CompactTrieNode *node = getCompactNode(header, nodeIdx); - if (node->flagscount & kVerticalNode) { + int itemCount; + if(nodeIdx == header->root) + itemCount = node->flagscount & kRootCountMask; + else + itemCount = getCount(node); + if(node->flagscount & kEqualOverflows){ + numOverflow++; + } + if (node->flagscount & kVerticalNode && nodeIdx != header->root) { vCount += 1; - vItemCount += (node->flagscount & kCountMask); + vItemCount += itemCount; vSize += previousOff-header->offsets[nodeIdx]; } else { hCount += 1; - hItemCount += (node->flagscount & kCountMask); - hSize += previousOff-header->offsets[nodeIdx]; + hItemCount += itemCount; + if(nodeIdx >= header->root) { + hSize += previousOff-header->offsets[nodeIdx]; + } } + + if(header->magic == COMPACT_TRIE_MAGIC_3 && node->flagscount & kParentEndsWord) + valueSpace += sizeof(uint16_t); previousOff = header->offsets[nodeIdx]; } fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount, (double)hSize/hCount, (double)hItemCount/hCount); fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount, (double)vSize/vCount, (double)vItemCount/vCount); + fprintf(stderr, "Number of nodes with overflowing nodeIDs: %d \n", numOverflow); + fprintf(stderr, "Space taken up by values: %d \n", valueSpace); #endif if (U_FAILURE(status)) { uprv_free(bytes); header = NULL; } - else { - header->magic = COMPACT_TRIE_MAGIC_1; - } return header; } // Forward declaration 
static TernaryNode * -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ); - +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ); // Convert a horizontal node (or subarray thereof) into a ternary subtrie static TernaryNode * -unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array, - int low, int high, UErrorCode &status ) { +unpackHorizontalArray( const CompactTrieInfo *info, const CompactTrieHorizontalNode *hnode, + int low, int high, int nodeCount, UErrorCode &status) { if (U_FAILURE(status) || low > high) { return NULL; } int middle = (low+high)/2; - TernaryNode *result = new TernaryNode(array[middle].ch); + TernaryNode *result = new TernaryNode(hnode->entries[middle].ch); if (result == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return NULL; } - const CompactTrieNode *equal = getCompactNode(header, array[middle].equal); + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(hnode, middle, nodeCount)); if (equal->flagscount & kParentEndsWord) { - result->flags |= kEndsWord; + if(info->magic == COMPACT_TRIE_MAGIC_3){ + result->flags = getValue(equal); + }else{ + result->flags |= kEndsWord; + } } - result->low = unpackHorizontalArray(header, array, low, middle-1, status); - result->high = unpackHorizontalArray(header, array, middle+1, high, status); - result->equal = unpackOneNode(header, equal, status); + result->low = unpackHorizontalArray(info, hnode, low, middle-1, nodeCount, status); + result->high = unpackHorizontalArray(info, hnode, middle+1, high, nodeCount, status); + result->equal = unpackOneNode(info, equal, status); return result; } // Convert one compact trie node into a ternary subtrie static TernaryNode * -unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) { - int nodeCount = (node->flagscount & kCountMask); +unpackOneNode( const CompactTrieInfo *info, const CompactTrieNode *node, UErrorCode &status ) { + int nodeCount = getCount(node); if (nodeCount == 0 || U_FAILURE(status)) { // Failure, or terminal node return NULL; @@ -1234,29 +1715,41 @@ previous = latest; } if (latest != NULL) { - const CompactTrieNode *equal = getCompactNode(header, vnode->equal); + const CompactTrieNode *equal = getCompactNode(info, calcEqualLink(vnode)); if (equal->flagscount & kParentEndsWord) { - latest->flags |= kEndsWord; + if(info->magic == COMPACT_TRIE_MAGIC_3){ + latest->flags = getValue(equal); + } else { + latest->flags |= kEndsWord; + } } - latest->equal = unpackOneNode(header, equal, status); + latest->equal = unpackOneNode(info, equal, status); } return head; } else { // Horizontal node const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node; - return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status); + return unpackHorizontalArray(info, hnode, 0, nodeCount-1, nodeCount, status); } } +// returns a MutableTrieDictionary generated from the CompactTrieDictionary MutableTrieDictionary * CompactTrieDictionary::cloneMutable( UErrorCode &status ) const { - MutableTrieDictionary *result = new MutableTrieDictionary( status ); + MutableTrieDictionary *result = new MutableTrieDictionary( status, fInfo->magic == COMPACT_TRIE_MAGIC_3 ); if (result == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return NULL; } - TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status); + // treat root node as special case: don't call unpackOneNode() or 
unpackHorizontalArray() directly + // because only kEqualOverflows flag should be checked in root's flagscount + const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *) + getCompactNode(fInfo, fInfo->root); + uint16_t nodeCount = hnode->flagscount & kRootCountMask; + TernaryNode *root = unpackHorizontalArray(fInfo, hnode, 0, nodeCount-1, + nodeCount, status); + if (U_FAILURE(status)) { delete root; // Clean up delete result; @@ -1270,8 +1763,8 @@ U_CAPI int32_t U_EXPORT2 triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, - UErrorCode *status) { - + UErrorCode *status) { + if (status == NULL || U_FAILURE(*status)) { return 0; } @@ -1286,14 +1779,14 @@ // const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4); if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */ - pInfo->dataFormat[1]==0x72 && - pInfo->dataFormat[2]==0x44 && - pInfo->dataFormat[3]==0x63 && - pInfo->formatVersion[0]==1 )) { + pInfo->dataFormat[1]==0x72 && + pInfo->dataFormat[2]==0x44 && + pInfo->dataFormat[3]==0x63 && + pInfo->formatVersion[0]==1 )) { udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", - pInfo->dataFormat[0], pInfo->dataFormat[1], - pInfo->dataFormat[2], pInfo->dataFormat[3], - pInfo->formatVersion[0]); + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); *status=U_UNSUPPORTED_ERROR; return 0; } @@ -1311,8 +1804,10 @@ // const uint8_t *inBytes =(const uint8_t *)inData+headerSize; const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes; - if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1 - || ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) + uint32_t magic = ds->readUInt32(header->magic); + if (magic != COMPACT_TRIE_MAGIC_1 && magic != COMPACT_TRIE_MAGIC_2 && magic != COMPACT_TRIE_MAGIC_3 + || magic == COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeaderV1) + || magic != COMPACT_TRIE_MAGIC_1 && ds->readUInt32(header->size) < sizeof(CompactTrieHeader)) { udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n"); *status=U_UNSUPPORTED_ERROR; @@ -1333,10 +1828,10 @@ // if (length < sizeWithUData) { udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n", - totalSize); + totalSize); *status=U_INDEX_OUTOFBOUNDS_ERROR; return 0; - } + } // // Swap the Data. Do the data itself first, then the CompactTrieHeader, because @@ -1355,20 +1850,38 @@ } // We need to loop through all the nodes in the offset table, and swap each one. - uint16_t nodeCount = ds->readUInt16(header->nodeCount); + uint32_t nodeCount, rootId; + if(header->magic == COMPACT_TRIE_MAGIC_1) { + nodeCount = ds->readUInt16(((CompactTrieHeaderV1 *)header)->nodeCount); + rootId = ds->readUInt16(((CompactTrieHeaderV1 *)header)->root); + } else { + nodeCount = ds->readUInt32(header->nodeCount); + rootId = ds->readUInt32(header->root); + } + // Skip node 0, which should always be 0. 
- for (int i = 1; i < nodeCount; ++i) { + for (uint32_t i = 1; i < nodeCount; ++i) { uint32_t nodeOff = ds->readUInt32(header->offsets[i]); const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff); CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff); uint16_t flagscount = ds->readUInt16(inNode->flagscount); - uint16_t itemCount = flagscount & kCountMask; + uint16_t itemCount = getCount(inNode); + //uint16_t itemCount = flagscount & kCountMask; ds->writeUInt16(&outNode->flagscount, flagscount); if (itemCount > 0) { - if (flagscount & kVerticalNode) { + uint16_t overflow = 0; //number of extra uint16_ts needed to be swapped + if (flagscount & kVerticalNode && i != rootId) { + if(flagscount & kEqualOverflows){ + // include overflow bits + overflow += 1; + } + if (header->magic == COMPACT_TRIE_MAGIC_3 && flagscount & kEndsParentWord) { + //include values + overflow += 1; + } ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), - itemCount*sizeof(uint16_t), - outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); + (itemCount + overflow)*sizeof(uint16_t), + outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status); uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal); ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal)); } @@ -1381,26 +1894,62 @@ word = ds->readUInt16(inHNode->entries[j].equal); ds->writeUInt16(&outHNode->entries[j].equal, word); } + + // swap overflow/value information + if(flagscount & kEqualOverflows){ + overflow += (itemCount + 3) / 4; + } + + if (header->magic == COMPACT_TRIE_MAGIC_3 && i != rootId && flagscount & kEndsParentWord) { + //include values + overflow += 1; + } + + uint16_t *inOverflow = (uint16_t *) &inHNode->entries[itemCount]; + uint16_t *outOverflow = (uint16_t *) &outHNode->entries[itemCount]; + for(int j = 0; jreadUInt16(*inOverflow); + ds->writeUInt16(outOverflow, extraInfo); + + inOverflow++; + outOverflow++; + } } } } #endif - // All the data in all the nodes consist of 16 bit items. Swap them all at once. 
- uint16_t nodeCount = ds->readUInt16(header->nodeCount); - uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t)); - ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); - // Swap the header ds->writeUInt32(&outputHeader->size, totalSize); - uint32_t magic = ds->readUInt32(header->magic); ds->writeUInt32(&outputHeader->magic, magic); - ds->writeUInt16(&outputHeader->nodeCount, nodeCount); - uint16_t root = ds->readUInt16(header->root); - ds->writeUInt16(&outputHeader->root, root); - ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets), - sizeof(uint32_t)*(int32_t)nodeCount, - outBytes+offsetof(CompactTrieHeader,offsets), status); + + uint32_t nodeCount; + uint32_t offsetPos; + if (header->magic == COMPACT_TRIE_MAGIC_1) { + CompactTrieHeaderV1 *headerV1 = (CompactTrieHeaderV1 *)header; + CompactTrieHeaderV1 *outputHeaderV1 = (CompactTrieHeaderV1 *)outputHeader; + + nodeCount = ds->readUInt16(headerV1->nodeCount); + ds->writeUInt16(&outputHeaderV1->nodeCount, nodeCount); + uint16_t root = ds->readUInt16(headerV1->root); + ds->writeUInt16(&outputHeaderV1->root, root); + offsetPos = offsetof(CompactTrieHeaderV1,offsets); + } else { + nodeCount = ds->readUInt32(header->nodeCount); + ds->writeUInt32(&outputHeader->nodeCount, nodeCount); + uint32_t root = ds->readUInt32(header->root); + ds->writeUInt32(&outputHeader->root, root); + offsetPos = offsetof(CompactTrieHeader,offsets); + } + + // All the data in all the nodes consist of 16 bit items. Swap them all at once. + uint32_t nodesOff = offsetPos+((uint32_t)nodeCount*sizeof(uint32_t)); + ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status); + + //swap offsets + ds->swapArray32(ds, inBytes+offsetPos, + sizeof(uint32_t)*(uint32_t)nodeCount, + outBytes+offsetPos, status); return sizeWithUData; } --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700 +++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800 @@ -47,7 +47,6 @@ U_NAMESPACE_BEGIN class StringEnumeration; -struct CompactTrieHeader; /******************************************************************* * TrieWordDictionary @@ -72,23 +71,29 @@ */ virtual ~TrieWordDictionary(); + /** + *

Returns true if the dictionary contains values associated with each word.

+ */ + virtual UBool getValued() const = 0; + /** *

Find dictionary words that match the text.

* * @param text A UText representing the text. The * iterator is left after the longest prefix match in the dictionary. - * @param start The current position in text. * @param maxLength The maximum number of code units to match. * @param lengths An array that is filled with the lengths of words that matched. * @param count Filled with the number of elements output in lengths. * @param limit The size of the lengths array; this limits the number of words output. + * @param values An array that is filled with the values associated with the matched words. * @return The number of characters in text that were matched. */ virtual int32_t matches( UText *text, int32_t maxLength, int32_t *lengths, int &count, - int limit ) const = 0; + int limit, + uint16_t *values = NULL) const = 0; /** *

Return a StringEnumeration for iterating all the words in the dictionary.

@@ -128,6 +133,12 @@ UText *fIter; + /** + * True if the dictionary stores a value associated with each word + * @internal + */ + UBool fValued; + friend class CompactTrieDictionary; // For fast conversion public: /** @@ -138,14 +149,29 @@ * @param median A UChar around which to balance the trie. Ideally, it should * begin at least one word that is near the median of the set in the dictionary * @param status A status code recording the success of the call. + * @param containsValue True if the dictionary stores values associated with each word. */ - MutableTrieDictionary( UChar median, UErrorCode &status ); + MutableTrieDictionary( UChar median, UErrorCode &status, UBool containsValue = FALSE ); /** *

Virtual destructor.

*/ virtual ~MutableTrieDictionary(); + /** + * Set whether the MutableTrieDictionary stores values associated with each word. + */ + void setValued(UBool valued){ + fValued = valued; + } + /** + *

Returns true if the dictionary contains values associated with each word.

+ */ + virtual UBool getValued() const { + return fValued; + } + /** *

Find dictionary words that match the text.

* @@ -155,13 +181,15 @@ * @param lengths An array that is filled with the lengths of words that matched. * @param count Filled with the number of elements output in lengths. * @param limit The size of the lengths array; this limits the number of words output. + * @param values An array that is filled with the values associated with the matched words. * @return The number of characters in text that were matched. */ virtual int32_t matches( UText *text, int32_t maxLength, int32_t *lengths, int &count, - int limit ) const; + int limit, + uint16_t *values = NULL) const; /** *

Return a StringEnumeration for iterating all the words in the dictionary.

@@ -173,15 +201,17 @@ virtual StringEnumeration *openWords( UErrorCode &status ) const; /** - *

Add one word to the dictionary.

+ *

Add one word to the dictionary with an optional associated value.

* * @param word A UChar buffer containing the word. * @param length The length of the word. - * @param status The resultant status + * @param status The resultant status. + * @param value The nonzero value associated with this word. */ virtual void addWord( const UChar *word, int32_t length, - UErrorCode &status); + UErrorCode &status, + uint16_t value = 0); #if 0 /** @@ -203,8 +233,9 @@ * @param lengths An array that is filled with the lengths of words that matched. * @param count Filled with the number of elements output in lengths. * @param limit The size of the lengths array; this limits the number of words output. - * @param parent The parent of the current node - * @param pMatched The returned parent node matched the input + * @param parent The parent of the current node. + * @param pMatched True if the returned parent node matched the input. + * @param values An array that is filled with the values associated with the matched words. * @return The number of characters in text that were matched. */ virtual int32_t search( UText *text, @@ -213,40 +244,46 @@ int &count, int limit, TernaryNode *&parent, - UBool &pMatched ) const; + UBool &pMatched, + uint16_t *values = NULL) const; private: /** *

Private constructor. The root node is not allocated.

* * @param status A status code recording the success of the call. + * @param containsValues True if the dictionary will store a value associated + * with each word added. */ - MutableTrieDictionary( UErrorCode &status ); + MutableTrieDictionary( UErrorCode &status, UBool containsValues = false ); }; /******************************************************************* * CompactTrieDictionary */ +//forward declarations +struct CompactTrieHeader; +struct CompactTrieInfo; + /** *

CompactTrieDictionary is a TrieWordDictionary that has been compacted * to save space.

*/ class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary { private: - /** - * The root node of the trie - */ + /** + * The CompactTrieInfo for this dictionary, which holds the header and all related information. + */ - const CompactTrieHeader *fData; - - /** - * A UBool indicating whether or not we own the fData. - */ + CompactTrieInfo *fInfo; + /** + * A UBool indicating whether or not we own the dictionary data. + */ UBool fOwnData; - UDataMemory *fUData; + UDataMemory *fUData; public: /** *

Construct a dictionary from a UDataMemory.

@@ -277,6 +314,11 @@ */ virtual ~CompactTrieDictionary(); + /** + *

Returns true if the dictionary contains values associated with each word.

+ */ + virtual UBool getValued() const; + /** *

Find dictionary words that match the text.

* @@ -286,13 +328,15 @@ * @param lengths An array that is filled with the lengths of words that matched. * @param count Filled with the number of elements output in lengths. * @param limit The size of the lengths array; this limits the number of words output. + * @param values An array that is filled with the values associated with the matched words. * @return The number of characters in text that were matched. */ virtual int32_t matches( UText *text, - int32_t rangeEnd, + int32_t maxLength, int32_t *lengths, int &count, - int limit ) const; + int limit, + uint16_t *values = NULL) const; /** *

Return a StringEnumeration for iterating all the words in the dictionary.

@@ -311,7 +355,7 @@ virtual uint32_t dataSize() const; /** - *

Return a void * pointer to the compact data, platform-endian.

+ *

Return a void * pointer to the (unmanaged) compact data, platform-endian.

* * @return The data for the compact dictionary, suitable for passing to the * constructor. @@ -342,5 +386,5 @@ U_NAMESPACE_END - /* TRIEDICT_H */ +/* TRIEDICT_H */ #endif --- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700 +++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800 @@ -509,8 +520,9 @@ #################################################### CTD # CTD FILES -$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) - $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $< +# .ctd file now generated regardless of whether dictionary file exists +$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES) + $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt #################################################### CFU # CFU FILES --- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700 +++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800 @@ -17,5 +17,8 @@ } dictionaries{ Thai:process(dependency){"thaidict.ctd"} + Hani:process(dependency){"cjdict.ctd"} + Hira:process(dependency){"cjdict.ctd"} + Kata:process(dependency){"cjdict.ctd"} } } --- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800 +++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800 @@ -25,6 +25,9 @@ + + + --- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700 +++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800 @@ -2188,21 +2188,21 @@ { - UResourceBundle* ja = ures_open(U_ICUDATA_BRKITR,"ja", &status); + UResourceBundle* th = ures_open(U_ICUDATA_BRKITR,"th", &status); const UChar *got = NULL, *exp=NULL; int32_t gotLen = 0, expLen=0; - ja = ures_getByKey(ja, "boundaries", ja, &status); - exp = tres_getString(ja, -1, "word", &expLen, &status); + th = ures_getByKey(th, "boundaries", th, &status); + exp = tres_getString(th, -1, "grapheme", &expLen, &status); tb = ures_getByKey(aliasB, "boundaries", tb, &status); - got = tres_getString(tb, -1, "word", &gotLen, &status); + got = tres_getString(tb, -1, "grapheme", &gotLen, &status); if(U_FAILURE(status)) { log_err("%s trying to read str boundaries\n", u_errorName(status)); } else if(gotLen != expLen || u_strncmp(exp, got, gotLen) != 0) { log_err("Referencing alias didn't get the right data\n"); } - ures_close(ja); + ures_close(th); status = U_ZERO_ERROR; } /* simple alias */ --- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700 +++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800 @@ -156,9 +156,13 @@ if(*a!=*b){ errln("Failed: boilerplate method operator!= does not return correct results"); } - BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status); - if(a && c){ - if(*c==*a){ + // Japanese word break iteratos is identical to root with + // a dictionary-based break iterator, but Thai character break iterator + // is still different from Root. 
+ BreakIterator* c = BreakIterator::createCharacterInstance(Locale("ja"),status); + BreakIterator* d = BreakIterator::createCharacterInstance(Locale("th"),status); + if(c && d){ + if(*c==*d){ errln("Failed: boilerplate method opertator== does not return correct results"); } }else{ @@ -167,6 +171,7 @@ delete a; delete b; delete c; + delete d; } void RBBIAPITest::TestgetRules() @@ -635,21 +640,21 @@ // void RBBIAPITest::TestRuleStatus() { UChar str[30]; - u_unescape("plain word 123.45 \\u9160\\u9161 \\u30a1\\u30a2 \\u3041\\u3094", - // 012345678901234567 8 9 0 1 2 3 4 5 6 - // Ideographic Katakana Hiragana + //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing + // changed UBRK_WORD_KANA to UBRK_WORD_IDEO + u_unescape("plain word 123.45 \\u30a1\\u30a2 ", + // 012345678901234567 8 9 0 + // Katakana str, 30); UnicodeString testString1(str); - int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 19, 20, 21, 23, 24, 25, 26}; + int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21}; int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, - UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, - UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA, UBRK_WORD_KANA}; + UBRK_WORD_IDEO, UBRK_WORD_NONE}; int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT, - UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT, UBRK_WORD_KANA_LIMIT}; + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT}; UErrorCode status=U_ZERO_ERROR; @@ -888,9 +893,11 @@ URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status); { +#if 0 // With a dictionary based word breaking, ja_word is identical to root. if (ja_word && *ja_word == *root_word) { errln("japan not different from root"); } +#endif } { --- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700 +++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800 @@ -35,6 +35,8 @@ #include #include #include +#include "unicode/numfmt.h" +#include "unicode/uscript.h" #define TEST_ASSERT(x) {if (!(x)) { \ errln("Failure in file %s, line %d", __FILE__, __LINE__);}} @@ -138,11 +140,13 @@ if (exec) TestThaiBreaks(); break; case 23: name = "TestTailoredBreaks"; if (exec) TestTailoredBreaks(); break; + case 24: name = "TestTrieDictWithValue"; + if(exec) TestTrieDictWithValue(); break; #else - case 21: case 22: case 23: name = "skip"; + case 21: case 22: case 23: case 24: name = "skip"; break; #endif - case 24: name = "TestDictRules"; + case 25: name = "TestDictRules"; if (exec) TestDictRules(); break; case 25: name = "TestBug5532"; if (exec) TestBug5532(); break; @@ -607,6 +611,8 @@ void RBBITest::TestJapaneseWordBreak() { +// TODO: Rewrite this test for a dictionary-based word breaking. 
+#if 0 UErrorCode status = U_ZERO_ERROR; BITestData japaneseWordSelection(status); @@ -628,6 +634,7 @@ generalIteratorTest(*e, japaneseWordSelection); delete e; +#endif } void RBBITest::TestTrieDict() { @@ -849,6 +856,372 @@ delete compact2; } +/*TODO: delete later*/ +inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ + UErrorCode status = U_ZERO_ERROR; + FILE *outfile = fopen(filename,"w"); + UConverter *cvt = ucnv_open("UTF-8", &status); + if (U_FAILURE(status)) + return; + if(outfile != NULL){ + status = U_ZERO_ERROR; + const UnicodeString *word = enumer->snext(status); + while (word != NULL && U_SUCCESS(status)) { + char u8word[500]; + status = U_ZERO_ERROR; + ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(), + &status); + fprintf(outfile,"%s\n", u8word); + status = U_ZERO_ERROR; + word = enumer->snext(status); + } + fclose(outfile); + } + ucnv_close(cvt); +} + +// A very simple helper class to streamline the buffer handling in +// TestTrieDictWithValue +template +class AutoBuffer { + public: + AutoBuffer(size_t size) : buffer(stackBuffer) { + if (size > N) + buffer = new T[size]; + } + ~AutoBuffer() { + if (buffer != stackBuffer) + delete [] buffer; + } + T* elems() { + return buffer; + } + const T& operator[] (size_t i) const { + return buffer[i]; + } + T& operator[] (size_t i) { + return buffer[i]; + } + private: + T stackBuffer[N]; + T* buffer; + AutoBuffer(); +}; + +//---------------------------------------------------------------------------- +// +// TestTrieDictWithValue Test trie dictionaries with logprob values and +// more than 2^16 nodes after compaction. +// +//---------------------------------------------------------------------------- +void RBBITest::TestTrieDictWithValue() { + UErrorCode status = U_ZERO_ERROR; + + // + // Open and read the test data file. + // + const char *testDataDirectory = IntlTest::getSourceTestData(status); + const char *filename = "cjdict-truncated.txt"; + char testFileName[1000]; + if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename) + 10 >= sizeof(testFileName)) { + errln("Can't open test data. 
Path too long."); + return; + } + strcpy(testFileName, testDataDirectory); + strcat(testFileName, filename); + + // Items needing deleting at the end + MutableTrieDictionary *mutableDict = NULL; + CompactTrieDictionary *compactDict = NULL; + UnicodeSet *breaks = NULL; + UChar *testFile = NULL; + StringEnumeration *enumer1 = NULL; + StringEnumeration *enumer2 = NULL; + MutableTrieDictionary *mutable2 = NULL; + StringEnumeration *cloneEnum = NULL; + CompactTrieDictionary *compact2 = NULL; + NumberFormat *nf = NULL; + UText *originalText = NULL, *cloneText = NULL; + + const UnicodeString *originalWord = NULL; + const UnicodeString *cloneWord = NULL; + UChar *current; + UChar *word; + UChar uc; + int32_t wordLen; + int32_t wordCount; + int32_t testCount; + int32_t valueLen; + int counter = 0; + + int len; + testFile = ReadAndConvertFile(testFileName, len, NULL, status); + if (U_FAILURE(status)) { + goto cleanup; /* something went wrong, error already output */ + } + + mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE); + if (U_FAILURE(status)) { + errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)); + goto cleanup; + } + + breaks = new UnicodeSet; + breaks->add(0x000A); // Line Feed + breaks->add(0x000D); // Carriage Return + breaks->add(0x2028); // Line Separator + breaks->add(0x2029); // Paragraph Separator + breaks->add(0x0009); // Tab character + + // Now add each non-comment line of the file as a word. + current = testFile; + word = current; + uc = *current++; + wordLen = 0; + wordCount = 0; + nf = NumberFormat::createInstance(status); + + while (uc) { + UnicodeString ucharValue; + valueLen = 0; + + if (uc == 0x0023) { // #comment line, skip + while (uc && !breaks->contains(uc)) { + uc = *current++; + } + } + else{ + while (uc && !breaks->contains(uc)) { + ++wordLen; + uc = *current++; + } + if(uc == 0x0009){ //separator is a tab char, read in num after tab + uc = *current++; + while (uc && !breaks->contains(uc)) { + ucharValue.append(uc); + uc = *current++; + } + } + } + if (wordLen > 0) { + Formattable value((int32_t)0); + nf->parse(ucharValue.getTerminatedBuffer(), value, status); + + if(U_FAILURE(status)){ + errln("parsing of value failed when reading in dictionary\n"); + goto cleanup; + } + mutableDict->addWord(word, wordLen, status, value.getLong()); + if (U_FAILURE(status)) { + errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status)); + goto cleanup; + } + wordCount += 1; + } + + // Find beginning of next line + while (uc && breaks->contains(uc)) { + uc = *current++; + } + word = current-1; + wordLen = 0; + } + + if (wordCount < 50) { + errln("Word count (%d) unreasonably small\n", wordCount); + goto cleanup; + } + + enumer1 = mutableDict->openWords(status); + if (U_FAILURE(status)) { + errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + testCount = 0; + if (wordCount != (testCount = enumer1->count(status))) { + errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", + testCount, wordCount, u_errorName(status)); + goto cleanup; + } + + // Now compact it + compactDict = new CompactTrieDictionary(*mutableDict, status); + if (U_FAILURE(status)) { + errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status)); + goto cleanup; + } + + enumer2 = compactDict->openWords(status); + if (U_FAILURE(status)) { + errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + + 
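// Note on the input format consumed by the parsing loop above: each non-comment line of
// cjdict-truncated.txt is expected to be "<word><TAB><numeric value>", with '#' introducing
// comment lines. A hypothetical fragment (illustrative entries only, not taken from the
// real data file) would look like:
//
//     # word <TAB> frequency-derived value
//     \u4e2d\u56fd<TAB>983
//     \u65e5\u672c\u8a9e<TAB>672
//
// The number is parsed with NumberFormat and attached to the word via
// mutableDict->addWord(word, wordLen, status, value.getLong()).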
//delete later +// writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt"); +// writeEnumerationToFile(enumer2, "/home/jchye/compact.txt"); + + enumer1->reset(status); + enumer2->reset(status); + + originalWord = enumer1->snext(status); + cloneWord = enumer2->snext(status); + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { + if (*originalWord != *cloneWord) { + errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n", + counter, originalWord->length(), cloneWord->length()); + goto cleanup; + } + + // check if attached values of the same word in both dictionaries tally +#if 0 + int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()]; + uint16_t values1[originalWord->length()], values2[cloneWord->length()]; +#endif + AutoBuffer lengths1(originalWord->length()); + AutoBuffer lengths2(cloneWord->length()); + AutoBuffer values1(originalWord->length()); + AutoBuffer values2(cloneWord->length()); + + originalText = utext_openConstUnicodeString(originalText, originalWord, &status); + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); + + int count1, count2; + mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); + compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); + + if(values1[count1-1] != values2[count2-1]){ + errln("Values of word %d in MutableTrieDictionary and CompactTrieDictionary do not match, with values %d and %d\n", + counter, values1[count1-1], values2[count2-1]); + goto cleanup; + } + + counter++; + originalWord = enumer1->snext(status); + cloneWord = enumer2->snext(status); + } + if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) { + errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are the same"); + } + + delete enumer1; + enumer1 = NULL; + delete enumer2; + enumer2 = NULL; + + // Now un-compact it + mutable2 = compactDict->cloneMutable(status); + if (U_FAILURE(status)) { + errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status)); + goto cleanup; + } + + cloneEnum = mutable2->openWords(status); + if (U_FAILURE(status)) { + errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + if (wordCount != (testCount = cloneEnum->count(status))) { + errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n", + testCount, wordCount, u_errorName(status)); + goto cleanup; + } + + // Compact original dictionary to clone. 
Note that we can only compare the same kind of + // dictionary as the order of the enumerators is not guaranteed to be the same between + // different kinds + enumer1 = mutableDict->openWords(status); + if (U_FAILURE(status)) { + errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + counter = 0; + originalWord = enumer1->snext(status); + cloneWord = cloneEnum->snext(status); + while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { + if (*originalWord != *cloneWord) { + errln("Original and cloned MutableTrieDictionary word mismatch\n"); + goto cleanup; + } + + // check if attached values of the same word in both dictionaries tally + AutoBuffer lengths1(originalWord->length()); + AutoBuffer lengths2(cloneWord->length()); + AutoBuffer values1(originalWord->length()); + AutoBuffer values2(cloneWord->length()); + originalText = utext_openConstUnicodeString(originalText, originalWord, &status); + cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); + + int count1, count2; + mutableDict->matches(originalText, originalWord->length(), lengths1.elems(), count1, originalWord->length(), values1.elems()); + mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), count2, cloneWord->length(), values2.elems()); + + if(values1[count1-1] != values2[count2-1]){ + errln("Values of word %d in original and cloned MutableTrieDictionary do not match, with values %d and %d\n", + counter, values1[count1-1], values2[count2-1]); + goto cleanup; + } + + counter++; + + originalWord = enumer1->snext(status); + cloneWord = cloneEnum->snext(status); + } + + if (U_FAILURE(status)) { + errln("Enumeration failed: %s\n", u_errorName(status)); + goto cleanup; + } + + if (originalWord != cloneWord) { + errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n"); + goto cleanup; + } + + // Test the data copying constructor for CompactTrieDict, and the data access APIs. + compact2 = new CompactTrieDictionary(compactDict->data(), status); + if (U_FAILURE(status)) { + errln("CompactTrieDictionary(const void *,...) failed\n"); + goto cleanup; + } + + if (compact2->dataSize() == 0) { + errln("CompactTrieDictionary->dataSize() == 0\n"); + goto cleanup; + } + + // Now count the words via the second dictionary + delete enumer1; + enumer1 = compact2->openWords(status); + if (U_FAILURE(status)) { + errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status)); + goto cleanup; + } + + if (wordCount != (testCount = enumer1->count(status))) { + errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n", + testCount, wordCount, u_errorName(status)); + goto cleanup; + } + + cleanup: + delete compactDict; + delete mutableDict; + delete breaks; + delete[] testFile; + delete enumer1; + delete mutable2; + delete cloneEnum; + delete compact2; + utext_close(originalText); + utext_close(cloneText); + + +} //---------------------------------------------------------------------------- // @@ -1870,8 +2243,15 @@ // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; +#if 0 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17, 18, 20, 21, 24, 27, 28 }; static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; +#endif +// There's no separate Japanese word break iterator. Root is the same as Japanese. +// Our dictionary-based iterator has to be tweaked to better handle U+3005, +// U+3007, U+300B and some other cases. +static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; +static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; // UBreakIteratorType UBRK_SENTENCE, Locale "el" // Add break after Greek question mark (cldrbug #2069). @@ -2672,6 +3052,8 @@ UnicodeSet *fNewlineSet; UnicodeSet *fKatakanaSet; UnicodeSet *fALetterSet; + // TODO(jungshik): Do we still need this change? + // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt UnicodeSet *fMidNumLetSet; UnicodeSet *fMidLetterSet; UnicodeSet *fMidNumSet; @@ -2680,6 +3062,7 @@ UnicodeSet *fOtherSet; UnicodeSet *fExtendSet; UnicodeSet *fExtendNumLetSet; + UnicodeSet *fDictionaryCjkSet; RegexMatcher *fMatcher; @@ -2696,12 +3079,24 @@ fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); - fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); + fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status); + // Exclude Hangul syllables from ALetterSet during testing. + // Leave CJK dictionary characters out from the monkey tests! +#if 0 + fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" + "[\\p{Line_Break = Complex_Context}" + "-\\p{Grapheme_Cluster_Break = Extend}" + "-\\p{Grapheme_Cluster_Break = Control}" + "]]", + status); +#endif + fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); + fALetterSet->removeAll(*fDictionaryCjkSet); fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); - fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); + fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}[\\uff10-\\uff19]]"), status); fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); @@ -2725,13 +3120,14 @@ fOtherSet->removeAll(*fFormatSet); fOtherSet->removeAll(*fExtendSet); // Inhibit dictionary characters from being tested at all. 
+ fOtherSet->removeAll(*fDictionaryCjkSet); fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); fSets->addElement(fCRSet, status); fSets->addElement(fLFSet, status); fSets->addElement(fNewlineSet, status); fSets->addElement(fALetterSet, status); - fSets->addElement(fKatakanaSet, status); + //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana fSets->addElement(fMidLetterSet, status); fSets->addElement(fMidNumLetSet, status); fSets->addElement(fMidNumSet, status); @@ -3978,6 +4374,7 @@ for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { count --; if (forward[count] != i) { + printStringBreaks(ustr, expected, expectedcount); test->errln("happy break test previous() failed: expected %d but got %d", forward[count], i); break; @@ -4011,23 +4408,25 @@ UErrorCode status = U_ZERO_ERROR; // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); BreakIterator *bi = BreakIterator::createWordInstance(locale, status); + // Replaced any C+J characters in a row with a random sequence of characters + // of the same length to make our C+J segmentation not get in the way. static const char *strlist[] = { "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", - "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", + "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", - "\\u90ca\\u3588\\u009c\\u0953\\u194b", + "\\uac00\\u3588\\u009c\\u0953\\u194b", "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", - "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", + "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", "\\u003b\\u024a\\u102e\\U000e0071\\u0600", "\\u2027\\U000e0067\\u0a47\\u00b7", "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", "\\u0589\\U000e006e\\u0a42\\U000104a5", - "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", + "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", "\\u0027\\u11af\\U000e0057\\u0602", "\\U0001d7f2\\U000e007\\u0004\\u0589", @@ -4039,7 +4438,7 @@ "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", "\\u0233\\U000e0020\\u0a69\\u0d6a", "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", - "\\u58f4\\U000e0049\\u20e7\\u2027", + "\\u18f4\\U000e0049\\u20e7\\u2027", "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", "\\ua183\\u102d\\u0bec\\u003a", "\\u17e8\\u06e7\\u002e\\u096d\\u003b", @@ -4049,7 +4448,7 @@ "\\U000e005d\\u2044\\u0731\\u0650\\u0061", "\\u003a\\u0664\\u00b7\\u1fba", "\\u003b\\u0027\\u00b7\\u47a3", - "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", + "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", }; @@ -4104,12 +4503,12 @@ "\\U0001d7f2\\U000e007d\\u0004\\u0589", "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", - "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", + 
"\\U000e0065\\u302c\\u09ee\\U000e0068", "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", "\\u0233\\U000e0020\\u0a69\\u0d6a", "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", "\\u58f4\\U000e0049\\u20e7\\u2027", - "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", + "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", "\\ua183\\u102d\\u0bec\\u003a", "\\u17e8\\u06e7\\u002e\\u096d\\u003b", "\\u003a\\u0e57\\u0fad\\u002e", --- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700 +++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800 @@ -70,6 +70,7 @@ void TestBug5775(); void TestThaiBreaks(); void TestTailoredBreaks(); + void TestTrieDictWithValue(); void TestDictRules(); void TestBug5532(); --- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700 +++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800 @@ -161,7 +161,23 @@ •abc<200>\U0001D800•def<200>\U0001D3FF• • # Hiragana & Katakana stay together, but separates from each other and Latin. -•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#• +# *** what to do about theoretical combos of chars? i.e. hiragana + accent +#•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#• + +# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth +•芽キャベツ<400>芽キャベツ<400> + +# more Japanese tests +# TODO: Currently, U+30FC and other characters (script=common) in the Hiragana +# and the Katakana block are not treated correctly. Enable this later. +#•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。• +•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。• + +# Testing of word boundary for dictionary word containing both kanji and kana +•中だるみ<400>蔵王の森<400>ウ離島<400> + +# Testing of Chinese segmentation (taken from a Chinese news article) +•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。• # Words with interior formatting characters •def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> • @@ -169,6 +185,8 @@ # to test for bug #4097779 •aa\N{COMBINING GRAVE ACCENT}a<200> • +# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts +•ISN'T<200> •19<100>日<400> # to test for bug #4098467 # What follows is a string of Korean characters (I found it in the Yellow Pages @@ -178,9 +196,15 @@ # precomposed syllables... 
•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> • -•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> • +# more Korean tests (Jamo not tested here, not counted as dictionary characters) +# Disable them now because we don't include a Korean dictionary. +#•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200> +#•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e• + +•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> • + +•\u06c9<200>\uc799<200>\ufffa• -•\u06c9\uc799\ufffa<200> # # Try some words from other scripts. @@ -491,8 +515,7 @@ •\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c• # conjoining jamo... -# TODO: rules update needed -#•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c• +•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c• # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd •\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f• --- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0800 +++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0800 @@ -28,7 +28,7 @@ LocaleScript:alias { "/ICUDATA/ja/LocaleScript" } // aliasing using position - boundaries:alias { "/ICUDATA-brkitr/ja" } // Referencing corresponding resource in another bundle + boundaries:alias { "/ICUDATA-brkitr/th" } // Referencing corresponding resource in another bundle // aliasing arrays zoneTests { --- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700 +++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800 @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2002-2009, International Business Machines +* Copyright (C) 2002-2010, International Business Machines * Corporation and others. All Rights Reserved. 
********************************************************************** * @@ -34,12 +34,15 @@ #include "unicode/udata.h" #include "unicode/putil.h" +//#include "unicode/ustdio.h" + #include "uoptions.h" #include "unewdata.h" #include "ucmndata.h" #include "rbbidata.h" #include "triedict.h" #include "cmemory.h" +#include "uassert.h" #include #include @@ -199,147 +202,191 @@ long wordFileSize; FILE *file; char *wordBufferC; - + MutableTrieDictionary *mtd = NULL; + file = fopen(wordFileName, "rb"); - if( file == 0 ) { - fprintf(stderr, "Could not open file \"%s\"\n", wordFileName); - exit(-1); - } - fseek(file, 0, SEEK_END); - wordFileSize = ftell(file); - fseek(file, 0, SEEK_SET); - wordBufferC = new char[wordFileSize+10]; - - result = (long)fread(wordBufferC, 1, wordFileSize, file); - if (result != wordFileSize) { - fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); - exit (-1); - } - wordBufferC[wordFileSize]=0; - fclose(file); - - // - // Look for a Unicode Signature (BOM) on the word file - // - int32_t signatureLength; - const char * wordSourceC = wordBufferC; - const char* encoding = ucnv_detectUnicodeSignature( - wordSourceC, wordFileSize, &signatureLength, &status); - if (U_FAILURE(status)) { - exit(status); - } - if(encoding!=NULL ){ - wordSourceC += signatureLength; - wordFileSize -= signatureLength; - } - - // - // Open a converter to take the rule file to UTF-16 - // - UConverter* conv; - conv = ucnv_open(encoding, &status); - if (U_FAILURE(status)) { - fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); - exit(status); - } - - // - // Convert the words to UChar. - // Preflight first to determine required buffer size. - // - uint32_t destCap = ucnv_toUChars(conv, - NULL, // dest, - 0, // destCapacity, - wordSourceC, - wordFileSize, - &status); - if (status != U_BUFFER_OVERFLOW_ERROR) { - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); - exit(status); - }; - - status = U_ZERO_ERROR; - UChar *wordSourceU = new UChar[destCap+1]; - ucnv_toUChars(conv, - wordSourceU, // dest, - destCap+1, - wordSourceC, - wordFileSize, - &status); - if (U_FAILURE(status)) { - fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); - exit(status); - }; - ucnv_close(conv); - - // Get rid of the original file buffer - delete[] wordBufferC; - - // Create a MutableTrieDictionary, and loop through all the lines, inserting - // words. - - // First, pick a median character. 
- UChar *current = wordSourceU + (destCap/2); - UChar uc = *current++; - UnicodeSet breaks; - breaks.add(0x000A); // Line Feed - breaks.add(0x000D); // Carriage Return - breaks.add(0x2028); // Line Separator - breaks.add(0x2029); // Paragraph Separator - - do { - // Look for line break - while (uc && !breaks.contains(uc)) { - uc = *current++; - } - // Now skip to first non-line-break - while (uc && breaks.contains(uc)) { - uc = *current++; + if( file == 0 ) { //cannot find file + //create 1-line dummy file: ie 1 char, 1 value + UNewDataMemory *pData; + char msg[1024]; + + /* write message with just the name */ + sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); + fprintf(stderr, "%s\n", msg); + + UChar c = 0x0020; + mtd = new MutableTrieDictionary(c, status, TRUE); + mtd->addWord(&c, 1, status, 1); + + } else { //read words in from input file + fseek(file, 0, SEEK_END); + wordFileSize = ftell(file); + fseek(file, 0, SEEK_SET); + wordBufferC = new char[wordFileSize+10]; + + result = (long)fread(wordBufferC, 1, wordFileSize, file); + if (result != wordFileSize) { + fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); + exit (-1); } - } - while (uc && (breaks.contains(uc) || u_isspace(uc))); - - MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status); + wordBufferC[wordFileSize]=0; + fclose(file); - if (U_FAILURE(status)) { - fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); - exit(status); - } + // + // Look for a Unicode Signature (BOM) on the word file + // + int32_t signatureLength; + const char * wordSourceC = wordBufferC; + const char* encoding = ucnv_detectUnicodeSignature( + wordSourceC, wordFileSize, &signatureLength, &status); + if (U_FAILURE(status)) { + exit(status); + } + if(encoding!=NULL ){ + wordSourceC += signatureLength; + wordFileSize -= signatureLength; + } - // Now add the words. Words are non-space characters at the beginning of - // lines, and must be at least one UChar. - current = wordSourceU; - UChar *candidate = current; - uc = *current++; - int32_t length = 0; - - while (uc) { - while (uc && !u_isspace(uc)) { - ++length; - uc = *current++; + // + // Open a converter to take the rule file to UTF-16 + // + UConverter* conv; + conv = ucnv_open(encoding, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); + exit(status); } - if (length > 0) { - mtd->addWord(candidate, length, status); - if (U_FAILURE(status)) { - fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n", - u_errorName(status)); - exit(status); + + // + // Convert the words to UChar. + // Preflight first to determine required buffer size. + // + uint32_t destCap = ucnv_toUChars(conv, + NULL, // dest, + 0, // destCapacity, + wordSourceC, + wordFileSize, + &status); + if (status != U_BUFFER_OVERFLOW_ERROR) { + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); + exit(status); + }; + + status = U_ZERO_ERROR; + UChar *wordSourceU = new UChar[destCap+1]; + ucnv_toUChars(conv, + wordSourceU, // dest, + destCap+1, + wordSourceC, + wordFileSize, + &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); + exit(status); + }; + ucnv_close(conv); + + // Get rid of the original file buffer + delete[] wordBufferC; + + // Create a MutableTrieDictionary, and loop through all the lines, inserting + // words. + + // First, pick a median character. 
+ UChar *current = wordSourceU + (destCap/2); + UChar uc = *current++; + UnicodeSet breaks; + breaks.add(0x000A); // Line Feed + breaks.add(0x000D); // Carriage Return + breaks.add(0x2028); // Line Separator + breaks.add(0x2029); // Paragraph Separator + + do { + // Look for line break + while (uc && !breaks.contains(uc)) { + uc = *current++; + } + // Now skip to first non-line-break + while (uc && breaks.contains(uc)) { + uc = *current++; } } - // Find beginning of next line - while (uc && !breaks.contains(uc)) { - uc = *current++; + while (uc && (breaks.contains(uc) || u_isspace(uc))); + + mtd = new MutableTrieDictionary(uc, status); + + if (U_FAILURE(status)) { + fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); + exit(status); } - while (uc && breaks.contains(uc)) { - uc = *current++; + + // Now add the words. Words are non-space characters at the beginning of + // lines, and must be at least one UChar. If a word has an associated value, + // the value should follow the word on the same line after a tab character. + current = wordSourceU; + UChar *candidate = current; + uc = *current++; + int32_t length = 0; + int count = 0; + + while (uc) { + while (uc && !u_isspace(uc)) { + ++length; + uc = *current++; + } + + UnicodeString valueString; + UChar candidateValue; + if(uc == 0x0009){ //separator is a tab char, read in number after space + while (uc && u_isspace(uc)) { + uc = *current++; + } + while (uc && !u_isspace(uc)) { + valueString.append(uc); + uc = *current++; + } + } + + if (length > 0) { + count++; + if(valueString.length() > 0){ + mtd->setValued(TRUE); + + uint32_t value = 0; + char* s = new char[valueString.length()]; + valueString.extract(0,valueString.length(), s, valueString.length()); + int n = sscanf(s, "%ud", &value); + U_ASSERT(n == 1); + U_ASSERT(value >= 0); + mtd->addWord(candidate, length, status, (uint16_t)value); + delete[] s; + } else { + mtd->addWord(candidate, length, status); + } + + if (U_FAILURE(status)) { + fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", + u_errorName(status), count); + exit(status); + } + } + + // Find beginning of next line + while (uc && !breaks.contains(uc)) { + uc = *current++; + } + // Find next non-line-breaking character + while (uc && breaks.contains(uc)) { + uc = *current++; + } + candidate = current-1; + length = 0; } - candidate = current-1; - length = 0; + + // Get rid of the Unicode text buffer + delete[] wordSourceU; } - // Get rid of the Unicode text buffer - delete[] wordSourceU; - // Now, create a CompactTrieDictionary from the mutable dictionary CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); if (U_FAILURE(status)) { @@ -393,4 +440,3 @@ #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ } - --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800 +++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800 @@ -23,13 +23,13 @@ ## Extra files to remove for 'make clean' CLEANFILES = *~ $(DEPS) $(MAN_FILES) -## Target information +## Target informationcd TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) ifneq ($(top_builddir),$(top_srcdir)) CPPFLAGS += -I$(top_builddir)/common endif -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/i18n LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) OBJECTS = genctd.o
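For orientation, here is a minimal usage sketch of the valued-dictionary API this patch introduces: addWord() with a uint16_t value, matches() with an optional values out-array, and the MutableTrieDictionary/CompactTrieDictionary constructors shown in the triedict.h hunks above. It is not part of the patch; triedict.h is an internal source/common header, the function name and sample word are illustrative, and error checking is reduced to a single status code for brevity.

#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/utext.h"
#include "triedict.h"   // internal ICU header from source/common

U_NAMESPACE_USE

// Build a small valued dictionary, compact it, and query it.
static void valuedDictSketch(UErrorCode &status) {
    // Median character and word/value pair are arbitrary sample data.
    UChar median = 0x4E2D;
    MutableTrieDictionary mtd(median, status, TRUE);   // TRUE: store values
    static const UChar word[] = { 0x4E2D, 0x56FD };    // sample CJK word
    mtd.addWord(word, 2, status, 100);                 // value must be nonzero

    // Compacting a valued dictionary produces the COMPACT_TRIE_MAGIC_3 layout.
    CompactTrieDictionary ctd(mtd, status);

    // matches() can now also report the value attached to each matched word.
    UnicodeString text(word, 2);
    UText *ut = utext_openConstUnicodeString(NULL, &text, &status);
    int32_t lengths[4];
    uint16_t values[4];
    int count = 0;
    ctd.matches(ut, text.length(), lengths, count, 4, values);
    // lengths[i] and values[i] now describe the i-th matched prefix.
    utext_close(ut);
}

genctd builds the same structure from a word list, attaching the tab-separated number on each input line (if present) as the word's value.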