• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
*******************************************************************************
* Copyright (C) 2013, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* dictionarydata.cpp
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/
11 
12 #include "dictionarydata.h"
13 #include "unicode/ucharstrie.h"
14 #include "unicode/bytestrie.h"
15 #include "unicode/udata.h"
16 #include "cmemory.h"
17 
18 #if !UCONFIG_NO_BREAK_ITERATION
19 
20 U_NAMESPACE_BEGIN
21 
22 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
23 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
24 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
25 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
26 
27 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
28 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
29 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
30 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
31 
~DictionaryMatcher()32 DictionaryMatcher::~DictionaryMatcher() {
33 }
34 
~UCharsDictionaryMatcher()35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
36     udata_close(file);
37 }
38 
getType() const39 int32_t UCharsDictionaryMatcher::getType() const {
40     return DictionaryData::TRIE_TYPE_UCHARS;
41 }
42 
matches(UText * text,int32_t maxLength,int32_t * lengths,int32_t & count,int32_t limit,int32_t * values) const43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
44     UCharsTrie uct(characters);
45     UChar32 c = utext_next32(text);
46     if (c < 0) {
47         return 0;
48     }
49     UStringTrieResult result = uct.first(c);
50     int32_t numChars = 1;
51     count = 0;
52     for (;;) {
53         if (USTRINGTRIE_HAS_VALUE(result)) {
54             if (count < limit) {
55                 if (values != NULL) {
56                     values[count] = uct.getValue();
57                 }
58                 lengths[count++] = numChars;
59             }
60             if (result == USTRINGTRIE_FINAL_VALUE) {
61                 break;
62             }
63         }
64         else if (result == USTRINGTRIE_NO_MATCH) {
65             break;
66         }
67 
68         // TODO: why do we have a text limit if the UText knows its length?
69         if (numChars >= maxLength) {
70             break;
71         }
72 
73         c = utext_next32(text);
74         if (c < 0) {
75             break;
76         }
77         ++numChars;
78         result = uct.next(c);
79     }
80     return numChars;
81 }
82 
~BytesDictionaryMatcher()83 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
84     udata_close(file);
85 }
86 
transform(UChar32 c) const87 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
88     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
89         if (c == 0x200D) {
90             return 0xFF;
91         } else if (c == 0x200C) {
92             return 0xFE;
93         }
94         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
95         if (delta < 0 || 0xFD < delta) {
96             return U_SENTINEL;
97         }
98         return (UChar32)delta;
99     }
100     return c;
101 }
102 
getType() const103 int32_t BytesDictionaryMatcher::getType() const {
104     return DictionaryData::TRIE_TYPE_BYTES;
105 }
106 
matches(UText * text,int32_t maxLength,int32_t * lengths,int32_t & count,int32_t limit,int32_t * values) const107 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
108     BytesTrie bt(characters);
109     UChar32 c = utext_next32(text);
110     if (c < 0) {
111         return 0;
112     }
113     UStringTrieResult result = bt.first(transform(c));
114     int32_t numChars = 1;
115     count = 0;
116     for (;;) {
117         if (USTRINGTRIE_HAS_VALUE(result)) {
118             if (count < limit) {
119                 if (values != NULL) {
120                     values[count] = bt.getValue();
121             }
122                 lengths[count++] = numChars;
123             }
124             if (result == USTRINGTRIE_FINAL_VALUE) {
125                 break;
126             }
127         }
128         else if (result == USTRINGTRIE_NO_MATCH) {
129             break;
130         }
131 
132         // TODO: why do we have a text limit if the UText knows its length?
133         if (numChars >= maxLength) {
134             break;
135         }
136 
137         c = utext_next32(text);
138         if (c < 0) {
139             break;
140         }
141         ++numChars;
142         result = bt.next(transform(c));
143     }
144     return numChars;
145 }
146 
147 
148 U_NAMESPACE_END
149 
150 U_NAMESPACE_USE
151 
152 U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)153 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
154            void *outData, UErrorCode *pErrorCode) {
155     const UDataInfo *pInfo;
156     int32_t headerSize;
157     const uint8_t *inBytes;
158     uint8_t *outBytes;
159     const int32_t *inIndexes;
160     int32_t indexes[DictionaryData::IX_COUNT];
161     int32_t i, offset, size;
162 
163     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
164     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
165     pInfo = (const UDataInfo *)((const char *)inData + 4);
166     if (!(pInfo->dataFormat[0] == 0x44 &&
167           pInfo->dataFormat[1] == 0x69 &&
168           pInfo->dataFormat[2] == 0x63 &&
169           pInfo->dataFormat[3] == 0x74 &&
170           pInfo->formatVersion[0] == 1)) {
171         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
172                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
173         *pErrorCode = U_UNSUPPORTED_ERROR;
174         return 0;
175     }
176 
177     inBytes = (const uint8_t *)inData + headerSize;
178     outBytes = (uint8_t *)outData + headerSize;
179 
180     inIndexes = (const int32_t *)inBytes;
181     if (length >= 0) {
182         length -= headerSize;
183         if (length < (int32_t)(sizeof(indexes))) {
184             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
185             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
186             return 0;
187         }
188     }
189 
190     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
191         indexes[i] = udata_readInt32(ds, inIndexes[i]);
192     }
193 
194     size = indexes[DictionaryData::IX_TOTAL_SIZE];
195 
196     if (length >= 0) {
197         if (length < size) {
198             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
199             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
200             return 0;
201         }
202 
203         if (inBytes != outBytes) {
204             uprv_memcpy(outBytes, inBytes, size);
205         }
206 
207         offset = 0;
208         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
209         offset = (int32_t)sizeof(indexes);
210         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
211         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
212 
213         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
214             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
215         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
216             // nothing to do
217         } else {
218             udata_printError(ds, "udict_swap(): unknown trie type!\n");
219             *pErrorCode = U_UNSUPPORTED_ERROR;
220             return 0;
221         }
222 
223         // these next two sections are empty in the current format,
224         // but may be used later.
225         offset = nextOffset;
226         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
227         offset = nextOffset;
228         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
229         offset = nextOffset;
230     }
231     return headerSize + size;
232 }
233 #endif
234