1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2014-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * dictionarydata.cpp
9 *
10 * created on: 2012may31
11 * created by: Markus W. Scherer & Maxime Serrano
12 */
13
14 #include "dictionarydata.h"
15 #include "unicode/ucharstrie.h"
16 #include "unicode/bytestrie.h"
17 #include "unicode/udata.h"
18 #include "cmemory.h"
19
20 #if !UCONFIG_NO_BREAK_ITERATION
21
22 U_NAMESPACE_BEGIN
23
24 const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
25 const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
26 const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
27 const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
28
29 const int32_t DictionaryData::TRANSFORM_NONE = 0;
30 const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
31 const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
32 const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
33
~DictionaryMatcher()34 DictionaryMatcher::~DictionaryMatcher() {
35 }
36
~UCharsDictionaryMatcher()37 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
38 udata_close(file);
39 }
40
getType() const41 int32_t UCharsDictionaryMatcher::getType() const {
42 return DictionaryData::TRIE_TYPE_UCHARS;
43 }
44
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const45 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
46 int32_t *lengths, int32_t *cpLengths, int32_t *values,
47 int32_t *prefix) const {
48
49 UCharsTrie uct(characters);
50 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
51 int32_t wordCount = 0;
52 int32_t codePointsMatched = 0;
53
54 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
55 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
56 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
57 codePointsMatched += 1;
58 if (USTRINGTRIE_HAS_VALUE(result)) {
59 if (wordCount < limit) {
60 if (values != nullptr) {
61 values[wordCount] = uct.getValue();
62 }
63 if (lengths != nullptr) {
64 lengths[wordCount] = lengthMatched;
65 }
66 if (cpLengths != nullptr) {
67 cpLengths[wordCount] = codePointsMatched;
68 }
69 ++wordCount;
70 }
71 if (result == USTRINGTRIE_FINAL_VALUE) {
72 break;
73 }
74 }
75 else if (result == USTRINGTRIE_NO_MATCH) {
76 break;
77 }
78 if (lengthMatched >= maxLength) {
79 break;
80 }
81 }
82
83 if (prefix != nullptr) {
84 *prefix = codePointsMatched;
85 }
86 return wordCount;
87 }
88
~BytesDictionaryMatcher()89 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
90 udata_close(file);
91 }
92
transform(UChar32 c) const93 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
94 if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
95 if (c == 0x200D) {
96 return 0xFF;
97 } else if (c == 0x200C) {
98 return 0xFE;
99 }
100 int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
101 if (delta < 0 || 0xFD < delta) {
102 return U_SENTINEL;
103 }
104 return (UChar32)delta;
105 }
106 return c;
107 }
108
getType() const109 int32_t BytesDictionaryMatcher::getType() const {
110 return DictionaryData::TRIE_TYPE_BYTES;
111 }
112
matches(UText * text,int32_t maxLength,int32_t limit,int32_t * lengths,int32_t * cpLengths,int32_t * values,int32_t * prefix) const113 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
114 int32_t *lengths, int32_t *cpLengths, int32_t *values,
115 int32_t *prefix) const {
116 BytesTrie bt(characters);
117 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
118 int32_t wordCount = 0;
119 int32_t codePointsMatched = 0;
120
121 for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
122 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
123 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
124 codePointsMatched += 1;
125 if (USTRINGTRIE_HAS_VALUE(result)) {
126 if (wordCount < limit) {
127 if (values != nullptr) {
128 values[wordCount] = bt.getValue();
129 }
130 if (lengths != nullptr) {
131 lengths[wordCount] = lengthMatched;
132 }
133 if (cpLengths != nullptr) {
134 cpLengths[wordCount] = codePointsMatched;
135 }
136 ++wordCount;
137 }
138 if (result == USTRINGTRIE_FINAL_VALUE) {
139 break;
140 }
141 }
142 else if (result == USTRINGTRIE_NO_MATCH) {
143 break;
144 }
145 if (lengthMatched >= maxLength) {
146 break;
147 }
148 }
149
150 if (prefix != nullptr) {
151 *prefix = codePointsMatched;
152 }
153 return wordCount;
154 }
155
156
157 U_NAMESPACE_END
158
159 U_NAMESPACE_USE
160
161 U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)162 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
163 void *outData, UErrorCode *pErrorCode) {
164 const UDataInfo *pInfo;
165 int32_t headerSize;
166 const uint8_t *inBytes;
167 uint8_t *outBytes;
168 const int32_t *inIndexes;
169 int32_t indexes[DictionaryData::IX_COUNT];
170 int32_t i, offset, size;
171
172 headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
173 if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) return 0;
174 pInfo = (const UDataInfo *)((const char *)inData + 4);
175 if (!(pInfo->dataFormat[0] == 0x44 &&
176 pInfo->dataFormat[1] == 0x69 &&
177 pInfo->dataFormat[2] == 0x63 &&
178 pInfo->dataFormat[3] == 0x74 &&
179 pInfo->formatVersion[0] == 1)) {
180 udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181 pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
182 *pErrorCode = U_UNSUPPORTED_ERROR;
183 return 0;
184 }
185
186 inBytes = (const uint8_t *)inData + headerSize;
187 outBytes = (outData == nullptr) ? nullptr : (uint8_t *)outData + headerSize;
188
189 inIndexes = (const int32_t *)inBytes;
190 if (length >= 0) {
191 length -= headerSize;
192 if (length < (int32_t)(sizeof(indexes))) {
193 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
194 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195 return 0;
196 }
197 }
198
199 for (i = 0; i < DictionaryData::IX_COUNT; i++) {
200 indexes[i] = udata_readInt32(ds, inIndexes[i]);
201 }
202
203 size = indexes[DictionaryData::IX_TOTAL_SIZE];
204
205 if (length >= 0) {
206 if (length < size) {
207 udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
208 *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
209 return 0;
210 }
211
212 if (inBytes != outBytes) {
213 uprv_memcpy(outBytes, inBytes, size);
214 }
215
216 offset = 0;
217 ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218 offset = (int32_t)sizeof(indexes);
219 int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220 int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221
222 if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223 ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224 } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225 // nothing to do
226 } else {
227 udata_printError(ds, "udict_swap(): unknown trie type!\n");
228 *pErrorCode = U_UNSUPPORTED_ERROR;
229 return 0;
230 }
231
232 // these next two sections are empty in the current format,
233 // but may be used later.
234 offset = nextOffset;
235 nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236 offset = nextOffset;
237 nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238 offset = nextOffset;
239 }
240 return headerSize + size;
241 }
242 #endif
243