• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2012, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * dictionarydata.cpp
7 *
8 * created on: 2012may31
9 * created by: Markus W. Scherer & Maxime Serrano
10 */
11 
12 #include "dictionarydata.h"
13 #include "unicode/ucharstrie.h"
14 #include "unicode/bytestrie.h"
15 #include "unicode/udata.h"
16 #include "cmemory.h"
17 
18 #if !UCONFIG_NO_BREAK_ITERATION
19 
20 U_NAMESPACE_BEGIN
21 
22 #ifndef CYGWINMSVC /* On Cygwin/MSVC, the error redefinition of symbols occurs.*/
23 const int32_t DictionaryData::TRIE_TYPE_BYTES;
24 const int32_t DictionaryData::TRIE_TYPE_UCHARS;
25 #endif
26 
~DictionaryMatcher()27 DictionaryMatcher::~DictionaryMatcher() {
28 }
29 
~UCharsDictionaryMatcher()30 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
31     udata_close(file);
32 }
33 
getType() const34 int32_t UCharsDictionaryMatcher::getType() const {
35     return DictionaryData::TRIE_TYPE_UCHARS;
36 }
37 
matches(UText * text,int32_t maxLength,int32_t * lengths,int32_t & count,int32_t limit,int32_t * values) const38 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
39     UCharsTrie uct(characters);
40     UChar32 c = utext_next32(text);
41     if (c < 0) {
42         return 0;
43     }
44     UStringTrieResult result = uct.first(c);
45     int32_t numChars = 1;
46     count = 0;
47     for (;;) {
48         if (USTRINGTRIE_HAS_VALUE(result)) {
49             if (count < limit) {
50                 if (values != NULL) {
51                     values[count] = uct.getValue();
52                 }
53                 lengths[count++] = numChars;
54             }
55             if (result == USTRINGTRIE_FINAL_VALUE) {
56                 break;
57             }
58         }
59         else if (result == USTRINGTRIE_NO_MATCH) {
60             break;
61         }
62 
63         // TODO: why do we have a text limit if the UText knows its length?
64         if (numChars >= maxLength) {
65             break;
66         }
67 
68         c = utext_next32(text);
69         if (c < 0) {
70             break;
71         }
72         ++numChars;
73         result = uct.next(c);
74     }
75     return numChars;
76 }
77 
~BytesDictionaryMatcher()78 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
79     udata_close(file);
80 }
81 
transform(UChar32 c) const82 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
83     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
84         if (c == 0x200D) {
85             return 0xFF;
86         } else if (c == 0x200C) {
87             return 0xFE;
88         }
89         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
90         if (delta < 0 || 0xFD < delta) {
91             return U_SENTINEL;
92         }
93         return (UChar32)delta;
94     }
95     return c;
96 }
97 
getType() const98 int32_t BytesDictionaryMatcher::getType() const {
99     return DictionaryData::TRIE_TYPE_BYTES;
100 }
101 
matches(UText * text,int32_t maxLength,int32_t * lengths,int32_t & count,int32_t limit,int32_t * values) const102 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
103     BytesTrie bt(characters);
104     UChar32 c = utext_next32(text);
105     if (c < 0) {
106         return 0;
107     }
108     UStringTrieResult result = bt.first(transform(c));
109     int32_t numChars = 1;
110     count = 0;
111     for (;;) {
112         if (USTRINGTRIE_HAS_VALUE(result)) {
113             if (count < limit) {
114                 if (values != NULL) {
115                     values[count] = bt.getValue();
116             }
117                 lengths[count++] = numChars;
118             }
119             if (result == USTRINGTRIE_FINAL_VALUE) {
120                 break;
121             }
122         }
123         else if (result == USTRINGTRIE_NO_MATCH) {
124             break;
125         }
126 
127         // TODO: why do we have a text limit if the UText knows its length?
128         if (numChars >= maxLength) {
129             break;
130         }
131 
132         c = utext_next32(text);
133         if (c < 0) {
134             break;
135         }
136         ++numChars;
137         result = bt.next(transform(c));
138     }
139     return numChars;
140 }
141 
142 
143 U_NAMESPACE_END
144 
145 U_NAMESPACE_USE
146 
147 U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)148 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
149            void *outData, UErrorCode *pErrorCode) {
150     const UDataInfo *pInfo;
151     int32_t headerSize;
152     const uint8_t *inBytes;
153     uint8_t *outBytes;
154     const int32_t *inIndexes;
155     int32_t indexes[DictionaryData::IX_COUNT];
156     int32_t i, offset, size;
157 
158     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
159     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
160     pInfo = (const UDataInfo *)((const char *)inData + 4);
161     if (!(pInfo->dataFormat[0] == 0x44 &&
162           pInfo->dataFormat[1] == 0x69 &&
163           pInfo->dataFormat[2] == 0x63 &&
164           pInfo->dataFormat[3] == 0x74 &&
165           pInfo->formatVersion[0] == 1)) {
166         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
167                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
168         *pErrorCode = U_UNSUPPORTED_ERROR;
169         return 0;
170     }
171 
172     inBytes = (const uint8_t *)inData + headerSize;
173     outBytes = (uint8_t *)outData + headerSize;
174 
175     inIndexes = (const int32_t *)inBytes;
176     if (length >= 0) {
177         length -= headerSize;
178         if (length < (int32_t)(sizeof(indexes))) {
179             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
180             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
181             return 0;
182         }
183     }
184 
185     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
186         indexes[i] = udata_readInt32(ds, inIndexes[i]);
187     }
188 
189     size = indexes[DictionaryData::IX_TOTAL_SIZE];
190 
191     if (length >= 0) {
192         if (length < size) {
193             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
194             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195             return 0;
196         }
197 
198         if (inBytes != outBytes) {
199             uprv_memcpy(outBytes, inBytes, size);
200         }
201 
202         offset = 0;
203         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
204         offset = (int32_t)sizeof(indexes);
205         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
206         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
207 
208         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
209             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
210         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
211             // nothing to do
212         } else {
213             udata_printError(ds, "udict_swap(): unknown trie type!\n");
214             *pErrorCode = U_UNSUPPORTED_ERROR;
215             return 0;
216         }
217 
218         // these next two sections are empty in the current format,
219         // but may be used later.
220         offset = nextOffset;
221         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
222         offset = nextOffset;
223         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
224         offset = nextOffset;
225     }
226     return headerSize + size;
227 }
228 #endif
229