1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucharstrie.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010nov14
12 * created by: Markus W. Scherer
13 */
14
15 #include "unicode/utypes.h"
16 #include "unicode/appendable.h"
17 #include "unicode/ucharstrie.h"
18 #include "unicode/uobject.h"
19 #include "unicode/utf16.h"
20 #include "cmemory.h"
21 #include "uassert.h"
22
23 U_NAMESPACE_BEGIN
24
~UCharsTrie()25 UCharsTrie::~UCharsTrie() {
26 uprv_free(ownedArray_);
27 }
28
29 UStringTrieResult
current() const30 UCharsTrie::current() const {
31 const UChar *pos=pos_;
32 if(pos==NULL) {
33 return USTRINGTRIE_NO_MATCH;
34 } else {
35 int32_t node;
36 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
37 valueResult(node) : USTRINGTRIE_NO_VALUE;
38 }
39 }
40
41 UStringTrieResult
firstForCodePoint(UChar32 cp)42 UCharsTrie::firstForCodePoint(UChar32 cp) {
43 return cp<=0xffff ?
44 first(cp) :
45 (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
46 next(U16_TRAIL(cp)) :
47 USTRINGTRIE_NO_MATCH);
48 }
49
50 UStringTrieResult
nextForCodePoint(UChar32 cp)51 UCharsTrie::nextForCodePoint(UChar32 cp) {
52 return cp<=0xffff ?
53 next(cp) :
54 (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
55 next(U16_TRAIL(cp)) :
56 USTRINGTRIE_NO_MATCH);
57 }
58
59 UStringTrieResult
branchNext(const UChar * pos,int32_t length,int32_t uchar)60 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
61 // Branch according to the current unit.
62 if(length==0) {
63 length=*pos++;
64 }
65 ++length;
66 // The length of the branch is the number of units to select from.
67 // The data structure encodes a binary search.
68 while(length>kMaxBranchLinearSubNodeLength) {
69 if(uchar<*pos++) {
70 length>>=1;
71 pos=jumpByDelta(pos);
72 } else {
73 length=length-(length>>1);
74 pos=skipDelta(pos);
75 }
76 }
77 // Drop down to linear search for the last few units.
78 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
79 // and divides length by 2.
80 do {
81 if(uchar==*pos++) {
82 UStringTrieResult result;
83 int32_t node=*pos;
84 if(node&kValueIsFinal) {
85 // Leave the final value for getValue() to read.
86 result=USTRINGTRIE_FINAL_VALUE;
87 } else {
88 // Use the non-final value as the jump delta.
89 ++pos;
90 // int32_t delta=readValue(pos, node);
91 int32_t delta;
92 if(node<kMinTwoUnitValueLead) {
93 delta=node;
94 } else if(node<kThreeUnitValueLead) {
95 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
96 } else {
97 delta=(pos[0]<<16)|pos[1];
98 pos+=2;
99 }
100 // end readValue()
101 pos+=delta;
102 node=*pos;
103 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
104 }
105 pos_=pos;
106 return result;
107 }
108 --length;
109 pos=skipValue(pos);
110 } while(length>1);
111 if(uchar==*pos++) {
112 pos_=pos;
113 int32_t node=*pos;
114 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
115 } else {
116 stop();
117 return USTRINGTRIE_NO_MATCH;
118 }
119 }
120
121 UStringTrieResult
nextImpl(const UChar * pos,int32_t uchar)122 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
123 int32_t node=*pos++;
124 for(;;) {
125 if(node<kMinLinearMatch) {
126 return branchNext(pos, node, uchar);
127 } else if(node<kMinValueLead) {
128 // Match the first of length+1 units.
129 int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
130 if(uchar==*pos++) {
131 remainingMatchLength_=--length;
132 pos_=pos;
133 return (length<0 && (node=*pos)>=kMinValueLead) ?
134 valueResult(node) : USTRINGTRIE_NO_VALUE;
135 } else {
136 // No match.
137 break;
138 }
139 } else if(node&kValueIsFinal) {
140 // No further matching units.
141 break;
142 } else {
143 // Skip intermediate value.
144 pos=skipNodeValue(pos, node);
145 node&=kNodeTypeMask;
146 }
147 }
148 stop();
149 return USTRINGTRIE_NO_MATCH;
150 }
151
152 UStringTrieResult
next(int32_t uchar)153 UCharsTrie::next(int32_t uchar) {
154 const UChar *pos=pos_;
155 if(pos==NULL) {
156 return USTRINGTRIE_NO_MATCH;
157 }
158 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
159 if(length>=0) {
160 // Remaining part of a linear-match node.
161 if(uchar==*pos++) {
162 remainingMatchLength_=--length;
163 pos_=pos;
164 int32_t node;
165 return (length<0 && (node=*pos)>=kMinValueLead) ?
166 valueResult(node) : USTRINGTRIE_NO_VALUE;
167 } else {
168 stop();
169 return USTRINGTRIE_NO_MATCH;
170 }
171 }
172 return nextImpl(pos, uchar);
173 }
174
175 UStringTrieResult
next(const UChar * s,int32_t sLength)176 UCharsTrie::next(const UChar *s, int32_t sLength) {
177 if(sLength<0 ? *s==0 : sLength==0) {
178 // Empty input.
179 return current();
180 }
181 const UChar *pos=pos_;
182 if(pos==NULL) {
183 return USTRINGTRIE_NO_MATCH;
184 }
185 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
186 for(;;) {
187 // Fetch the next input unit, if there is one.
188 // Continue a linear-match node without rechecking sLength<0.
189 int32_t uchar;
190 if(sLength<0) {
191 for(;;) {
192 if((uchar=*s++)==0) {
193 remainingMatchLength_=length;
194 pos_=pos;
195 int32_t node;
196 return (length<0 && (node=*pos)>=kMinValueLead) ?
197 valueResult(node) : USTRINGTRIE_NO_VALUE;
198 }
199 if(length<0) {
200 remainingMatchLength_=length;
201 break;
202 }
203 if(uchar!=*pos) {
204 stop();
205 return USTRINGTRIE_NO_MATCH;
206 }
207 ++pos;
208 --length;
209 }
210 } else {
211 for(;;) {
212 if(sLength==0) {
213 remainingMatchLength_=length;
214 pos_=pos;
215 int32_t node;
216 return (length<0 && (node=*pos)>=kMinValueLead) ?
217 valueResult(node) : USTRINGTRIE_NO_VALUE;
218 }
219 uchar=*s++;
220 --sLength;
221 if(length<0) {
222 remainingMatchLength_=length;
223 break;
224 }
225 if(uchar!=*pos) {
226 stop();
227 return USTRINGTRIE_NO_MATCH;
228 }
229 ++pos;
230 --length;
231 }
232 }
233 int32_t node=*pos++;
234 for(;;) {
235 if(node<kMinLinearMatch) {
236 UStringTrieResult result=branchNext(pos, node, uchar);
237 if(result==USTRINGTRIE_NO_MATCH) {
238 return USTRINGTRIE_NO_MATCH;
239 }
240 // Fetch the next input unit, if there is one.
241 if(sLength<0) {
242 if((uchar=*s++)==0) {
243 return result;
244 }
245 } else {
246 if(sLength==0) {
247 return result;
248 }
249 uchar=*s++;
250 --sLength;
251 }
252 if(result==USTRINGTRIE_FINAL_VALUE) {
253 // No further matching units.
254 stop();
255 return USTRINGTRIE_NO_MATCH;
256 }
257 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
258 node=*pos++;
259 } else if(node<kMinValueLead) {
260 // Match length+1 units.
261 length=node-kMinLinearMatch; // Actual match length minus 1.
262 if(uchar!=*pos) {
263 stop();
264 return USTRINGTRIE_NO_MATCH;
265 }
266 ++pos;
267 --length;
268 break;
269 } else if(node&kValueIsFinal) {
270 // No further matching units.
271 stop();
272 return USTRINGTRIE_NO_MATCH;
273 } else {
274 // Skip intermediate value.
275 pos=skipNodeValue(pos, node);
276 node&=kNodeTypeMask;
277 }
278 }
279 }
280 }
281
282 const UChar *
findUniqueValueFromBranch(const UChar * pos,int32_t length,UBool haveUniqueValue,int32_t & uniqueValue)283 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
284 UBool haveUniqueValue, int32_t &uniqueValue) {
285 while(length>kMaxBranchLinearSubNodeLength) {
286 ++pos; // ignore the comparison unit
287 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
288 return NULL;
289 }
290 length=length-(length>>1);
291 pos=skipDelta(pos);
292 }
293 do {
294 ++pos; // ignore a comparison unit
295 // handle its value
296 int32_t node=*pos++;
297 UBool isFinal=(UBool)(node>>15);
298 node&=0x7fff;
299 int32_t value=readValue(pos, node);
300 pos=skipValue(pos, node);
301 if(isFinal) {
302 if(haveUniqueValue) {
303 if(value!=uniqueValue) {
304 return NULL;
305 }
306 } else {
307 uniqueValue=value;
308 haveUniqueValue=TRUE;
309 }
310 } else {
311 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
312 return NULL;
313 }
314 haveUniqueValue=TRUE;
315 }
316 } while(--length>1);
317 return pos+1; // ignore the last comparison unit
318 }
319
320 UBool
findUniqueValue(const UChar * pos,UBool haveUniqueValue,int32_t & uniqueValue)321 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
322 int32_t node=*pos++;
323 for(;;) {
324 if(node<kMinLinearMatch) {
325 if(node==0) {
326 node=*pos++;
327 }
328 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
329 if(pos==NULL) {
330 return FALSE;
331 }
332 haveUniqueValue=TRUE;
333 node=*pos++;
334 } else if(node<kMinValueLead) {
335 // linear-match node
336 pos+=node-kMinLinearMatch+1; // Ignore the match units.
337 node=*pos++;
338 } else {
339 UBool isFinal=(UBool)(node>>15);
340 int32_t value;
341 if(isFinal) {
342 value=readValue(pos, node&0x7fff);
343 } else {
344 value=readNodeValue(pos, node);
345 }
346 if(haveUniqueValue) {
347 if(value!=uniqueValue) {
348 return FALSE;
349 }
350 } else {
351 uniqueValue=value;
352 haveUniqueValue=TRUE;
353 }
354 if(isFinal) {
355 return TRUE;
356 }
357 pos=skipNodeValue(pos, node);
358 node&=kNodeTypeMask;
359 }
360 }
361 }
362
363 int32_t
getNextUChars(Appendable & out) const364 UCharsTrie::getNextUChars(Appendable &out) const {
365 const UChar *pos=pos_;
366 if(pos==NULL) {
367 return 0;
368 }
369 if(remainingMatchLength_>=0) {
370 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node.
371 return 1;
372 }
373 int32_t node=*pos++;
374 if(node>=kMinValueLead) {
375 if(node&kValueIsFinal) {
376 return 0;
377 } else {
378 pos=skipNodeValue(pos, node);
379 node&=kNodeTypeMask;
380 }
381 }
382 if(node<kMinLinearMatch) {
383 if(node==0) {
384 node=*pos++;
385 }
386 out.reserveAppendCapacity(++node);
387 getNextBranchUChars(pos, node, out);
388 return node;
389 } else {
390 // First unit of the linear-match node.
391 out.appendCodeUnit(*pos);
392 return 1;
393 }
394 }
395
396 void
getNextBranchUChars(const UChar * pos,int32_t length,Appendable & out)397 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
398 while(length>kMaxBranchLinearSubNodeLength) {
399 ++pos; // ignore the comparison unit
400 getNextBranchUChars(jumpByDelta(pos), length>>1, out);
401 length=length-(length>>1);
402 pos=skipDelta(pos);
403 }
404 do {
405 out.appendCodeUnit(*pos++);
406 pos=skipValue(pos);
407 } while(--length>1);
408 out.appendCodeUnit(*pos);
409 }
410
411 U_NAMESPACE_END
412