1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucharstrie.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010nov14
12 * created by: Markus W. Scherer
13 */
14
15 #include "unicode/utypes.h"
16 #include "unicode/appendable.h"
17 #include "unicode/ucharstrie.h"
18 #include "unicode/uobject.h"
19 #include "cmemory.h"
20 #include "uassert.h"
21
22 U_NAMESPACE_BEGIN
23
~UCharsTrie()24 UCharsTrie::~UCharsTrie() {
25 uprv_free(ownedArray_);
26 }
27
28 UStringTrieResult
current() const29 UCharsTrie::current() const {
30 const UChar *pos=pos_;
31 if(pos==NULL) {
32 return USTRINGTRIE_NO_MATCH;
33 } else {
34 int32_t node;
35 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
36 valueResult(node) : USTRINGTRIE_NO_VALUE;
37 }
38 }
39
40 UStringTrieResult
branchNext(const UChar * pos,int32_t length,int32_t uchar)41 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
42 // Branch according to the current unit.
43 if(length==0) {
44 length=*pos++;
45 }
46 ++length;
47 // The length of the branch is the number of units to select from.
48 // The data structure encodes a binary search.
49 while(length>kMaxBranchLinearSubNodeLength) {
50 if(uchar<*pos++) {
51 length>>=1;
52 pos=jumpByDelta(pos);
53 } else {
54 length=length-(length>>1);
55 pos=skipDelta(pos);
56 }
57 }
58 // Drop down to linear search for the last few units.
59 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
60 // and divides length by 2.
61 do {
62 if(uchar==*pos++) {
63 UStringTrieResult result;
64 int32_t node=*pos;
65 if(node&kValueIsFinal) {
66 // Leave the final value for getValue() to read.
67 result=USTRINGTRIE_FINAL_VALUE;
68 } else {
69 // Use the non-final value as the jump delta.
70 ++pos;
71 // int32_t delta=readValue(pos, node);
72 int32_t delta;
73 if(node<kMinTwoUnitValueLead) {
74 delta=node;
75 } else if(node<kThreeUnitValueLead) {
76 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
77 } else {
78 delta=(pos[0]<<16)|pos[1];
79 pos+=2;
80 }
81 // end readValue()
82 pos+=delta;
83 node=*pos;
84 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
85 }
86 pos_=pos;
87 return result;
88 }
89 --length;
90 pos=skipValue(pos);
91 } while(length>1);
92 if(uchar==*pos++) {
93 pos_=pos;
94 int32_t node=*pos;
95 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
96 } else {
97 stop();
98 return USTRINGTRIE_NO_MATCH;
99 }
100 }
101
102 UStringTrieResult
nextImpl(const UChar * pos,int32_t uchar)103 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) {
104 int32_t node=*pos++;
105 for(;;) {
106 if(node<kMinLinearMatch) {
107 return branchNext(pos, node, uchar);
108 } else if(node<kMinValueLead) {
109 // Match the first of length+1 units.
110 int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
111 if(uchar==*pos++) {
112 remainingMatchLength_=--length;
113 pos_=pos;
114 return (length<0 && (node=*pos)>=kMinValueLead) ?
115 valueResult(node) : USTRINGTRIE_NO_VALUE;
116 } else {
117 // No match.
118 break;
119 }
120 } else if(node&kValueIsFinal) {
121 // No further matching units.
122 break;
123 } else {
124 // Skip intermediate value.
125 pos=skipNodeValue(pos, node);
126 node&=kNodeTypeMask;
127 }
128 }
129 stop();
130 return USTRINGTRIE_NO_MATCH;
131 }
132
133 UStringTrieResult
next(int32_t uchar)134 UCharsTrie::next(int32_t uchar) {
135 const UChar *pos=pos_;
136 if(pos==NULL) {
137 return USTRINGTRIE_NO_MATCH;
138 }
139 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
140 if(length>=0) {
141 // Remaining part of a linear-match node.
142 if(uchar==*pos++) {
143 remainingMatchLength_=--length;
144 pos_=pos;
145 int32_t node;
146 return (length<0 && (node=*pos)>=kMinValueLead) ?
147 valueResult(node) : USTRINGTRIE_NO_VALUE;
148 } else {
149 stop();
150 return USTRINGTRIE_NO_MATCH;
151 }
152 }
153 return nextImpl(pos, uchar);
154 }
155
156 UStringTrieResult
next(const UChar * s,int32_t sLength)157 UCharsTrie::next(const UChar *s, int32_t sLength) {
158 if(sLength<0 ? *s==0 : sLength==0) {
159 // Empty input.
160 return current();
161 }
162 const UChar *pos=pos_;
163 if(pos==NULL) {
164 return USTRINGTRIE_NO_MATCH;
165 }
166 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
167 for(;;) {
168 // Fetch the next input unit, if there is one.
169 // Continue a linear-match node without rechecking sLength<0.
170 int32_t uchar;
171 if(sLength<0) {
172 for(;;) {
173 if((uchar=*s++)==0) {
174 remainingMatchLength_=length;
175 pos_=pos;
176 int32_t node;
177 return (length<0 && (node=*pos)>=kMinValueLead) ?
178 valueResult(node) : USTRINGTRIE_NO_VALUE;
179 }
180 if(length<0) {
181 remainingMatchLength_=length;
182 break;
183 }
184 if(uchar!=*pos) {
185 stop();
186 return USTRINGTRIE_NO_MATCH;
187 }
188 ++pos;
189 --length;
190 }
191 } else {
192 for(;;) {
193 if(sLength==0) {
194 remainingMatchLength_=length;
195 pos_=pos;
196 int32_t node;
197 return (length<0 && (node=*pos)>=kMinValueLead) ?
198 valueResult(node) : USTRINGTRIE_NO_VALUE;
199 }
200 uchar=*s++;
201 --sLength;
202 if(length<0) {
203 remainingMatchLength_=length;
204 break;
205 }
206 if(uchar!=*pos) {
207 stop();
208 return USTRINGTRIE_NO_MATCH;
209 }
210 ++pos;
211 --length;
212 }
213 }
214 int32_t node=*pos++;
215 for(;;) {
216 if(node<kMinLinearMatch) {
217 UStringTrieResult result=branchNext(pos, node, uchar);
218 if(result==USTRINGTRIE_NO_MATCH) {
219 return USTRINGTRIE_NO_MATCH;
220 }
221 // Fetch the next input unit, if there is one.
222 if(sLength<0) {
223 if((uchar=*s++)==0) {
224 return result;
225 }
226 } else {
227 if(sLength==0) {
228 return result;
229 }
230 uchar=*s++;
231 --sLength;
232 }
233 if(result==USTRINGTRIE_FINAL_VALUE) {
234 // No further matching units.
235 stop();
236 return USTRINGTRIE_NO_MATCH;
237 }
238 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
239 node=*pos++;
240 } else if(node<kMinValueLead) {
241 // Match length+1 units.
242 length=node-kMinLinearMatch; // Actual match length minus 1.
243 if(uchar!=*pos) {
244 stop();
245 return USTRINGTRIE_NO_MATCH;
246 }
247 ++pos;
248 --length;
249 break;
250 } else if(node&kValueIsFinal) {
251 // No further matching units.
252 stop();
253 return USTRINGTRIE_NO_MATCH;
254 } else {
255 // Skip intermediate value.
256 pos=skipNodeValue(pos, node);
257 node&=kNodeTypeMask;
258 }
259 }
260 }
261 }
262
263 const UChar *
findUniqueValueFromBranch(const UChar * pos,int32_t length,UBool haveUniqueValue,int32_t & uniqueValue)264 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
265 UBool haveUniqueValue, int32_t &uniqueValue) {
266 while(length>kMaxBranchLinearSubNodeLength) {
267 ++pos; // ignore the comparison unit
268 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
269 return NULL;
270 }
271 length=length-(length>>1);
272 pos=skipDelta(pos);
273 }
274 do {
275 ++pos; // ignore a comparison unit
276 // handle its value
277 int32_t node=*pos++;
278 UBool isFinal=(UBool)(node>>15);
279 node&=0x7fff;
280 int32_t value=readValue(pos, node);
281 pos=skipValue(pos, node);
282 if(isFinal) {
283 if(haveUniqueValue) {
284 if(value!=uniqueValue) {
285 return NULL;
286 }
287 } else {
288 uniqueValue=value;
289 haveUniqueValue=TRUE;
290 }
291 } else {
292 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
293 return NULL;
294 }
295 haveUniqueValue=TRUE;
296 }
297 } while(--length>1);
298 return pos+1; // ignore the last comparison unit
299 }
300
301 UBool
findUniqueValue(const UChar * pos,UBool haveUniqueValue,int32_t & uniqueValue)302 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
303 int32_t node=*pos++;
304 for(;;) {
305 if(node<kMinLinearMatch) {
306 if(node==0) {
307 node=*pos++;
308 }
309 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
310 if(pos==NULL) {
311 return FALSE;
312 }
313 haveUniqueValue=TRUE;
314 node=*pos++;
315 } else if(node<kMinValueLead) {
316 // linear-match node
317 pos+=node-kMinLinearMatch+1; // Ignore the match units.
318 node=*pos++;
319 } else {
320 UBool isFinal=(UBool)(node>>15);
321 int32_t value;
322 if(isFinal) {
323 value=readValue(pos, node&0x7fff);
324 } else {
325 value=readNodeValue(pos, node);
326 }
327 if(haveUniqueValue) {
328 if(value!=uniqueValue) {
329 return FALSE;
330 }
331 } else {
332 uniqueValue=value;
333 haveUniqueValue=TRUE;
334 }
335 if(isFinal) {
336 return TRUE;
337 }
338 pos=skipNodeValue(pos, node);
339 node&=kNodeTypeMask;
340 }
341 }
342 }
343
344 int32_t
getNextUChars(Appendable & out) const345 UCharsTrie::getNextUChars(Appendable &out) const {
346 const UChar *pos=pos_;
347 if(pos==NULL) {
348 return 0;
349 }
350 if(remainingMatchLength_>=0) {
351 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node.
352 return 1;
353 }
354 int32_t node=*pos++;
355 if(node>=kMinValueLead) {
356 if(node&kValueIsFinal) {
357 return 0;
358 } else {
359 pos=skipNodeValue(pos, node);
360 node&=kNodeTypeMask;
361 }
362 }
363 if(node<kMinLinearMatch) {
364 if(node==0) {
365 node=*pos++;
366 }
367 out.reserveAppendCapacity(++node);
368 getNextBranchUChars(pos, node, out);
369 return node;
370 } else {
371 // First unit of the linear-match node.
372 out.appendCodeUnit(*pos);
373 return 1;
374 }
375 }
376
377 void
getNextBranchUChars(const UChar * pos,int32_t length,Appendable & out)378 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
379 while(length>kMaxBranchLinearSubNodeLength) {
380 ++pos; // ignore the comparison unit
381 getNextBranchUChars(jumpByDelta(pos), length>>1, out);
382 length=length-(length>>1);
383 pos=skipDelta(pos);
384 }
385 do {
386 out.appendCodeUnit(*pos++);
387 pos=skipValue(pos);
388 } while(--length>1);
389 out.appendCodeUnit(*pos);
390 }
391
392 U_NAMESPACE_END
393