1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: bytestrie.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010sep25
12 * created by: Markus W. Scherer
13 */
14
15 #include "unicode/utypes.h"
16 #include "unicode/bytestream.h"
17 #include "unicode/bytestrie.h"
18 #include "unicode/uobject.h"
19 #include "cmemory.h"
20 #include "uassert.h"
21
22 U_NAMESPACE_BEGIN
23
~BytesTrie()24 BytesTrie::~BytesTrie() {
25 uprv_free(ownedArray_);
26 }
27
28 // lead byte already shifted right by 1.
29 int32_t
readValue(const uint8_t * pos,int32_t leadByte)30 BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
31 int32_t value;
32 if(leadByte<kMinTwoByteValueLead) {
33 value=leadByte-kMinOneByteValueLead;
34 } else if(leadByte<kMinThreeByteValueLead) {
35 value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
36 } else if(leadByte<kFourByteValueLead) {
37 value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
38 } else if(leadByte==kFourByteValueLead) {
39 value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
40 } else {
41 value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
42 }
43 return value;
44 }
45
46 const uint8_t *
jumpByDelta(const uint8_t * pos)47 BytesTrie::jumpByDelta(const uint8_t *pos) {
48 int32_t delta=*pos++;
49 if(delta<kMinTwoByteDeltaLead) {
50 // nothing to do
51 } else if(delta<kMinThreeByteDeltaLead) {
52 delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
53 } else if(delta<kFourByteDeltaLead) {
54 delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
55 pos+=2;
56 } else if(delta==kFourByteDeltaLead) {
57 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
58 pos+=3;
59 } else {
60 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
61 pos+=4;
62 }
63 return pos+delta;
64 }
65
66 UStringTrieResult
current() const67 BytesTrie::current() const {
68 const uint8_t *pos=pos_;
69 if(pos==NULL) {
70 return USTRINGTRIE_NO_MATCH;
71 } else {
72 int32_t node;
73 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
74 valueResult(node) : USTRINGTRIE_NO_VALUE;
75 }
76 }
77
78 UStringTrieResult
branchNext(const uint8_t * pos,int32_t length,int32_t inByte)79 BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
80 // Branch according to the current byte.
81 if(length==0) {
82 length=*pos++;
83 }
84 ++length;
85 // The length of the branch is the number of bytes to select from.
86 // The data structure encodes a binary search.
87 while(length>kMaxBranchLinearSubNodeLength) {
88 if(inByte<*pos++) {
89 length>>=1;
90 pos=jumpByDelta(pos);
91 } else {
92 length=length-(length>>1);
93 pos=skipDelta(pos);
94 }
95 }
96 // Drop down to linear search for the last few bytes.
97 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
98 // and divides length by 2.
99 do {
100 if(inByte==*pos++) {
101 UStringTrieResult result;
102 int32_t node=*pos;
103 U_ASSERT(node>=kMinValueLead);
104 if(node&kValueIsFinal) {
105 // Leave the final value for getValue() to read.
106 result=USTRINGTRIE_FINAL_VALUE;
107 } else {
108 // Use the non-final value as the jump delta.
109 ++pos;
110 // int32_t delta=readValue(pos, node>>1);
111 node>>=1;
112 int32_t delta;
113 if(node<kMinTwoByteValueLead) {
114 delta=node-kMinOneByteValueLead;
115 } else if(node<kMinThreeByteValueLead) {
116 delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
117 } else if(node<kFourByteValueLead) {
118 delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
119 pos+=2;
120 } else if(node==kFourByteValueLead) {
121 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
122 pos+=3;
123 } else {
124 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
125 pos+=4;
126 }
127 // end readValue()
128 pos+=delta;
129 node=*pos;
130 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
131 }
132 pos_=pos;
133 return result;
134 }
135 --length;
136 pos=skipValue(pos);
137 } while(length>1);
138 if(inByte==*pos++) {
139 pos_=pos;
140 int32_t node=*pos;
141 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
142 } else {
143 stop();
144 return USTRINGTRIE_NO_MATCH;
145 }
146 }
147
148 UStringTrieResult
nextImpl(const uint8_t * pos,int32_t inByte)149 BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
150 for(;;) {
151 int32_t node=*pos++;
152 if(node<kMinLinearMatch) {
153 return branchNext(pos, node, inByte);
154 } else if(node<kMinValueLead) {
155 // Match the first of length+1 bytes.
156 int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
157 if(inByte==*pos++) {
158 remainingMatchLength_=--length;
159 pos_=pos;
160 return (length<0 && (node=*pos)>=kMinValueLead) ?
161 valueResult(node) : USTRINGTRIE_NO_VALUE;
162 } else {
163 // No match.
164 break;
165 }
166 } else if(node&kValueIsFinal) {
167 // No further matching bytes.
168 break;
169 } else {
170 // Skip intermediate value.
171 pos=skipValue(pos, node);
172 // The next node must not also be a value node.
173 U_ASSERT(*pos<kMinValueLead);
174 }
175 }
176 stop();
177 return USTRINGTRIE_NO_MATCH;
178 }
179
180 UStringTrieResult
next(int32_t inByte)181 BytesTrie::next(int32_t inByte) {
182 const uint8_t *pos=pos_;
183 if(pos==NULL) {
184 return USTRINGTRIE_NO_MATCH;
185 }
186 if(inByte<0) {
187 inByte+=0x100;
188 }
189 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
190 if(length>=0) {
191 // Remaining part of a linear-match node.
192 if(inByte==*pos++) {
193 remainingMatchLength_=--length;
194 pos_=pos;
195 int32_t node;
196 return (length<0 && (node=*pos)>=kMinValueLead) ?
197 valueResult(node) : USTRINGTRIE_NO_VALUE;
198 } else {
199 stop();
200 return USTRINGTRIE_NO_MATCH;
201 }
202 }
203 return nextImpl(pos, inByte);
204 }
205
206 UStringTrieResult
next(const char * s,int32_t sLength)207 BytesTrie::next(const char *s, int32_t sLength) {
208 if(sLength<0 ? *s==0 : sLength==0) {
209 // Empty input.
210 return current();
211 }
212 const uint8_t *pos=pos_;
213 if(pos==NULL) {
214 return USTRINGTRIE_NO_MATCH;
215 }
216 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
217 for(;;) {
218 // Fetch the next input byte, if there is one.
219 // Continue a linear-match node without rechecking sLength<0.
220 int32_t inByte;
221 if(sLength<0) {
222 for(;;) {
223 if((inByte=*s++)==0) {
224 remainingMatchLength_=length;
225 pos_=pos;
226 int32_t node;
227 return (length<0 && (node=*pos)>=kMinValueLead) ?
228 valueResult(node) : USTRINGTRIE_NO_VALUE;
229 }
230 if(length<0) {
231 remainingMatchLength_=length;
232 break;
233 }
234 if(inByte!=*pos) {
235 stop();
236 return USTRINGTRIE_NO_MATCH;
237 }
238 ++pos;
239 --length;
240 }
241 } else {
242 for(;;) {
243 if(sLength==0) {
244 remainingMatchLength_=length;
245 pos_=pos;
246 int32_t node;
247 return (length<0 && (node=*pos)>=kMinValueLead) ?
248 valueResult(node) : USTRINGTRIE_NO_VALUE;
249 }
250 inByte=*s++;
251 --sLength;
252 if(length<0) {
253 remainingMatchLength_=length;
254 break;
255 }
256 if(inByte!=*pos) {
257 stop();
258 return USTRINGTRIE_NO_MATCH;
259 }
260 ++pos;
261 --length;
262 }
263 }
264 for(;;) {
265 int32_t node=*pos++;
266 if(node<kMinLinearMatch) {
267 UStringTrieResult result=branchNext(pos, node, inByte);
268 if(result==USTRINGTRIE_NO_MATCH) {
269 return USTRINGTRIE_NO_MATCH;
270 }
271 // Fetch the next input byte, if there is one.
272 if(sLength<0) {
273 if((inByte=*s++)==0) {
274 return result;
275 }
276 } else {
277 if(sLength==0) {
278 return result;
279 }
280 inByte=*s++;
281 --sLength;
282 }
283 if(result==USTRINGTRIE_FINAL_VALUE) {
284 // No further matching bytes.
285 stop();
286 return USTRINGTRIE_NO_MATCH;
287 }
288 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
289 } else if(node<kMinValueLead) {
290 // Match length+1 bytes.
291 length=node-kMinLinearMatch; // Actual match length minus 1.
292 if(inByte!=*pos) {
293 stop();
294 return USTRINGTRIE_NO_MATCH;
295 }
296 ++pos;
297 --length;
298 break;
299 } else if(node&kValueIsFinal) {
300 // No further matching bytes.
301 stop();
302 return USTRINGTRIE_NO_MATCH;
303 } else {
304 // Skip intermediate value.
305 pos=skipValue(pos, node);
306 // The next node must not also be a value node.
307 U_ASSERT(*pos<kMinValueLead);
308 }
309 }
310 }
311 }
312
313 const uint8_t *
findUniqueValueFromBranch(const uint8_t * pos,int32_t length,UBool haveUniqueValue,int32_t & uniqueValue)314 BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
315 UBool haveUniqueValue, int32_t &uniqueValue) {
316 while(length>kMaxBranchLinearSubNodeLength) {
317 ++pos; // ignore the comparison byte
318 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
319 return NULL;
320 }
321 length=length-(length>>1);
322 pos=skipDelta(pos);
323 }
324 do {
325 ++pos; // ignore a comparison byte
326 // handle its value
327 int32_t node=*pos++;
328 UBool isFinal=(UBool)(node&kValueIsFinal);
329 int32_t value=readValue(pos, node>>1);
330 pos=skipValue(pos, node);
331 if(isFinal) {
332 if(haveUniqueValue) {
333 if(value!=uniqueValue) {
334 return NULL;
335 }
336 } else {
337 uniqueValue=value;
338 haveUniqueValue=TRUE;
339 }
340 } else {
341 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
342 return NULL;
343 }
344 haveUniqueValue=TRUE;
345 }
346 } while(--length>1);
347 return pos+1; // ignore the last comparison byte
348 }
349
350 UBool
findUniqueValue(const uint8_t * pos,UBool haveUniqueValue,int32_t & uniqueValue)351 BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
352 for(;;) {
353 int32_t node=*pos++;
354 if(node<kMinLinearMatch) {
355 if(node==0) {
356 node=*pos++;
357 }
358 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
359 if(pos==NULL) {
360 return FALSE;
361 }
362 haveUniqueValue=TRUE;
363 } else if(node<kMinValueLead) {
364 // linear-match node
365 pos+=node-kMinLinearMatch+1; // Ignore the match bytes.
366 } else {
367 UBool isFinal=(UBool)(node&kValueIsFinal);
368 int32_t value=readValue(pos, node>>1);
369 if(haveUniqueValue) {
370 if(value!=uniqueValue) {
371 return FALSE;
372 }
373 } else {
374 uniqueValue=value;
375 haveUniqueValue=TRUE;
376 }
377 if(isFinal) {
378 return TRUE;
379 }
380 pos=skipValue(pos, node);
381 }
382 }
383 }
384
385 int32_t
getNextBytes(ByteSink & out) const386 BytesTrie::getNextBytes(ByteSink &out) const {
387 const uint8_t *pos=pos_;
388 if(pos==NULL) {
389 return 0;
390 }
391 if(remainingMatchLength_>=0) {
392 append(out, *pos); // Next byte of a pending linear-match node.
393 return 1;
394 }
395 int32_t node=*pos++;
396 if(node>=kMinValueLead) {
397 if(node&kValueIsFinal) {
398 return 0;
399 } else {
400 pos=skipValue(pos, node);
401 node=*pos++;
402 U_ASSERT(node<kMinValueLead);
403 }
404 }
405 if(node<kMinLinearMatch) {
406 if(node==0) {
407 node=*pos++;
408 }
409 getNextBranchBytes(pos, ++node, out);
410 return node;
411 } else {
412 // First byte of the linear-match node.
413 append(out, *pos);
414 return 1;
415 }
416 }
417
418 void
getNextBranchBytes(const uint8_t * pos,int32_t length,ByteSink & out)419 BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
420 while(length>kMaxBranchLinearSubNodeLength) {
421 ++pos; // ignore the comparison byte
422 getNextBranchBytes(jumpByDelta(pos), length>>1, out);
423 length=length-(length>>1);
424 pos=skipDelta(pos);
425 }
426 do {
427 append(out, *pos++);
428 pos=skipValue(pos);
429 } while(--length>1);
430 append(out, *pos);
431 }
432
433 void
append(ByteSink & out,int c)434 BytesTrie::append(ByteSink &out, int c) {
435 char ch=(char)c;
436 out.Append(&ch, 1);
437 }
438
439 U_NAMESPACE_END
440