1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2011, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: bytestrie.cpp
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010sep25
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/bytestream.h"
19 #include "unicode/bytestrie.h"
20 #include "unicode/uobject.h"
21 #include "cmemory.h"
22 #include "uassert.h"
23
24 U_NAMESPACE_BEGIN
25
~BytesTrie()26 BytesTrie::~BytesTrie() {
27 uprv_free(ownedArray_);
28 }
29
30 // lead byte already shifted right by 1.
31 int32_t
readValue(const uint8_t * pos,int32_t leadByte)32 BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
33 int32_t value;
34 if(leadByte<kMinTwoByteValueLead) {
35 value=leadByte-kMinOneByteValueLead;
36 } else if(leadByte<kMinThreeByteValueLead) {
37 value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
38 } else if(leadByte<kFourByteValueLead) {
39 value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
40 } else if(leadByte==kFourByteValueLead) {
41 value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
42 } else {
43 value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
44 }
45 return value;
46 }
47
48 const uint8_t *
jumpByDelta(const uint8_t * pos)49 BytesTrie::jumpByDelta(const uint8_t *pos) {
50 int32_t delta=*pos++;
51 if(delta<kMinTwoByteDeltaLead) {
52 // nothing to do
53 } else if(delta<kMinThreeByteDeltaLead) {
54 delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
55 } else if(delta<kFourByteDeltaLead) {
56 delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
57 pos+=2;
58 } else if(delta==kFourByteDeltaLead) {
59 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
60 pos+=3;
61 } else {
62 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
63 pos+=4;
64 }
65 return pos+delta;
66 }
67
68 UStringTrieResult
current() const69 BytesTrie::current() const {
70 const uint8_t *pos=pos_;
71 if(pos==NULL) {
72 return USTRINGTRIE_NO_MATCH;
73 } else {
74 int32_t node;
75 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
76 valueResult(node) : USTRINGTRIE_NO_VALUE;
77 }
78 }
79
80 UStringTrieResult
branchNext(const uint8_t * pos,int32_t length,int32_t inByte)81 BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
82 // Branch according to the current byte.
83 if(length==0) {
84 length=*pos++;
85 }
86 ++length;
87 // The length of the branch is the number of bytes to select from.
88 // The data structure encodes a binary search.
89 while(length>kMaxBranchLinearSubNodeLength) {
90 if(inByte<*pos++) {
91 length>>=1;
92 pos=jumpByDelta(pos);
93 } else {
94 length=length-(length>>1);
95 pos=skipDelta(pos);
96 }
97 }
98 // Drop down to linear search for the last few bytes.
99 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
100 // and divides length by 2.
101 do {
102 if(inByte==*pos++) {
103 UStringTrieResult result;
104 int32_t node=*pos;
105 U_ASSERT(node>=kMinValueLead);
106 if(node&kValueIsFinal) {
107 // Leave the final value for getValue() to read.
108 result=USTRINGTRIE_FINAL_VALUE;
109 } else {
110 // Use the non-final value as the jump delta.
111 ++pos;
112 // int32_t delta=readValue(pos, node>>1);
113 node>>=1;
114 int32_t delta;
115 if(node<kMinTwoByteValueLead) {
116 delta=node-kMinOneByteValueLead;
117 } else if(node<kMinThreeByteValueLead) {
118 delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
119 } else if(node<kFourByteValueLead) {
120 delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
121 pos+=2;
122 } else if(node==kFourByteValueLead) {
123 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
124 pos+=3;
125 } else {
126 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
127 pos+=4;
128 }
129 // end readValue()
130 pos+=delta;
131 node=*pos;
132 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
133 }
134 pos_=pos;
135 return result;
136 }
137 --length;
138 pos=skipValue(pos);
139 } while(length>1);
140 if(inByte==*pos++) {
141 pos_=pos;
142 int32_t node=*pos;
143 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
144 } else {
145 stop();
146 return USTRINGTRIE_NO_MATCH;
147 }
148 }
149
150 UStringTrieResult
nextImpl(const uint8_t * pos,int32_t inByte)151 BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
152 for(;;) {
153 int32_t node=*pos++;
154 if(node<kMinLinearMatch) {
155 return branchNext(pos, node, inByte);
156 } else if(node<kMinValueLead) {
157 // Match the first of length+1 bytes.
158 int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
159 if(inByte==*pos++) {
160 remainingMatchLength_=--length;
161 pos_=pos;
162 return (length<0 && (node=*pos)>=kMinValueLead) ?
163 valueResult(node) : USTRINGTRIE_NO_VALUE;
164 } else {
165 // No match.
166 break;
167 }
168 } else if(node&kValueIsFinal) {
169 // No further matching bytes.
170 break;
171 } else {
172 // Skip intermediate value.
173 pos=skipValue(pos, node);
174 // The next node must not also be a value node.
175 U_ASSERT(*pos<kMinValueLead);
176 }
177 }
178 stop();
179 return USTRINGTRIE_NO_MATCH;
180 }
181
182 UStringTrieResult
next(int32_t inByte)183 BytesTrie::next(int32_t inByte) {
184 const uint8_t *pos=pos_;
185 if(pos==NULL) {
186 return USTRINGTRIE_NO_MATCH;
187 }
188 if(inByte<0) {
189 inByte+=0x100;
190 }
191 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
192 if(length>=0) {
193 // Remaining part of a linear-match node.
194 if(inByte==*pos++) {
195 remainingMatchLength_=--length;
196 pos_=pos;
197 int32_t node;
198 return (length<0 && (node=*pos)>=kMinValueLead) ?
199 valueResult(node) : USTRINGTRIE_NO_VALUE;
200 } else {
201 stop();
202 return USTRINGTRIE_NO_MATCH;
203 }
204 }
205 return nextImpl(pos, inByte);
206 }
207
208 UStringTrieResult
next(const char * s,int32_t sLength)209 BytesTrie::next(const char *s, int32_t sLength) {
210 if(sLength<0 ? *s==0 : sLength==0) {
211 // Empty input.
212 return current();
213 }
214 const uint8_t *pos=pos_;
215 if(pos==NULL) {
216 return USTRINGTRIE_NO_MATCH;
217 }
218 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
219 for(;;) {
220 // Fetch the next input byte, if there is one.
221 // Continue a linear-match node without rechecking sLength<0.
222 int32_t inByte;
223 if(sLength<0) {
224 for(;;) {
225 if((inByte=*s++)==0) {
226 remainingMatchLength_=length;
227 pos_=pos;
228 int32_t node;
229 return (length<0 && (node=*pos)>=kMinValueLead) ?
230 valueResult(node) : USTRINGTRIE_NO_VALUE;
231 }
232 if(length<0) {
233 remainingMatchLength_=length;
234 break;
235 }
236 if(inByte!=*pos) {
237 stop();
238 return USTRINGTRIE_NO_MATCH;
239 }
240 ++pos;
241 --length;
242 }
243 } else {
244 for(;;) {
245 if(sLength==0) {
246 remainingMatchLength_=length;
247 pos_=pos;
248 int32_t node;
249 return (length<0 && (node=*pos)>=kMinValueLead) ?
250 valueResult(node) : USTRINGTRIE_NO_VALUE;
251 }
252 inByte=*s++;
253 --sLength;
254 if(length<0) {
255 remainingMatchLength_=length;
256 break;
257 }
258 if(inByte!=*pos) {
259 stop();
260 return USTRINGTRIE_NO_MATCH;
261 }
262 ++pos;
263 --length;
264 }
265 }
266 for(;;) {
267 int32_t node=*pos++;
268 if(node<kMinLinearMatch) {
269 UStringTrieResult result=branchNext(pos, node, inByte);
270 if(result==USTRINGTRIE_NO_MATCH) {
271 return USTRINGTRIE_NO_MATCH;
272 }
273 // Fetch the next input byte, if there is one.
274 if(sLength<0) {
275 if((inByte=*s++)==0) {
276 return result;
277 }
278 } else {
279 if(sLength==0) {
280 return result;
281 }
282 inByte=*s++;
283 --sLength;
284 }
285 if(result==USTRINGTRIE_FINAL_VALUE) {
286 // No further matching bytes.
287 stop();
288 return USTRINGTRIE_NO_MATCH;
289 }
290 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
291 } else if(node<kMinValueLead) {
292 // Match length+1 bytes.
293 length=node-kMinLinearMatch; // Actual match length minus 1.
294 if(inByte!=*pos) {
295 stop();
296 return USTRINGTRIE_NO_MATCH;
297 }
298 ++pos;
299 --length;
300 break;
301 } else if(node&kValueIsFinal) {
302 // No further matching bytes.
303 stop();
304 return USTRINGTRIE_NO_MATCH;
305 } else {
306 // Skip intermediate value.
307 pos=skipValue(pos, node);
308 // The next node must not also be a value node.
309 U_ASSERT(*pos<kMinValueLead);
310 }
311 }
312 }
313 }
314
315 const uint8_t *
findUniqueValueFromBranch(const uint8_t * pos,int32_t length,UBool haveUniqueValue,int32_t & uniqueValue)316 BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
317 UBool haveUniqueValue, int32_t &uniqueValue) {
318 while(length>kMaxBranchLinearSubNodeLength) {
319 ++pos; // ignore the comparison byte
320 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
321 return NULL;
322 }
323 length=length-(length>>1);
324 pos=skipDelta(pos);
325 }
326 do {
327 ++pos; // ignore a comparison byte
328 // handle its value
329 int32_t node=*pos++;
330 UBool isFinal=(UBool)(node&kValueIsFinal);
331 int32_t value=readValue(pos, node>>1);
332 pos=skipValue(pos, node);
333 if(isFinal) {
334 if(haveUniqueValue) {
335 if(value!=uniqueValue) {
336 return NULL;
337 }
338 } else {
339 uniqueValue=value;
340 haveUniqueValue=TRUE;
341 }
342 } else {
343 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
344 return NULL;
345 }
346 haveUniqueValue=TRUE;
347 }
348 } while(--length>1);
349 return pos+1; // ignore the last comparison byte
350 }
351
352 UBool
findUniqueValue(const uint8_t * pos,UBool haveUniqueValue,int32_t & uniqueValue)353 BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
354 for(;;) {
355 int32_t node=*pos++;
356 if(node<kMinLinearMatch) {
357 if(node==0) {
358 node=*pos++;
359 }
360 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
361 if(pos==NULL) {
362 return FALSE;
363 }
364 haveUniqueValue=TRUE;
365 } else if(node<kMinValueLead) {
366 // linear-match node
367 pos+=node-kMinLinearMatch+1; // Ignore the match bytes.
368 } else {
369 UBool isFinal=(UBool)(node&kValueIsFinal);
370 int32_t value=readValue(pos, node>>1);
371 if(haveUniqueValue) {
372 if(value!=uniqueValue) {
373 return FALSE;
374 }
375 } else {
376 uniqueValue=value;
377 haveUniqueValue=TRUE;
378 }
379 if(isFinal) {
380 return TRUE;
381 }
382 pos=skipValue(pos, node);
383 }
384 }
385 }
386
387 int32_t
getNextBytes(ByteSink & out) const388 BytesTrie::getNextBytes(ByteSink &out) const {
389 const uint8_t *pos=pos_;
390 if(pos==NULL) {
391 return 0;
392 }
393 if(remainingMatchLength_>=0) {
394 append(out, *pos); // Next byte of a pending linear-match node.
395 return 1;
396 }
397 int32_t node=*pos++;
398 if(node>=kMinValueLead) {
399 if(node&kValueIsFinal) {
400 return 0;
401 } else {
402 pos=skipValue(pos, node);
403 node=*pos++;
404 U_ASSERT(node<kMinValueLead);
405 }
406 }
407 if(node<kMinLinearMatch) {
408 if(node==0) {
409 node=*pos++;
410 }
411 getNextBranchBytes(pos, ++node, out);
412 return node;
413 } else {
414 // First byte of the linear-match node.
415 append(out, *pos);
416 return 1;
417 }
418 }
419
420 void
getNextBranchBytes(const uint8_t * pos,int32_t length,ByteSink & out)421 BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
422 while(length>kMaxBranchLinearSubNodeLength) {
423 ++pos; // ignore the comparison byte
424 getNextBranchBytes(jumpByDelta(pos), length>>1, out);
425 length=length-(length>>1);
426 pos=skipDelta(pos);
427 }
428 do {
429 append(out, *pos++);
430 pos=skipValue(pos);
431 } while(--length>1);
432 append(out, *pos);
433 }
434
435 void
append(ByteSink & out,int c)436 BytesTrie::append(ByteSink &out, int c) {
437 char ch=(char)c;
438 out.Append(&ch, 1);
439 }
440
441 U_NAMESPACE_END
442