1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: bytestriebuilder.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010sep25
12 * created by: Markus W. Scherer
13 */
14
15 #include "unicode/utypes.h"
16 #include "unicode/bytestrie.h"
17 #include "unicode/bytestriebuilder.h"
18 #include "unicode/stringpiece.h"
19 #include "charstr.h"
20 #include "cmemory.h"
21 #include "uhash.h"
22 #include "uarrsort.h"
23 #include "uassert.h"
24 #include "ustr_imp.h"
25
26 U_NAMESPACE_BEGIN
27
28 /*
29 * Note: This builder implementation stores (bytes, value) pairs with full copies
30 * of the byte sequences, until the BytesTrie is built.
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
32 */
33
34 class BytesTrieElement : public UMemory {
35 public:
36 // Use compiler's default constructor, initializes nothing.
37
38 void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
39
getString(const CharString & strings) const40 StringPiece getString(const CharString &strings) const {
41 int32_t offset=stringOffset;
42 int32_t length;
43 if(offset>=0) {
44 length=(uint8_t)strings[offset++];
45 } else {
46 offset=~offset;
47 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
48 offset+=2;
49 }
50 return StringPiece(strings.data()+offset, length);
51 }
getStringLength(const CharString & strings) const52 int32_t getStringLength(const CharString &strings) const {
53 int32_t offset=stringOffset;
54 if(offset>=0) {
55 return (uint8_t)strings[offset];
56 } else {
57 offset=~offset;
58 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
59 }
60 }
61
charAt(int32_t index,const CharString & strings) const62 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
63
getValue() const64 int32_t getValue() const { return value; }
65
66 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
67
68 private:
data(const CharString & strings) const69 const char *data(const CharString &strings) const {
70 int32_t offset=stringOffset;
71 if(offset>=0) {
72 ++offset;
73 } else {
74 offset=~offset+2;
75 }
76 return strings.data()+offset;
77 }
78
79 // If the stringOffset is non-negative, then the first strings byte contains
80 // the string length.
81 // If the stringOffset is negative, then the first two strings bytes contain
82 // the string length (big-endian), and the offset needs to be bit-inverted.
83 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
84 int32_t stringOffset;
85 int32_t value;
86 };
87
88 void
setTo(const StringPiece & s,int32_t val,CharString & strings,UErrorCode & errorCode)89 BytesTrieElement::setTo(const StringPiece &s, int32_t val,
90 CharString &strings, UErrorCode &errorCode) {
91 if(U_FAILURE(errorCode)) {
92 return;
93 }
94 int32_t length=s.length();
95 if(length>0xffff) {
96 // Too long: We store the length in 1 or 2 bytes.
97 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
98 return;
99 }
100 int32_t offset=strings.length();
101 if(length>0xff) {
102 offset=~offset;
103 strings.append((char)(length>>8), errorCode);
104 }
105 strings.append((char)length, errorCode);
106 stringOffset=offset;
107 value=val;
108 strings.append(s, errorCode);
109 }
110
111 int32_t
compareStringTo(const BytesTrieElement & other,const CharString & strings) const112 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
113 // TODO: add StringPiece::compare(), see ticket #8187
114 StringPiece thisString=getString(strings);
115 StringPiece otherString=other.getString(strings);
116 int32_t lengthDiff=thisString.length()-otherString.length();
117 int32_t commonLength;
118 if(lengthDiff<=0) {
119 commonLength=thisString.length();
120 } else {
121 commonLength=otherString.length();
122 }
123 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
124 return diff!=0 ? diff : lengthDiff;
125 }
126
BytesTrieBuilder(UErrorCode & errorCode)127 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
128 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
129 bytes(NULL), bytesCapacity(0), bytesLength(0) {
130 if(U_FAILURE(errorCode)) {
131 return;
132 }
133 strings=new CharString();
134 if(strings==NULL) {
135 errorCode=U_MEMORY_ALLOCATION_ERROR;
136 }
137 }
138
~BytesTrieBuilder()139 BytesTrieBuilder::~BytesTrieBuilder() {
140 delete strings;
141 delete[] elements;
142 uprv_free(bytes);
143 }
144
145 BytesTrieBuilder &
add(const StringPiece & s,int32_t value,UErrorCode & errorCode)146 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
147 if(U_FAILURE(errorCode)) {
148 return *this;
149 }
150 if(bytesLength>0) {
151 // Cannot add elements after building.
152 errorCode=U_NO_WRITE_PERMISSION;
153 return *this;
154 }
155 if(elementsLength==elementsCapacity) {
156 int32_t newCapacity;
157 if(elementsCapacity==0) {
158 newCapacity=1024;
159 } else {
160 newCapacity=4*elementsCapacity;
161 }
162 BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
163 if(newElements==NULL) {
164 errorCode=U_MEMORY_ALLOCATION_ERROR;
165 return *this; // error instead of dereferencing null
166 }
167 if(elementsLength>0) {
168 uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
169 }
170 delete[] elements;
171 elements=newElements;
172 elementsCapacity=newCapacity;
173 }
174 elements[elementsLength++].setTo(s, value, *strings, errorCode);
175 return *this;
176 }
177
178 U_CDECL_BEGIN
179
180 static int32_t U_CALLCONV
compareElementStrings(const void * context,const void * left,const void * right)181 compareElementStrings(const void *context, const void *left, const void *right) {
182 const CharString *strings=static_cast<const CharString *>(context);
183 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
184 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
185 return leftElement->compareStringTo(*rightElement, *strings);
186 }
187
188 U_CDECL_END
189
190 BytesTrie *
build(UStringTrieBuildOption buildOption,UErrorCode & errorCode)191 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
192 buildBytes(buildOption, errorCode);
193 BytesTrie *newTrie=NULL;
194 if(U_SUCCESS(errorCode)) {
195 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
196 if(newTrie==NULL) {
197 errorCode=U_MEMORY_ALLOCATION_ERROR;
198 } else {
199 bytes=NULL; // The new trie now owns the array.
200 bytesCapacity=0;
201 }
202 }
203 return newTrie;
204 }
205
206 StringPiece
buildStringPiece(UStringTrieBuildOption buildOption,UErrorCode & errorCode)207 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
208 buildBytes(buildOption, errorCode);
209 StringPiece result;
210 if(U_SUCCESS(errorCode)) {
211 result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
212 }
213 return result;
214 }
215
216 void
buildBytes(UStringTrieBuildOption buildOption,UErrorCode & errorCode)217 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
218 if(U_FAILURE(errorCode)) {
219 return;
220 }
221 if(bytes!=NULL && bytesLength>0) {
222 // Already built.
223 return;
224 }
225 if(bytesLength==0) {
226 if(elementsLength==0) {
227 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
228 return;
229 }
230 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
231 compareElementStrings, strings,
232 FALSE, // need not be a stable sort
233 &errorCode);
234 if(U_FAILURE(errorCode)) {
235 return;
236 }
237 // Duplicate strings are not allowed.
238 StringPiece prev=elements[0].getString(*strings);
239 for(int32_t i=1; i<elementsLength; ++i) {
240 StringPiece current=elements[i].getString(*strings);
241 if(prev==current) {
242 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
243 return;
244 }
245 prev=current;
246 }
247 }
248 // Create and byte-serialize the trie for the elements.
249 bytesLength=0;
250 int32_t capacity=strings->length();
251 if(capacity<1024) {
252 capacity=1024;
253 }
254 if(bytesCapacity<capacity) {
255 uprv_free(bytes);
256 bytes=static_cast<char *>(uprv_malloc(capacity));
257 if(bytes==NULL) {
258 errorCode=U_MEMORY_ALLOCATION_ERROR;
259 bytesCapacity=0;
260 return;
261 }
262 bytesCapacity=capacity;
263 }
264 StringTrieBuilder::build(buildOption, elementsLength, errorCode);
265 if(bytes==NULL) {
266 errorCode=U_MEMORY_ALLOCATION_ERROR;
267 }
268 }
269
270 BytesTrieBuilder &
clear()271 BytesTrieBuilder::clear() {
272 strings->clear();
273 elementsLength=0;
274 bytesLength=0;
275 return *this;
276 }
277
278 int32_t
getElementStringLength(int32_t i) const279 BytesTrieBuilder::getElementStringLength(int32_t i) const {
280 return elements[i].getStringLength(*strings);
281 }
282
283 UChar
getElementUnit(int32_t i,int32_t byteIndex) const284 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
285 return (uint8_t)elements[i].charAt(byteIndex, *strings);
286 }
287
288 int32_t
getElementValue(int32_t i) const289 BytesTrieBuilder::getElementValue(int32_t i) const {
290 return elements[i].getValue();
291 }
292
293 int32_t
getLimitOfLinearMatch(int32_t first,int32_t last,int32_t byteIndex) const294 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
295 const BytesTrieElement &firstElement=elements[first];
296 const BytesTrieElement &lastElement=elements[last];
297 int32_t minStringLength=firstElement.getStringLength(*strings);
298 while(++byteIndex<minStringLength &&
299 firstElement.charAt(byteIndex, *strings)==
300 lastElement.charAt(byteIndex, *strings)) {}
301 return byteIndex;
302 }
303
304 int32_t
countElementUnits(int32_t start,int32_t limit,int32_t byteIndex) const305 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
306 int32_t length=0; // Number of different bytes at byteIndex.
307 int32_t i=start;
308 do {
309 char byte=elements[i++].charAt(byteIndex, *strings);
310 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
311 ++i;
312 }
313 ++length;
314 } while(i<limit);
315 return length;
316 }
317
318 int32_t
skipElementsBySomeUnits(int32_t i,int32_t byteIndex,int32_t count) const319 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
320 do {
321 char byte=elements[i++].charAt(byteIndex, *strings);
322 while(byte==elements[i].charAt(byteIndex, *strings)) {
323 ++i;
324 }
325 } while(--count>0);
326 return i;
327 }
328
329 int32_t
indexOfElementWithNextUnit(int32_t i,int32_t byteIndex,UChar byte) const330 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
331 char b=(char)byte;
332 while(b==elements[i].charAt(byteIndex, *strings)) {
333 ++i;
334 }
335 return i;
336 }
337
BTLinearMatchNode(const char * bytes,int32_t len,Node * nextNode)338 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
339 : LinearMatchNode(len, nextNode), s(bytes) {
340 hash=hash*37+ustr_hashCharsN(bytes, len);
341 }
342
343 UBool
operator ==(const Node & other) const344 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
345 if(this==&other) {
346 return TRUE;
347 }
348 if(!LinearMatchNode::operator==(other)) {
349 return FALSE;
350 }
351 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
352 return 0==uprv_memcmp(s, o.s, length);
353 }
354
355 void
write(StringTrieBuilder & builder)356 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
357 BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
358 next->write(builder);
359 b.write(s, length);
360 offset=b.write(b.getMinLinearMatch()+length-1);
361 }
362
363 StringTrieBuilder::Node *
createLinearMatchNode(int32_t i,int32_t byteIndex,int32_t length,Node * nextNode) const364 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
365 Node *nextNode) const {
366 return new BTLinearMatchNode(
367 elements[i].getString(*strings).data()+byteIndex,
368 length,
369 nextNode);
370 }
371
372 UBool
ensureCapacity(int32_t length)373 BytesTrieBuilder::ensureCapacity(int32_t length) {
374 if(bytes==NULL) {
375 return FALSE; // previous memory allocation had failed
376 }
377 if(length>bytesCapacity) {
378 int32_t newCapacity=bytesCapacity;
379 do {
380 newCapacity*=2;
381 } while(newCapacity<=length);
382 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
383 if(newBytes==NULL) {
384 // unable to allocate memory
385 uprv_free(bytes);
386 bytes=NULL;
387 bytesCapacity=0;
388 return FALSE;
389 }
390 uprv_memcpy(newBytes+(newCapacity-bytesLength),
391 bytes+(bytesCapacity-bytesLength), bytesLength);
392 uprv_free(bytes);
393 bytes=newBytes;
394 bytesCapacity=newCapacity;
395 }
396 return TRUE;
397 }
398
399 int32_t
write(int32_t byte)400 BytesTrieBuilder::write(int32_t byte) {
401 int32_t newLength=bytesLength+1;
402 if(ensureCapacity(newLength)) {
403 bytesLength=newLength;
404 bytes[bytesCapacity-bytesLength]=(char)byte;
405 }
406 return bytesLength;
407 }
408
409 int32_t
write(const char * b,int32_t length)410 BytesTrieBuilder::write(const char *b, int32_t length) {
411 int32_t newLength=bytesLength+length;
412 if(ensureCapacity(newLength)) {
413 bytesLength=newLength;
414 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
415 }
416 return bytesLength;
417 }
418
419 int32_t
writeElementUnits(int32_t i,int32_t byteIndex,int32_t length)420 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
421 return write(elements[i].getString(*strings).data()+byteIndex, length);
422 }
423
424 int32_t
writeValueAndFinal(int32_t i,UBool isFinal)425 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
426 if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
427 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
428 }
429 char intBytes[5];
430 int32_t length=1;
431 if(i<0 || i>0xffffff) {
432 intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
433 intBytes[1]=(char)((uint32_t)i>>24);
434 intBytes[2]=(char)((uint32_t)i>>16);
435 intBytes[3]=(char)((uint32_t)i>>8);
436 intBytes[4]=(char)i;
437 length=5;
438 // } else if(i<=BytesTrie::kMaxOneByteValue) {
439 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
440 } else {
441 if(i<=BytesTrie::kMaxTwoByteValue) {
442 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
443 } else {
444 if(i<=BytesTrie::kMaxThreeByteValue) {
445 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
446 } else {
447 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
448 intBytes[1]=(char)(i>>16);
449 length=2;
450 }
451 intBytes[length++]=(char)(i>>8);
452 }
453 intBytes[length++]=(char)i;
454 }
455 intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
456 return write(intBytes, length);
457 }
458
459 int32_t
writeValueAndType(UBool hasValue,int32_t value,int32_t node)460 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
461 int32_t offset=write(node);
462 if(hasValue) {
463 offset=writeValueAndFinal(value, FALSE);
464 }
465 return offset;
466 }
467
468 int32_t
writeDeltaTo(int32_t jumpTarget)469 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
470 int32_t i=bytesLength-jumpTarget;
471 U_ASSERT(i>=0);
472 if(i<=BytesTrie::kMaxOneByteDelta) {
473 return write(i);
474 }
475 char intBytes[5];
476 int32_t length;
477 if(i<=BytesTrie::kMaxTwoByteDelta) {
478 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
479 length=1;
480 } else {
481 if(i<=BytesTrie::kMaxThreeByteDelta) {
482 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
483 length=2;
484 } else {
485 if(i<=0xffffff) {
486 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
487 length=3;
488 } else {
489 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
490 intBytes[1]=(char)(i>>24);
491 length=4;
492 }
493 intBytes[1]=(char)(i>>16);
494 }
495 intBytes[1]=(char)(i>>8);
496 }
497 intBytes[length++]=(char)i;
498 return write(intBytes, length);
499 }
500
501 U_NAMESPACE_END
502