1 /*
2 ******************************************************************************
3 * Copyright (C) 2001-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File ucoleitr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 02/15/2001 synwee Modified all methods to process its own function
13 * instead of calling the equivalent c++ api (coleitr.h)
14 * 2012-2014 markus Rewritten in C++ again.
15 ******************************************************************************/
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_COLLATION
20
21 #include "unicode/coleitr.h"
22 #include "unicode/tblcoll.h"
23 #include "unicode/ucoleitr.h"
24 #include "unicode/ustring.h"
25 #include "unicode/sortkey.h"
26 #include "unicode/uobject.h"
27 #include "cmemory.h"
28 #include "usrchimp.h"
29
30 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
31
32 U_NAMESPACE_USE
33
34 #define BUFFER_LENGTH 100
35
36 #define DEFAULT_BUFFER_SIZE 16
37 #define BUFFER_GROW 8
38
39 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
40
41 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
42
43 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
44
45 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
46
47 #define DELETE_ARRAY(array) uprv_free((void *) (array))
48
49 struct RCEI
50 {
51 uint32_t ce;
52 int32_t low;
53 int32_t high;
54 };
55
56 U_NAMESPACE_BEGIN
57
58 struct RCEBuffer
59 {
60 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
61 RCEI *buffer;
62 int32_t bufferIndex;
63 int32_t bufferSize;
64
65 RCEBuffer();
66 ~RCEBuffer();
67
68 UBool empty() const;
69 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
70 const RCEI *get();
71 };
72
RCEBuffer()73 RCEBuffer::RCEBuffer()
74 {
75 buffer = defaultBuffer;
76 bufferIndex = 0;
77 bufferSize = LENGTHOF(defaultBuffer);
78 }
79
~RCEBuffer()80 RCEBuffer::~RCEBuffer()
81 {
82 if (buffer != defaultBuffer) {
83 DELETE_ARRAY(buffer);
84 }
85 }
86
empty() const87 UBool RCEBuffer::empty() const
88 {
89 return bufferIndex <= 0;
90 }
91
put(uint32_t ce,int32_t ixLow,int32_t ixHigh)92 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
93 {
94 if (bufferIndex >= bufferSize) {
95 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
96
97 ARRAY_COPY(newBuffer, buffer, bufferSize);
98
99 if (buffer != defaultBuffer) {
100 DELETE_ARRAY(buffer);
101 }
102
103 buffer = newBuffer;
104 bufferSize += BUFFER_GROW;
105 }
106
107 buffer[bufferIndex].ce = ce;
108 buffer[bufferIndex].low = ixLow;
109 buffer[bufferIndex].high = ixHigh;
110
111 bufferIndex += 1;
112 }
113
get()114 const RCEI *RCEBuffer::get()
115 {
116 if (bufferIndex > 0) {
117 return &buffer[--bufferIndex];
118 }
119
120 return NULL;
121 }
122
PCEBuffer()123 PCEBuffer::PCEBuffer()
124 {
125 buffer = defaultBuffer;
126 bufferIndex = 0;
127 bufferSize = LENGTHOF(defaultBuffer);
128 }
129
~PCEBuffer()130 PCEBuffer::~PCEBuffer()
131 {
132 if (buffer != defaultBuffer) {
133 DELETE_ARRAY(buffer);
134 }
135 }
136
reset()137 void PCEBuffer::reset()
138 {
139 bufferIndex = 0;
140 }
141
empty() const142 UBool PCEBuffer::empty() const
143 {
144 return bufferIndex <= 0;
145 }
146
put(uint64_t ce,int32_t ixLow,int32_t ixHigh)147 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
148 {
149 if (bufferIndex >= bufferSize) {
150 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
151
152 ARRAY_COPY(newBuffer, buffer, bufferSize);
153
154 if (buffer != defaultBuffer) {
155 DELETE_ARRAY(buffer);
156 }
157
158 buffer = newBuffer;
159 bufferSize += BUFFER_GROW;
160 }
161
162 buffer[bufferIndex].ce = ce;
163 buffer[bufferIndex].low = ixLow;
164 buffer[bufferIndex].high = ixHigh;
165
166 bufferIndex += 1;
167 }
168
get()169 const PCEI *PCEBuffer::get()
170 {
171 if (bufferIndex > 0) {
172 return &buffer[--bufferIndex];
173 }
174
175 return NULL;
176 }
177
UCollationPCE(UCollationElements * elems)178 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
179
UCollationPCE(CollationElementIterator * iter)180 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
181
init(UCollationElements * elems)182 void UCollationPCE::init(UCollationElements *elems) {
183 init(CollationElementIterator::fromUCollationElements(elems));
184 }
185
init(CollationElementIterator * iter)186 void UCollationPCE::init(CollationElementIterator *iter)
187 {
188 cei = iter;
189 init(*iter->rbc_);
190 }
191
init(const Collator & coll)192 void UCollationPCE::init(const Collator &coll)
193 {
194 UErrorCode status = U_ZERO_ERROR;
195
196 strength = coll.getAttribute(UCOL_STRENGTH, status);
197 toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
198 isShifted = FALSE;
199 variableTop = coll.getVariableTop(status);
200 }
201
~UCollationPCE()202 UCollationPCE::~UCollationPCE()
203 {
204 // nothing to do
205 }
206
processCE(uint32_t ce)207 uint64_t UCollationPCE::processCE(uint32_t ce)
208 {
209 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
210
211 // This is clean, but somewhat slow...
212 // We could apply the mask to ce and then
213 // just get all three orders...
214 switch(strength) {
215 default:
216 tertiary = ucol_tertiaryOrder(ce);
217 /* note fall-through */
218
219 case UCOL_SECONDARY:
220 secondary = ucol_secondaryOrder(ce);
221 /* note fall-through */
222
223 case UCOL_PRIMARY:
224 primary = ucol_primaryOrder(ce);
225 }
226
227 // **** This should probably handle continuations too. ****
228 // **** That means that we need 24 bits for the primary ****
229 // **** instead of the 16 that we're currently using. ****
230 // **** So we can lay out the 64 bits as: 24.12.12.16. ****
231 // **** Another complication with continuations is that ****
232 // **** the *second* CE is marked as a continuation, so ****
233 // **** we always have to peek ahead to know how long ****
234 // **** the primary is... ****
235 if ((toShift && variableTop > ce && primary != 0)
236 || (isShifted && primary == 0)) {
237
238 if (primary == 0) {
239 return UCOL_IGNORABLE;
240 }
241
242 if (strength >= UCOL_QUATERNARY) {
243 quaternary = primary;
244 }
245
246 primary = secondary = tertiary = 0;
247 isShifted = TRUE;
248 } else {
249 if (strength >= UCOL_QUATERNARY) {
250 quaternary = 0xFFFF;
251 }
252
253 isShifted = FALSE;
254 }
255
256 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
257 }
258
259 U_NAMESPACE_END
260
261 /* public methods ---------------------------------------------------- */
262
263 U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator * coll,const UChar * text,int32_t textLength,UErrorCode * status)264 ucol_openElements(const UCollator *coll,
265 const UChar *text,
266 int32_t textLength,
267 UErrorCode *status)
268 {
269 if (U_FAILURE(*status)) {
270 return NULL;
271 }
272 if (coll == NULL || (text == NULL && textLength != 0)) {
273 *status = U_ILLEGAL_ARGUMENT_ERROR;
274 return NULL;
275 }
276 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
277 if (rbc == NULL) {
278 *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator
279 return NULL;
280 }
281
282 UnicodeString s((UBool)(textLength < 0), text, textLength);
283 CollationElementIterator *cei = rbc->createCollationElementIterator(s);
284 if (cei == NULL) {
285 *status = U_MEMORY_ALLOCATION_ERROR;
286 return NULL;
287 }
288
289 return cei->toUCollationElements();
290 }
291
292
293 U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements * elems)294 ucol_closeElements(UCollationElements *elems)
295 {
296 delete CollationElementIterator::fromUCollationElements(elems);
297 }
298
299 U_CAPI void U_EXPORT2
ucol_reset(UCollationElements * elems)300 ucol_reset(UCollationElements *elems)
301 {
302 CollationElementIterator::fromUCollationElements(elems)->reset();
303 }
304
305 U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements * elems,UErrorCode * status)306 ucol_next(UCollationElements *elems,
307 UErrorCode *status)
308 {
309 if (U_FAILURE(*status)) {
310 return UCOL_NULLORDER;
311 }
312
313 return CollationElementIterator::fromUCollationElements(elems)->next(*status);
314 }
315
316 U_NAMESPACE_BEGIN
317
318 int64_t
nextProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)319 UCollationPCE::nextProcessed(
320 int32_t *ixLow,
321 int32_t *ixHigh,
322 UErrorCode *status)
323 {
324 int64_t result = UCOL_IGNORABLE;
325 uint32_t low = 0, high = 0;
326
327 if (U_FAILURE(*status)) {
328 return UCOL_PROCESSED_NULLORDER;
329 }
330
331 pceBuffer.reset();
332
333 do {
334 low = cei->getOffset();
335 int32_t ce = cei->next(*status);
336 high = cei->getOffset();
337
338 if (ce == UCOL_NULLORDER) {
339 result = UCOL_PROCESSED_NULLORDER;
340 break;
341 }
342
343 result = processCE((uint32_t)ce);
344 } while (result == UCOL_IGNORABLE);
345
346 if (ixLow != NULL) {
347 *ixLow = low;
348 }
349
350 if (ixHigh != NULL) {
351 *ixHigh = high;
352 }
353
354 return result;
355 }
356
357 U_NAMESPACE_END
358
359 U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements * elems,UErrorCode * status)360 ucol_previous(UCollationElements *elems,
361 UErrorCode *status)
362 {
363 if(U_FAILURE(*status)) {
364 return UCOL_NULLORDER;
365 }
366 return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
367 }
368
369 U_NAMESPACE_BEGIN
370
371 int64_t
previousProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)372 UCollationPCE::previousProcessed(
373 int32_t *ixLow,
374 int32_t *ixHigh,
375 UErrorCode *status)
376 {
377 int64_t result = UCOL_IGNORABLE;
378 int32_t low = 0, high = 0;
379
380 if (U_FAILURE(*status)) {
381 return UCOL_PROCESSED_NULLORDER;
382 }
383
384 // pceBuffer.reset();
385
386 while (pceBuffer.empty()) {
387 // buffer raw CEs up to non-ignorable primary
388 RCEBuffer rceb;
389 int32_t ce;
390
391 // **** do we need to reset rceb, or will it always be empty at this point ****
392 do {
393 high = cei->getOffset();
394 ce = cei->previous(*status);
395 low = cei->getOffset();
396
397 if (ce == UCOL_NULLORDER) {
398 if (! rceb.empty()) {
399 break;
400 }
401
402 goto finish;
403 }
404
405 rceb.put((uint32_t)ce, low, high);
406 } while ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce));
407
408 // process the raw CEs
409 while (! rceb.empty()) {
410 const RCEI *rcei = rceb.get();
411
412 result = processCE(rcei->ce);
413
414 if (result != UCOL_IGNORABLE) {
415 pceBuffer.put(result, rcei->low, rcei->high);
416 }
417 }
418 }
419
420 finish:
421 if (pceBuffer.empty()) {
422 // **** Is -1 the right value for ixLow, ixHigh? ****
423 if (ixLow != NULL) {
424 *ixLow = -1;
425 }
426
427 if (ixHigh != NULL) {
428 *ixHigh = -1
429 ;
430 }
431 return UCOL_PROCESSED_NULLORDER;
432 }
433
434 const PCEI *pcei = pceBuffer.get();
435
436 if (ixLow != NULL) {
437 *ixLow = pcei->low;
438 }
439
440 if (ixHigh != NULL) {
441 *ixHigh = pcei->high;
442 }
443
444 return pcei->ce;
445 }
446
447 U_NAMESPACE_END
448
449 U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements * elems,int32_t order)450 ucol_getMaxExpansion(const UCollationElements *elems,
451 int32_t order)
452 {
453 return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
454
455 // TODO: The old code masked the order according to strength and then did a binary search.
456 // However this was probably at least partially broken because of the following comment.
457 // Still, it might have found a match when this version may not.
458
459 // FIXME: with a masked search, there might be more than one hit,
460 // so we need to look forward and backward from the match to find all
461 // of the hits...
462 }
463
464 U_CAPI void U_EXPORT2
ucol_setText(UCollationElements * elems,const UChar * text,int32_t textLength,UErrorCode * status)465 ucol_setText( UCollationElements *elems,
466 const UChar *text,
467 int32_t textLength,
468 UErrorCode *status)
469 {
470 if (U_FAILURE(*status)) {
471 return;
472 }
473
474 if ((text == NULL && textLength != 0)) {
475 *status = U_ILLEGAL_ARGUMENT_ERROR;
476 return;
477 }
478 UnicodeString s((UBool)(textLength < 0), text, textLength);
479 return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
480 }
481
482 U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements * elems)483 ucol_getOffset(const UCollationElements *elems)
484 {
485 return CollationElementIterator::fromUCollationElements(elems)->getOffset();
486 }
487
488 U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements * elems,int32_t offset,UErrorCode * status)489 ucol_setOffset(UCollationElements *elems,
490 int32_t offset,
491 UErrorCode *status)
492 {
493 if (U_FAILURE(*status)) {
494 return;
495 }
496
497 CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
498 }
499
500 U_CAPI int32_t U_EXPORT2
ucol_primaryOrder(int32_t order)501 ucol_primaryOrder (int32_t order)
502 {
503 return (order >> 16) & 0xffff;
504 }
505
506 U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder(int32_t order)507 ucol_secondaryOrder (int32_t order)
508 {
509 return (order >> 8) & 0xff;
510 }
511
512 U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder(int32_t order)513 ucol_tertiaryOrder (int32_t order)
514 {
515 return order & 0xff;
516 }
517
518 #endif /* #if !UCONFIG_NO_COLLATION */
519