1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 2001-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 *
9 * File ucoleitr.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 02/15/2001 synwee Modified all methods to process its own function
15 * instead of calling the equivalent c++ api (coleitr.h)
16 * 2012-2014 markus Rewritten in C++ again.
17 ******************************************************************************/
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_COLLATION
22
23 #include "unicode/coleitr.h"
24 #include "unicode/tblcoll.h"
25 #include "unicode/ucoleitr.h"
26 #include "unicode/ustring.h"
27 #include "unicode/sortkey.h"
28 #include "unicode/uobject.h"
29 #include "cmemory.h"
30 #include "usrchimp.h"
31
32 U_NAMESPACE_USE
33
34 #define BUFFER_LENGTH 100
35
36 #define DEFAULT_BUFFER_SIZE 16
37 #define BUFFER_GROW 8
38
39 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
40
41 #define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
42
43 #define DELETE_ARRAY(array) uprv_free((void *) (array))
44
45 struct RCEI
46 {
47 uint32_t ce;
48 int32_t low;
49 int32_t high;
50 };
51
52 U_NAMESPACE_BEGIN
53
54 struct RCEBuffer
55 {
56 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
57 RCEI *buffer;
58 int32_t bufferIndex;
59 int32_t bufferSize;
60
61 RCEBuffer();
62 ~RCEBuffer();
63
64 UBool isEmpty() const;
65 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
66 const RCEI *get();
67 };
68
RCEBuffer()69 RCEBuffer::RCEBuffer()
70 {
71 buffer = defaultBuffer;
72 bufferIndex = 0;
73 bufferSize = UPRV_LENGTHOF(defaultBuffer);
74 }
75
~RCEBuffer()76 RCEBuffer::~RCEBuffer()
77 {
78 if (buffer != defaultBuffer) {
79 DELETE_ARRAY(buffer);
80 }
81 }
82
isEmpty() const83 UBool RCEBuffer::isEmpty() const
84 {
85 return bufferIndex <= 0;
86 }
87
put(uint32_t ce,int32_t ixLow,int32_t ixHigh,UErrorCode & errorCode)88 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
89 {
90 if (U_FAILURE(errorCode)) {
91 return;
92 }
93 if (bufferIndex >= bufferSize) {
94 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
95 if (newBuffer == NULL) {
96 errorCode = U_MEMORY_ALLOCATION_ERROR;
97 return;
98 }
99
100 ARRAY_COPY(newBuffer, buffer, bufferSize);
101
102 if (buffer != defaultBuffer) {
103 DELETE_ARRAY(buffer);
104 }
105
106 buffer = newBuffer;
107 bufferSize += BUFFER_GROW;
108 }
109
110 buffer[bufferIndex].ce = ce;
111 buffer[bufferIndex].low = ixLow;
112 buffer[bufferIndex].high = ixHigh;
113
114 bufferIndex += 1;
115 }
116
get()117 const RCEI *RCEBuffer::get()
118 {
119 if (bufferIndex > 0) {
120 return &buffer[--bufferIndex];
121 }
122
123 return NULL;
124 }
125
PCEBuffer()126 PCEBuffer::PCEBuffer()
127 {
128 buffer = defaultBuffer;
129 bufferIndex = 0;
130 bufferSize = UPRV_LENGTHOF(defaultBuffer);
131 }
132
~PCEBuffer()133 PCEBuffer::~PCEBuffer()
134 {
135 if (buffer != defaultBuffer) {
136 DELETE_ARRAY(buffer);
137 }
138 }
139
reset()140 void PCEBuffer::reset()
141 {
142 bufferIndex = 0;
143 }
144
isEmpty() const145 UBool PCEBuffer::isEmpty() const
146 {
147 return bufferIndex <= 0;
148 }
149
put(uint64_t ce,int32_t ixLow,int32_t ixHigh,UErrorCode & errorCode)150 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
151 {
152 if (U_FAILURE(errorCode)) {
153 return;
154 }
155 if (bufferIndex >= bufferSize) {
156 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
157 if (newBuffer == NULL) {
158 errorCode = U_MEMORY_ALLOCATION_ERROR;
159 return;
160 }
161
162 ARRAY_COPY(newBuffer, buffer, bufferSize);
163
164 if (buffer != defaultBuffer) {
165 DELETE_ARRAY(buffer);
166 }
167
168 buffer = newBuffer;
169 bufferSize += BUFFER_GROW;
170 }
171
172 buffer[bufferIndex].ce = ce;
173 buffer[bufferIndex].low = ixLow;
174 buffer[bufferIndex].high = ixHigh;
175
176 bufferIndex += 1;
177 }
178
get()179 const PCEI *PCEBuffer::get()
180 {
181 if (bufferIndex > 0) {
182 return &buffer[--bufferIndex];
183 }
184
185 return NULL;
186 }
187
UCollationPCE(UCollationElements * elems)188 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
189
UCollationPCE(CollationElementIterator * iter)190 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
191
init(UCollationElements * elems)192 void UCollationPCE::init(UCollationElements *elems) {
193 init(CollationElementIterator::fromUCollationElements(elems));
194 }
195
init(CollationElementIterator * iter)196 void UCollationPCE::init(CollationElementIterator *iter)
197 {
198 cei = iter;
199 init(*iter->rbc_);
200 }
201
init(const Collator & coll)202 void UCollationPCE::init(const Collator &coll)
203 {
204 UErrorCode status = U_ZERO_ERROR;
205
206 strength = coll.getAttribute(UCOL_STRENGTH, status);
207 toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
208 isShifted = FALSE;
209 variableTop = coll.getVariableTop(status);
210 }
211
~UCollationPCE()212 UCollationPCE::~UCollationPCE()
213 {
214 // nothing to do
215 }
216
processCE(uint32_t ce)217 uint64_t UCollationPCE::processCE(uint32_t ce)
218 {
219 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
220
221 // This is clean, but somewhat slow...
222 // We could apply the mask to ce and then
223 // just get all three orders...
224 switch(strength) {
225 default:
226 tertiary = ucol_tertiaryOrder(ce);
227 U_FALLTHROUGH;
228
229 case UCOL_SECONDARY:
230 secondary = ucol_secondaryOrder(ce);
231 U_FALLTHROUGH;
232
233 case UCOL_PRIMARY:
234 primary = ucol_primaryOrder(ce);
235 }
236
237 // **** This should probably handle continuations too. ****
238 // **** That means that we need 24 bits for the primary ****
239 // **** instead of the 16 that we're currently using. ****
240 // **** So we can lay out the 64 bits as: 24.12.12.16. ****
241 // **** Another complication with continuations is that ****
242 // **** the *second* CE is marked as a continuation, so ****
243 // **** we always have to peek ahead to know how long ****
244 // **** the primary is... ****
245 if ((toShift && variableTop > ce && primary != 0)
246 || (isShifted && primary == 0)) {
247
248 if (primary == 0) {
249 return UCOL_IGNORABLE;
250 }
251
252 if (strength >= UCOL_QUATERNARY) {
253 quaternary = primary;
254 }
255
256 primary = secondary = tertiary = 0;
257 isShifted = TRUE;
258 } else {
259 if (strength >= UCOL_QUATERNARY) {
260 quaternary = 0xFFFF;
261 }
262
263 isShifted = FALSE;
264 }
265
266 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
267 }
268
269 U_NAMESPACE_END
270
271 /* public methods ---------------------------------------------------- */
272
273 U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator * coll,const UChar * text,int32_t textLength,UErrorCode * status)274 ucol_openElements(const UCollator *coll,
275 const UChar *text,
276 int32_t textLength,
277 UErrorCode *status)
278 {
279 if (U_FAILURE(*status)) {
280 return NULL;
281 }
282 if (coll == NULL || (text == NULL && textLength != 0)) {
283 *status = U_ILLEGAL_ARGUMENT_ERROR;
284 return NULL;
285 }
286 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
287 if (rbc == NULL) {
288 *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator
289 return NULL;
290 }
291
292 UnicodeString s((UBool)(textLength < 0), text, textLength);
293 CollationElementIterator *cei = rbc->createCollationElementIterator(s);
294 if (cei == NULL) {
295 *status = U_MEMORY_ALLOCATION_ERROR;
296 return NULL;
297 }
298
299 return cei->toUCollationElements();
300 }
301
302
303 U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements * elems)304 ucol_closeElements(UCollationElements *elems)
305 {
306 delete CollationElementIterator::fromUCollationElements(elems);
307 }
308
309 U_CAPI void U_EXPORT2
ucol_reset(UCollationElements * elems)310 ucol_reset(UCollationElements *elems)
311 {
312 CollationElementIterator::fromUCollationElements(elems)->reset();
313 }
314
315 U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements * elems,UErrorCode * status)316 ucol_next(UCollationElements *elems,
317 UErrorCode *status)
318 {
319 if (U_FAILURE(*status)) {
320 return UCOL_NULLORDER;
321 }
322
323 return CollationElementIterator::fromUCollationElements(elems)->next(*status);
324 }
325
326 U_NAMESPACE_BEGIN
327
328 int64_t
nextProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)329 UCollationPCE::nextProcessed(
330 int32_t *ixLow,
331 int32_t *ixHigh,
332 UErrorCode *status)
333 {
334 int64_t result = UCOL_IGNORABLE;
335 uint32_t low = 0, high = 0;
336
337 if (U_FAILURE(*status)) {
338 return UCOL_PROCESSED_NULLORDER;
339 }
340
341 pceBuffer.reset();
342
343 do {
344 low = cei->getOffset();
345 int32_t ce = cei->next(*status);
346 high = cei->getOffset();
347
348 if (ce == UCOL_NULLORDER) {
349 result = UCOL_PROCESSED_NULLORDER;
350 break;
351 }
352
353 result = processCE((uint32_t)ce);
354 } while (result == UCOL_IGNORABLE);
355
356 if (ixLow != NULL) {
357 *ixLow = low;
358 }
359
360 if (ixHigh != NULL) {
361 *ixHigh = high;
362 }
363
364 return result;
365 }
366
367 U_NAMESPACE_END
368
369 U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements * elems,UErrorCode * status)370 ucol_previous(UCollationElements *elems,
371 UErrorCode *status)
372 {
373 if(U_FAILURE(*status)) {
374 return UCOL_NULLORDER;
375 }
376 return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
377 }
378
379 U_NAMESPACE_BEGIN
380
381 int64_t
previousProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)382 UCollationPCE::previousProcessed(
383 int32_t *ixLow,
384 int32_t *ixHigh,
385 UErrorCode *status)
386 {
387 int64_t result = UCOL_IGNORABLE;
388 int32_t low = 0, high = 0;
389
390 if (U_FAILURE(*status)) {
391 return UCOL_PROCESSED_NULLORDER;
392 }
393
394 // pceBuffer.reset();
395
396 while (pceBuffer.isEmpty()) {
397 // buffer raw CEs up to non-ignorable primary
398 RCEBuffer rceb;
399 int32_t ce;
400
401 // **** do we need to reset rceb, or will it always be empty at this point ****
402 do {
403 high = cei->getOffset();
404 ce = cei->previous(*status);
405 low = cei->getOffset();
406
407 if (ce == UCOL_NULLORDER) {
408 if (!rceb.isEmpty()) {
409 break;
410 }
411
412 goto finish;
413 }
414
415 rceb.put((uint32_t)ce, low, high, *status);
416 } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));
417
418 // process the raw CEs
419 while (U_SUCCESS(*status) && !rceb.isEmpty()) {
420 const RCEI *rcei = rceb.get();
421
422 result = processCE(rcei->ce);
423
424 if (result != UCOL_IGNORABLE) {
425 pceBuffer.put(result, rcei->low, rcei->high, *status);
426 }
427 }
428 if (U_FAILURE(*status)) {
429 return UCOL_PROCESSED_NULLORDER;
430 }
431 }
432
433 finish:
434 if (pceBuffer.isEmpty()) {
435 // **** Is -1 the right value for ixLow, ixHigh? ****
436 if (ixLow != NULL) {
437 *ixLow = -1;
438 }
439
440 if (ixHigh != NULL) {
441 *ixHigh = -1
442 ;
443 }
444 return UCOL_PROCESSED_NULLORDER;
445 }
446
447 const PCEI *pcei = pceBuffer.get();
448
449 if (ixLow != NULL) {
450 *ixLow = pcei->low;
451 }
452
453 if (ixHigh != NULL) {
454 *ixHigh = pcei->high;
455 }
456
457 return pcei->ce;
458 }
459
460 U_NAMESPACE_END
461
462 U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements * elems,int32_t order)463 ucol_getMaxExpansion(const UCollationElements *elems,
464 int32_t order)
465 {
466 return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
467
468 // TODO: The old code masked the order according to strength and then did a binary search.
469 // However this was probably at least partially broken because of the following comment.
470 // Still, it might have found a match when this version may not.
471
472 // FIXME: with a masked search, there might be more than one hit,
473 // so we need to look forward and backward from the match to find all
474 // of the hits...
475 }
476
477 U_CAPI void U_EXPORT2
ucol_setText(UCollationElements * elems,const UChar * text,int32_t textLength,UErrorCode * status)478 ucol_setText( UCollationElements *elems,
479 const UChar *text,
480 int32_t textLength,
481 UErrorCode *status)
482 {
483 if (U_FAILURE(*status)) {
484 return;
485 }
486
487 if ((text == NULL && textLength != 0)) {
488 *status = U_ILLEGAL_ARGUMENT_ERROR;
489 return;
490 }
491 UnicodeString s((UBool)(textLength < 0), text, textLength);
492 return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
493 }
494
495 U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements * elems)496 ucol_getOffset(const UCollationElements *elems)
497 {
498 return CollationElementIterator::fromUCollationElements(elems)->getOffset();
499 }
500
501 U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements * elems,int32_t offset,UErrorCode * status)502 ucol_setOffset(UCollationElements *elems,
503 int32_t offset,
504 UErrorCode *status)
505 {
506 if (U_FAILURE(*status)) {
507 return;
508 }
509
510 CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
511 }
512
513 U_CAPI int32_t U_EXPORT2
ucol_primaryOrder(int32_t order)514 ucol_primaryOrder (int32_t order)
515 {
516 return (order >> 16) & 0xffff;
517 }
518
519 U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder(int32_t order)520 ucol_secondaryOrder (int32_t order)
521 {
522 return (order >> 8) & 0xff;
523 }
524
525 U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder(int32_t order)526 ucol_tertiaryOrder (int32_t order)
527 {
528 return order & 0xff;
529 }
530
531 #endif /* #if !UCONFIG_NO_COLLATION */
532