• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *   Copyright (C) 2001-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 ******************************************************************************
6 *
7 * File ucoleitr.cpp
8 *
9 * Modification History:
10 *
11 * Date        Name        Description
12 * 02/15/2001  synwee      Modified all methods to process its own function
13 *                         instead of calling the equivalent c++ api (coleitr.h)
14 * 2012-2014   markus      Rewritten in C++ again.
15 ******************************************************************************/
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/coleitr.h"
22 #include "unicode/tblcoll.h"
23 #include "unicode/ucoleitr.h"
24 #include "unicode/ustring.h"
25 #include "unicode/sortkey.h"
26 #include "unicode/uobject.h"
27 #include "cmemory.h"
28 #include "usrchimp.h"
29 
30 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
31 
32 U_NAMESPACE_USE
33 
34 #define BUFFER_LENGTH             100
35 
36 #define DEFAULT_BUFFER_SIZE 16
37 #define BUFFER_GROW 8
38 
39 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
40 
41 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
42 
43 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
44 
45 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
46 
47 #define DELETE_ARRAY(array) uprv_free((void *) (array))
48 
49 struct RCEI
50 {
51     uint32_t ce;
52     int32_t  low;
53     int32_t  high;
54 };
55 
56 U_NAMESPACE_BEGIN
57 
58 struct RCEBuffer
59 {
60     RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
61     RCEI   *buffer;
62     int32_t bufferIndex;
63     int32_t bufferSize;
64 
65     RCEBuffer();
66     ~RCEBuffer();
67 
68     UBool empty() const;
69     void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
70     const RCEI *get();
71 };
72 
RCEBuffer()73 RCEBuffer::RCEBuffer()
74 {
75     buffer = defaultBuffer;
76     bufferIndex = 0;
77     bufferSize = LENGTHOF(defaultBuffer);
78 }
79 
~RCEBuffer()80 RCEBuffer::~RCEBuffer()
81 {
82     if (buffer != defaultBuffer) {
83         DELETE_ARRAY(buffer);
84     }
85 }
86 
empty() const87 UBool RCEBuffer::empty() const
88 {
89     return bufferIndex <= 0;
90 }
91 
put(uint32_t ce,int32_t ixLow,int32_t ixHigh)92 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
93 {
94     if (bufferIndex >= bufferSize) {
95         RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
96 
97         ARRAY_COPY(newBuffer, buffer, bufferSize);
98 
99         if (buffer != defaultBuffer) {
100             DELETE_ARRAY(buffer);
101         }
102 
103         buffer = newBuffer;
104         bufferSize += BUFFER_GROW;
105     }
106 
107     buffer[bufferIndex].ce   = ce;
108     buffer[bufferIndex].low  = ixLow;
109     buffer[bufferIndex].high = ixHigh;
110 
111     bufferIndex += 1;
112 }
113 
get()114 const RCEI *RCEBuffer::get()
115 {
116     if (bufferIndex > 0) {
117      return &buffer[--bufferIndex];
118     }
119 
120     return NULL;
121 }
122 
PCEBuffer()123 PCEBuffer::PCEBuffer()
124 {
125     buffer = defaultBuffer;
126     bufferIndex = 0;
127     bufferSize = LENGTHOF(defaultBuffer);
128 }
129 
~PCEBuffer()130 PCEBuffer::~PCEBuffer()
131 {
132     if (buffer != defaultBuffer) {
133         DELETE_ARRAY(buffer);
134     }
135 }
136 
reset()137 void PCEBuffer::reset()
138 {
139     bufferIndex = 0;
140 }
141 
empty() const142 UBool PCEBuffer::empty() const
143 {
144     return bufferIndex <= 0;
145 }
146 
put(uint64_t ce,int32_t ixLow,int32_t ixHigh)147 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
148 {
149     if (bufferIndex >= bufferSize) {
150         PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
151 
152         ARRAY_COPY(newBuffer, buffer, bufferSize);
153 
154         if (buffer != defaultBuffer) {
155             DELETE_ARRAY(buffer);
156         }
157 
158         buffer = newBuffer;
159         bufferSize += BUFFER_GROW;
160     }
161 
162     buffer[bufferIndex].ce   = ce;
163     buffer[bufferIndex].low  = ixLow;
164     buffer[bufferIndex].high = ixHigh;
165 
166     bufferIndex += 1;
167 }
168 
get()169 const PCEI *PCEBuffer::get()
170 {
171     if (bufferIndex > 0) {
172      return &buffer[--bufferIndex];
173     }
174 
175     return NULL;
176 }
177 
UCollationPCE(UCollationElements * elems)178 UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
179 
UCollationPCE(CollationElementIterator * iter)180 UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
181 
init(UCollationElements * elems)182 void UCollationPCE::init(UCollationElements *elems) {
183     init(CollationElementIterator::fromUCollationElements(elems));
184 }
185 
init(CollationElementIterator * iter)186 void UCollationPCE::init(CollationElementIterator *iter)
187 {
188     cei = iter;
189     init(*iter->rbc_);
190 }
191 
init(const Collator & coll)192 void UCollationPCE::init(const Collator &coll)
193 {
194     UErrorCode status = U_ZERO_ERROR;
195 
196     strength    = coll.getAttribute(UCOL_STRENGTH, status);
197     toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
198     isShifted   = FALSE;
199     variableTop = coll.getVariableTop(status);
200 }
201 
~UCollationPCE()202 UCollationPCE::~UCollationPCE()
203 {
204     // nothing to do
205 }
206 
processCE(uint32_t ce)207 uint64_t UCollationPCE::processCE(uint32_t ce)
208 {
209     uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
210 
211     // This is clean, but somewhat slow...
212     // We could apply the mask to ce and then
213     // just get all three orders...
214     switch(strength) {
215     default:
216         tertiary = ucol_tertiaryOrder(ce);
217         /* note fall-through */
218 
219     case UCOL_SECONDARY:
220         secondary = ucol_secondaryOrder(ce);
221         /* note fall-through */
222 
223     case UCOL_PRIMARY:
224         primary = ucol_primaryOrder(ce);
225     }
226 
227     // **** This should probably handle continuations too.  ****
228     // **** That means that we need 24 bits for the primary ****
229     // **** instead of the 16 that we're currently using.   ****
230     // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
231     // **** Another complication with continuations is that ****
232     // **** the *second* CE is marked as a continuation, so ****
233     // **** we always have to peek ahead to know how long   ****
234     // **** the primary is...                               ****
235     if ((toShift && variableTop > ce && primary != 0)
236                 || (isShifted && primary == 0)) {
237 
238         if (primary == 0) {
239             return UCOL_IGNORABLE;
240         }
241 
242         if (strength >= UCOL_QUATERNARY) {
243             quaternary = primary;
244         }
245 
246         primary = secondary = tertiary = 0;
247         isShifted = TRUE;
248     } else {
249         if (strength >= UCOL_QUATERNARY) {
250             quaternary = 0xFFFF;
251         }
252 
253         isShifted = FALSE;
254     }
255 
256     return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
257 }
258 
259 U_NAMESPACE_END
260 
261 /* public methods ---------------------------------------------------- */
262 
263 U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator * coll,const UChar * text,int32_t textLength,UErrorCode * status)264 ucol_openElements(const UCollator  *coll,
265                   const UChar      *text,
266                         int32_t    textLength,
267                         UErrorCode *status)
268 {
269     if (U_FAILURE(*status)) {
270         return NULL;
271     }
272     if (coll == NULL || (text == NULL && textLength != 0)) {
273         *status = U_ILLEGAL_ARGUMENT_ERROR;
274         return NULL;
275     }
276     const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
277     if (rbc == NULL) {
278         *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
279         return NULL;
280     }
281 
282     UnicodeString s((UBool)(textLength < 0), text, textLength);
283     CollationElementIterator *cei = rbc->createCollationElementIterator(s);
284     if (cei == NULL) {
285         *status = U_MEMORY_ALLOCATION_ERROR;
286         return NULL;
287     }
288 
289     return cei->toUCollationElements();
290 }
291 
292 
293 U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements * elems)294 ucol_closeElements(UCollationElements *elems)
295 {
296     delete CollationElementIterator::fromUCollationElements(elems);
297 }
298 
299 U_CAPI void U_EXPORT2
ucol_reset(UCollationElements * elems)300 ucol_reset(UCollationElements *elems)
301 {
302     CollationElementIterator::fromUCollationElements(elems)->reset();
303 }
304 
305 U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements * elems,UErrorCode * status)306 ucol_next(UCollationElements *elems,
307           UErrorCode         *status)
308 {
309     if (U_FAILURE(*status)) {
310         return UCOL_NULLORDER;
311     }
312 
313     return CollationElementIterator::fromUCollationElements(elems)->next(*status);
314 }
315 
316 U_NAMESPACE_BEGIN
317 
318 int64_t
nextProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)319 UCollationPCE::nextProcessed(
320                    int32_t            *ixLow,
321                    int32_t            *ixHigh,
322                    UErrorCode         *status)
323 {
324     int64_t result = UCOL_IGNORABLE;
325     uint32_t low = 0, high = 0;
326 
327     if (U_FAILURE(*status)) {
328         return UCOL_PROCESSED_NULLORDER;
329     }
330 
331     pceBuffer.reset();
332 
333     do {
334         low = cei->getOffset();
335         int32_t ce = cei->next(*status);
336         high = cei->getOffset();
337 
338         if (ce == UCOL_NULLORDER) {
339              result = UCOL_PROCESSED_NULLORDER;
340              break;
341         }
342 
343         result = processCE((uint32_t)ce);
344     } while (result == UCOL_IGNORABLE);
345 
346     if (ixLow != NULL) {
347         *ixLow = low;
348     }
349 
350     if (ixHigh != NULL) {
351         *ixHigh = high;
352     }
353 
354     return result;
355 }
356 
357 U_NAMESPACE_END
358 
359 U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements * elems,UErrorCode * status)360 ucol_previous(UCollationElements *elems,
361               UErrorCode         *status)
362 {
363     if(U_FAILURE(*status)) {
364         return UCOL_NULLORDER;
365     }
366     return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
367 }
368 
369 U_NAMESPACE_BEGIN
370 
371 int64_t
previousProcessed(int32_t * ixLow,int32_t * ixHigh,UErrorCode * status)372 UCollationPCE::previousProcessed(
373                    int32_t            *ixLow,
374                    int32_t            *ixHigh,
375                    UErrorCode         *status)
376 {
377     int64_t result = UCOL_IGNORABLE;
378     int32_t  low = 0, high = 0;
379 
380     if (U_FAILURE(*status)) {
381         return UCOL_PROCESSED_NULLORDER;
382     }
383 
384     // pceBuffer.reset();
385 
386     while (pceBuffer.empty()) {
387         // buffer raw CEs up to non-ignorable primary
388         RCEBuffer rceb;
389         int32_t ce;
390 
391         // **** do we need to reset rceb, or will it always be empty at this point ****
392         do {
393             high = cei->getOffset();
394             ce   = cei->previous(*status);
395             low  = cei->getOffset();
396 
397             if (ce == UCOL_NULLORDER) {
398                 if (! rceb.empty()) {
399                     break;
400                 }
401 
402                 goto finish;
403             }
404 
405             rceb.put((uint32_t)ce, low, high);
406         } while ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce));
407 
408         // process the raw CEs
409         while (! rceb.empty()) {
410             const RCEI *rcei = rceb.get();
411 
412             result = processCE(rcei->ce);
413 
414             if (result != UCOL_IGNORABLE) {
415                 pceBuffer.put(result, rcei->low, rcei->high);
416             }
417         }
418     }
419 
420 finish:
421     if (pceBuffer.empty()) {
422         // **** Is -1 the right value for ixLow, ixHigh? ****
423     	if (ixLow != NULL) {
424     		*ixLow = -1;
425     	}
426 
427     	if (ixHigh != NULL) {
428     		*ixHigh = -1
429     		;
430     	}
431         return UCOL_PROCESSED_NULLORDER;
432     }
433 
434     const PCEI *pcei = pceBuffer.get();
435 
436     if (ixLow != NULL) {
437         *ixLow = pcei->low;
438     }
439 
440     if (ixHigh != NULL) {
441         *ixHigh = pcei->high;
442     }
443 
444     return pcei->ce;
445 }
446 
447 U_NAMESPACE_END
448 
449 U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements * elems,int32_t order)450 ucol_getMaxExpansion(const UCollationElements *elems,
451                            int32_t            order)
452 {
453     return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
454 
455     // TODO: The old code masked the order according to strength and then did a binary search.
456     // However this was probably at least partially broken because of the following comment.
457     // Still, it might have found a match when this version may not.
458 
459     // FIXME: with a masked search, there might be more than one hit,
460     // so we need to look forward and backward from the match to find all
461     // of the hits...
462 }
463 
464 U_CAPI void U_EXPORT2
ucol_setText(UCollationElements * elems,const UChar * text,int32_t textLength,UErrorCode * status)465 ucol_setText(      UCollationElements *elems,
466              const UChar              *text,
467                    int32_t            textLength,
468                    UErrorCode         *status)
469 {
470     if (U_FAILURE(*status)) {
471         return;
472     }
473 
474     if ((text == NULL && textLength != 0)) {
475         *status = U_ILLEGAL_ARGUMENT_ERROR;
476         return;
477     }
478     UnicodeString s((UBool)(textLength < 0), text, textLength);
479     return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
480 }
481 
482 U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements * elems)483 ucol_getOffset(const UCollationElements *elems)
484 {
485     return CollationElementIterator::fromUCollationElements(elems)->getOffset();
486 }
487 
488 U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements * elems,int32_t offset,UErrorCode * status)489 ucol_setOffset(UCollationElements    *elems,
490                int32_t           offset,
491                UErrorCode            *status)
492 {
493     if (U_FAILURE(*status)) {
494         return;
495     }
496 
497     CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
498 }
499 
500 U_CAPI int32_t U_EXPORT2
ucol_primaryOrder(int32_t order)501 ucol_primaryOrder (int32_t order)
502 {
503     return (order >> 16) & 0xffff;
504 }
505 
506 U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder(int32_t order)507 ucol_secondaryOrder (int32_t order)
508 {
509     return (order >> 8) & 0xff;
510 }
511 
512 U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder(int32_t order)513 ucol_tertiaryOrder (int32_t order)
514 {
515     return order & 0xff;
516 }
517 
518 #endif /* #if !UCONFIG_NO_COLLATION */
519