1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7
8 /*
9 * File coleitr.cpp
10 *
11 * Created by: Helena Shih
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 *
17 * 6/23/97 helena Adding comments to make code more readable.
18 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
19 * 12/10/99 aliu Ported Thai collation support from Java.
20 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
21 * 02/19/01 swquek Removed CollationElementIterator() since it is
22 * private constructor and no calls are made to it
23 * 2012-2014 markus Rewritten in C++ again.
24 */
25
26 #include "unicode/utypes.h"
27
28 #if !UCONFIG_NO_COLLATION
29
30 #include "unicode/coleitr.h"
31 #include "unicode/tblcoll.h"
32 #include "unicode/ustring.h"
33 #include "cmemory.h"
34 #include "collation.h"
35 #include "collationdata.h"
36 #include "collationiterator.h"
37 #include "collationsets.h"
38 #include "collationtailoring.h"
39 #include "uassert.h"
40 #include "uhash.h"
41 #include "utf16collationiterator.h"
42 #include "uvectr32.h"
43
44 /* Constants --------------------------------------------------------------- */
45
46 U_NAMESPACE_BEGIN
47
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)48 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
49
50 /* CollationElementIterator public constructor/destructor ------------------ */
51
52 CollationElementIterator::CollationElementIterator(
53 const CollationElementIterator& other)
54 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
55 *this = other;
56 }
57
~CollationElementIterator()58 CollationElementIterator::~CollationElementIterator()
59 {
60 delete iter_;
61 delete offsets_;
62 }
63
64 /* CollationElementIterator public methods --------------------------------- */
65
66 namespace {
67
getFirstHalf(uint32_t p,uint32_t lower32)68 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
69 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
70 }
getSecondHalf(uint32_t p,uint32_t lower32)71 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
72 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
73 }
ceNeedsTwoParts(int64_t ce)74 UBool ceNeedsTwoParts(int64_t ce) {
75 return (ce & INT64_C(0xffff00ff003f)) != 0;
76 }
77
78 } // namespace
79
getOffset() const80 int32_t CollationElementIterator::getOffset() const
81 {
82 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
83 // CollationIterator::previousCE() decrements the CEs length
84 // while it pops CEs from its internal buffer.
85 int32_t i = iter_->getCEsLength();
86 if (otherHalf_ != 0) {
87 // Return the trailing CE offset while we are in the middle of a 64-bit CE.
88 ++i;
89 }
90 U_ASSERT(i < offsets_->size());
91 return offsets_->elementAti(i);
92 }
93 return iter_->getOffset();
94 }
95
96 /**
97 * Get the ordering priority of the next character in the string.
98 * @return the next character's ordering. Returns NULLORDER if an error has
99 * occured or if the end of string has been reached
100 */
next(UErrorCode & status)101 int32_t CollationElementIterator::next(UErrorCode& status)
102 {
103 if (U_FAILURE(status)) { return NULLORDER; }
104 if (dir_ > 1) {
105 // Continue forward iteration. Test this first.
106 if (otherHalf_ != 0) {
107 uint32_t oh = otherHalf_;
108 otherHalf_ = 0;
109 return oh;
110 }
111 } else if (dir_ == 1) {
112 // next() after setOffset()
113 dir_ = 2;
114 } else if (dir_ == 0) {
115 // The iter_ is already reset to the start of the text.
116 dir_ = 2;
117 } else /* dir_ < 0 */ {
118 // illegal change of direction
119 status = U_INVALID_STATE_ERROR;
120 return NULLORDER;
121 }
122 // No need to keep all CEs in the buffer when we iterate.
123 iter_->clearCEsIfNoneRemaining();
124 int64_t ce = iter_->nextCE(status);
125 if (ce == Collation::NO_CE) { return NULLORDER; }
126 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
127 uint32_t p = (uint32_t)(ce >> 32);
128 uint32_t lower32 = (uint32_t)ce;
129 uint32_t firstHalf = getFirstHalf(p, lower32);
130 uint32_t secondHalf = getSecondHalf(p, lower32);
131 if (secondHalf != 0) {
132 otherHalf_ = secondHalf | 0xc0; // continuation CE
133 }
134 return firstHalf;
135 }
136
operator !=(const CollationElementIterator & other) const137 UBool CollationElementIterator::operator!=(
138 const CollationElementIterator& other) const
139 {
140 return !(*this == other);
141 }
142
operator ==(const CollationElementIterator & that) const143 UBool CollationElementIterator::operator==(
144 const CollationElementIterator& that) const
145 {
146 if (this == &that) {
147 return TRUE;
148 }
149
150 return
151 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
152 otherHalf_ == that.otherHalf_ &&
153 normalizeDir() == that.normalizeDir() &&
154 string_ == that.string_ &&
155 *iter_ == *that.iter_;
156 }
157
158 /**
159 * Get the ordering priority of the previous collation element in the string.
160 * @param status the error code status.
161 * @return the previous element's ordering. Returns NULLORDER if an error has
162 * occured or if the start of string has been reached.
163 */
previous(UErrorCode & status)164 int32_t CollationElementIterator::previous(UErrorCode& status)
165 {
166 if (U_FAILURE(status)) { return NULLORDER; }
167 if (dir_ < 0) {
168 // Continue backwards iteration. Test this first.
169 if (otherHalf_ != 0) {
170 uint32_t oh = otherHalf_;
171 otherHalf_ = 0;
172 return oh;
173 }
174 } else if (dir_ == 0) {
175 iter_->resetToOffset(string_.length());
176 dir_ = -1;
177 } else if (dir_ == 1) {
178 // previous() after setOffset()
179 dir_ = -1;
180 } else /* dir_ > 1 */ {
181 // illegal change of direction
182 status = U_INVALID_STATE_ERROR;
183 return NULLORDER;
184 }
185 if (offsets_ == NULL) {
186 offsets_ = new UVector32(status);
187 if (offsets_ == NULL) {
188 status = U_MEMORY_ALLOCATION_ERROR;
189 return NULLORDER;
190 }
191 }
192 // If we already have expansion CEs, then we also have offsets.
193 // Otherwise remember the trailing offset in case we need to
194 // write offsets for an artificial expansion.
195 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
196 int64_t ce = iter_->previousCE(*offsets_, status);
197 if (ce == Collation::NO_CE) { return NULLORDER; }
198 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
199 uint32_t p = (uint32_t)(ce >> 32);
200 uint32_t lower32 = (uint32_t)ce;
201 uint32_t firstHalf = getFirstHalf(p, lower32);
202 uint32_t secondHalf = getSecondHalf(p, lower32);
203 if (secondHalf != 0) {
204 if (offsets_->isEmpty()) {
205 // When we convert a single 64-bit CE into two 32-bit CEs,
206 // we need to make this artificial expansion behave like a normal expansion.
207 // See CollationIterator::previousCE().
208 offsets_->addElement(iter_->getOffset(), status);
209 offsets_->addElement(limitOffset, status);
210 }
211 otherHalf_ = firstHalf;
212 return secondHalf | 0xc0; // continuation CE
213 }
214 return firstHalf;
215 }
216
217 /**
218 * Resets the cursor to the beginning of the string.
219 */
reset()220 void CollationElementIterator::reset()
221 {
222 iter_ ->resetToOffset(0);
223 otherHalf_ = 0;
224 dir_ = 0;
225 }
226
setOffset(int32_t newOffset,UErrorCode & status)227 void CollationElementIterator::setOffset(int32_t newOffset,
228 UErrorCode& status)
229 {
230 if (U_FAILURE(status)) { return; }
231 if (0 < newOffset && newOffset < string_.length()) {
232 int32_t offset = newOffset;
233 do {
234 UChar c = string_.charAt(offset);
235 if (!rbc_->isUnsafe(c) ||
236 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
237 break;
238 }
239 // Back up to before this unsafe character.
240 --offset;
241 } while (offset > 0);
242 if (offset < newOffset) {
243 // We might have backed up more than necessary.
244 // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
245 // but for text "chu" setOffset(2) should remain at 2
246 // although we initially back up to offset 0.
247 // Find the last safe offset no greater than newOffset by iterating forward.
248 int32_t lastSafeOffset = offset;
249 do {
250 iter_->resetToOffset(lastSafeOffset);
251 do {
252 iter_->nextCE(status);
253 if (U_FAILURE(status)) { return; }
254 } while ((offset = iter_->getOffset()) == lastSafeOffset);
255 if (offset <= newOffset) {
256 lastSafeOffset = offset;
257 }
258 } while (offset < newOffset);
259 newOffset = lastSafeOffset;
260 }
261 }
262 iter_->resetToOffset(newOffset);
263 otherHalf_ = 0;
264 dir_ = 1;
265 }
266
267 /**
268 * Sets the source to the new source string.
269 */
setText(const UnicodeString & source,UErrorCode & status)270 void CollationElementIterator::setText(const UnicodeString& source,
271 UErrorCode& status)
272 {
273 if (U_FAILURE(status)) {
274 return;
275 }
276
277 string_ = source;
278 const UChar *s = string_.getBuffer();
279 CollationIterator *newIter;
280 UBool numeric = rbc_->settings->isNumeric();
281 if (rbc_->settings->dontCheckFCD()) {
282 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
283 } else {
284 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
285 }
286 if (newIter == NULL) {
287 status = U_MEMORY_ALLOCATION_ERROR;
288 return;
289 }
290 delete iter_;
291 iter_ = newIter;
292 otherHalf_ = 0;
293 dir_ = 0;
294 }
295
296 // Sets the source to the new character iterator.
setText(CharacterIterator & source,UErrorCode & status)297 void CollationElementIterator::setText(CharacterIterator& source,
298 UErrorCode& status)
299 {
300 if (U_FAILURE(status))
301 return;
302
303 source.getText(string_);
304 setText(string_, status);
305 }
306
strengthOrder(int32_t order) const307 int32_t CollationElementIterator::strengthOrder(int32_t order) const
308 {
309 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
310 // Mask off the unwanted differences.
311 if (s == UCOL_PRIMARY) {
312 order &= 0xffff0000;
313 }
314 else if (s == UCOL_SECONDARY) {
315 order &= 0xffffff00;
316 }
317
318 return order;
319 }
320
321 /* CollationElementIterator private constructors/destructors --------------- */
322
323 /**
324 * This is the "real" constructor for this class; it constructs an iterator
325 * over the source text using the specified collator
326 */
CollationElementIterator(const UnicodeString & source,const RuleBasedCollator * coll,UErrorCode & status)327 CollationElementIterator::CollationElementIterator(
328 const UnicodeString &source,
329 const RuleBasedCollator *coll,
330 UErrorCode &status)
331 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
332 setText(source, status);
333 }
334
335 /**
336 * This is the "real" constructor for this class; it constructs an iterator over
337 * the source text using the specified collator
338 */
CollationElementIterator(const CharacterIterator & source,const RuleBasedCollator * coll,UErrorCode & status)339 CollationElementIterator::CollationElementIterator(
340 const CharacterIterator &source,
341 const RuleBasedCollator *coll,
342 UErrorCode &status)
343 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
344 // We only call source.getText() which should be const anyway.
345 setText(const_cast<CharacterIterator &>(source), status);
346 }
347
348 /* CollationElementIterator private methods -------------------------------- */
349
operator =(const CollationElementIterator & other)350 const CollationElementIterator& CollationElementIterator::operator=(
351 const CollationElementIterator& other)
352 {
353 if (this == &other) {
354 return *this;
355 }
356
357 CollationIterator *newIter;
358 const FCDUTF16CollationIterator *otherFCDIter =
359 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
360 if(otherFCDIter != NULL) {
361 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
362 } else {
363 const UTF16CollationIterator *otherIter =
364 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
365 if(otherIter != NULL) {
366 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
367 } else {
368 newIter = NULL;
369 }
370 }
371 if(newIter != NULL) {
372 delete iter_;
373 iter_ = newIter;
374 rbc_ = other.rbc_;
375 otherHalf_ = other.otherHalf_;
376 dir_ = other.dir_;
377
378 string_ = other.string_;
379 }
380 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
381 UErrorCode errorCode = U_ZERO_ERROR;
382 if(offsets_ == NULL) {
383 offsets_ = new UVector32(other.offsets_->size(), errorCode);
384 }
385 if(offsets_ != NULL) {
386 offsets_->assign(*other.offsets_, errorCode);
387 }
388 }
389 return *this;
390 }
391
392 namespace {
393
394 class MaxExpSink : public ContractionsAndExpansions::CESink {
395 public:
MaxExpSink(UHashtable * h,UErrorCode & ec)396 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
397 virtual ~MaxExpSink();
handleCE(int64_t)398 virtual void handleCE(int64_t /*ce*/) {}
handleExpansion(const int64_t ces[],int32_t length)399 virtual void handleExpansion(const int64_t ces[], int32_t length) {
400 if (length <= 1) {
401 // We do not need to add single CEs into the map.
402 return;
403 }
404 int32_t count = 0; // number of CE "halves"
405 for (int32_t i = 0; i < length; ++i) {
406 count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
407 }
408 // last "half" of the last CE
409 int64_t ce = ces[length - 1];
410 uint32_t p = (uint32_t)(ce >> 32);
411 uint32_t lower32 = (uint32_t)ce;
412 uint32_t lastHalf = getSecondHalf(p, lower32);
413 if (lastHalf == 0) {
414 lastHalf = getFirstHalf(p, lower32);
415 U_ASSERT(lastHalf != 0);
416 } else {
417 lastHalf |= 0xc0; // old-style continuation CE
418 }
419 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
420 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
421 }
422 }
423
424 private:
425 UHashtable *maxExpansions;
426 UErrorCode &errorCode;
427 };
428
~MaxExpSink()429 MaxExpSink::~MaxExpSink() {}
430
431 } // namespace
432
433 UHashtable *
computeMaxExpansions(const CollationData * data,UErrorCode & errorCode)434 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
435 if (U_FAILURE(errorCode)) { return NULL; }
436 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
437 uhash_compareLong, &errorCode);
438 if (U_FAILURE(errorCode)) { return NULL; }
439 MaxExpSink sink(maxExpansions, errorCode);
440 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
441 if (U_FAILURE(errorCode)) {
442 uhash_close(maxExpansions);
443 return NULL;
444 }
445 return maxExpansions;
446 }
447
448 int32_t
getMaxExpansion(int32_t order) const449 CollationElementIterator::getMaxExpansion(int32_t order) const {
450 return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
451 }
452
453 int32_t
getMaxExpansion(const UHashtable * maxExpansions,int32_t order)454 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
455 if (order == 0) { return 1; }
456 int32_t max;
457 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
458 return max;
459 }
460 if ((order & 0xc0) == 0xc0) {
461 // old-style continuation CE
462 return 2;
463 } else {
464 return 1;
465 }
466 }
467
468 U_NAMESPACE_END
469
470 #endif /* #if !UCONFIG_NO_COLLATION */
471