1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1997-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 */
9
10 /**
11 * \file
12 * \brief C++ API: Collation Element Iterator.
13 */
14
15 /**
16 * File coleitr.h
17 *
18 * Created by: Helena Shih
19 *
20 * Modification History:
21 *
22 * Date Name Description
23 *
24 * 8/18/97 helena Added internal API documentation.
25 * 08/03/98 erm Synched with 1.2 version CollationElementIterator.java
26 * 12/10/99 aliu Ported Thai collation support from Java.
27 * 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h)
28 * 02/19/01 swquek Removed CollationElementsIterator() since it is
29 * private constructor and no calls are made to it
30 * 2012-2014 markus Rewritten in C++ again.
31 */
32
33 #ifndef COLEITR_H
34 #define COLEITR_H
35
36 #include "unicode/utypes.h"
37
38 #if U_SHOW_CPLUSPLUS_API
39
40 #if !UCONFIG_NO_COLLATION
41
42 #include "unicode/unistr.h"
43 #include "unicode/uobject.h"
44
45 struct UCollationElements;
46 struct UHashtable;
47
48 U_NAMESPACE_BEGIN
49
50 struct CollationData;
51
52 class CharacterIterator;
53 class CollationIterator;
54 class RuleBasedCollator;
55 class UCollationPCE;
56 class UVector32;
57
58 /**
59 * The CollationElementIterator class is used as an iterator to walk through
60 * each character of an international string. Use the iterator to return the
61 * ordering priority of the positioned character. The ordering priority of a
62 * character, which we refer to as a key, defines how a character is collated in
63 * the given collation object.
64 * For example, consider the following in Slovak and in traditional Spanish collation:
65 * <pre>
66 * "ca" -> the first key is key('c') and second key is key('a').
67 * "cha" -> the first key is key('ch') and second key is key('a').</pre>
68 * And in German phonebook collation,
69 * <pre> \htmlonly "æb"-> the first key is key('a'), the second key is key('e'), and
70 * the third key is key('b'). \endhtmlonly </pre>
* The key of a character is an integer composed of a primary order (short),
* a secondary order (char), and a tertiary order (char). Java strictly defines the
* size and signedness of its primitive data types. Therefore, the static
* functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
* int32_t to ensure the correctness of the key value.
76 * <p>Example of the iterator usage: (without error checking)
77 * <pre>
78 * \code
79 * void CollationElementIterator_Example()
80 * {
81 * UnicodeString str = "This is a test";
82 * UErrorCode success = U_ZERO_ERROR;
83 * RuleBasedCollator* rbc =
84 * (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
85 * CollationElementIterator* c =
86 * rbc->createCollationElementIterator( str );
87 * int32_t order = c->next(success);
88 * c->reset();
89 * order = c->previous(success);
90 * delete c;
91 * delete rbc;
92 * }
93 * \endcode
94 * </pre>
95 * <p>
96 * The method next() returns the collation order of the next character based on
97 * the comparison level of the collator. The method previous() returns the
98 * collation order of the previous character based on the comparison level of
* the collator. The Collation Element Iterator moves only in one direction
* between calls to reset(), setOffset(), or setText(). That is, calls to next()
* and previous() cannot be intermixed. Whenever previous() is to be called after
* next() or vice versa, reset(), setOffset() or setText() has to be called first
* to reset the status, shifting pointers to either the end or the start of
* the string (reset() or setText()), or the specified position (setOffset()).
* Hence at the next call of next() or previous(), the first or last collation order,
* or the collation order at the specified position, will be returned. If a change of
* direction is done without one of these calls, the result is undefined.
108 * <p>
* The result of a forward iteration (next()) and the reversed result of a backward
* iteration (previous()) on the same string are equivalent, if collation orders
* with the value 0 are ignored.
* Each collation order is determined for a character based on the comparison
* level of the collator. A collation order consists of a primary order, a
* secondary order and a tertiary order. The data type of the collation order is
* <strong>int32_t</strong>.
115 *
116 * Note, CollationElementIterator should not be subclassed.
117 * @see Collator
118 * @see RuleBasedCollator
119 * @version 1.8 Jan 16 2001
120 */
121 class U_I18N_API CollationElementIterator U_FINAL : public UObject {
122 public:
123
124 // CollationElementIterator public data member ------------------------------
125
126 enum {
127 /**
128 * NULLORDER indicates that an error has occured while processing
129 * @stable ICU 2.0
130 */
131 NULLORDER = (int32_t)0xffffffff
132 };
133
134 // CollationElementIterator public constructor/destructor -------------------
135
136 /**
137 * Copy constructor.
138 *
139 * @param other the object to be copied from
140 * @stable ICU 2.0
141 */
142 CollationElementIterator(const CollationElementIterator& other);
143
144 /**
145 * Destructor
146 * @stable ICU 2.0
147 */
148 virtual ~CollationElementIterator();
149
150 // CollationElementIterator public methods ----------------------------------
151
152 /**
153 * Returns true if "other" is the same as "this"
154 *
155 * @param other the object to be compared
156 * @return true if "other" is the same as "this"
157 * @stable ICU 2.0
158 */
159 UBool operator==(const CollationElementIterator& other) const;
160
161 /**
162 * Returns true if "other" is not the same as "this".
163 *
164 * @param other the object to be compared
165 * @return true if "other" is not the same as "this"
166 * @stable ICU 2.0
167 */
168 UBool operator!=(const CollationElementIterator& other) const;
169
170 /**
171 * Resets the cursor to the beginning of the string.
172 * @stable ICU 2.0
173 */
174 void reset(void);
175
176 /**
177 * Gets the ordering priority of the next character in the string.
178 * @param status the error code status.
179 * @return the next character's ordering. otherwise returns NULLORDER if an
180 * error has occured or if the end of string has been reached
181 * @stable ICU 2.0
182 */
183 int32_t next(UErrorCode& status);
184
185 /**
186 * Get the ordering priority of the previous collation element in the string.
187 * @param status the error code status.
188 * @return the previous element's ordering. otherwise returns NULLORDER if an
189 * error has occured or if the start of string has been reached
190 * @stable ICU 2.0
191 */
192 int32_t previous(UErrorCode& status);
193
194 /**
195 * Gets the primary order of a collation order.
196 * @param order the collation order
197 * @return the primary order of a collation order.
198 * @stable ICU 2.0
199 */
200 static inline int32_t primaryOrder(int32_t order);
201
202 /**
203 * Gets the secondary order of a collation order.
204 * @param order the collation order
205 * @return the secondary order of a collation order.
206 * @stable ICU 2.0
207 */
208 static inline int32_t secondaryOrder(int32_t order);
209
210 /**
211 * Gets the tertiary order of a collation order.
212 * @param order the collation order
213 * @return the tertiary order of a collation order.
214 * @stable ICU 2.0
215 */
216 static inline int32_t tertiaryOrder(int32_t order);
217
218 /**
219 * Return the maximum length of any expansion sequences that end with the
220 * specified comparison order.
221 * @param order a collation order returned by previous or next.
222 * @return maximum size of the expansion sequences ending with the collation
223 * element or 1 if collation element does not occur at the end of any
224 * expansion sequence
225 * @stable ICU 2.0
226 */
227 int32_t getMaxExpansion(int32_t order) const;
228
229 /**
230 * Gets the comparison order in the desired strength. Ignore the other
231 * differences.
232 * @param order The order value
233 * @stable ICU 2.0
234 */
235 int32_t strengthOrder(int32_t order) const;
236
237 /**
238 * Sets the source string.
239 * @param str the source string.
240 * @param status the error code status.
241 * @stable ICU 2.0
242 */
243 void setText(const UnicodeString& str, UErrorCode& status);
244
245 /**
246 * Sets the source string.
247 * @param str the source character iterator.
248 * @param status the error code status.
249 * @stable ICU 2.0
250 */
251 void setText(CharacterIterator& str, UErrorCode& status);
252
253 /**
254 * Checks if a comparison order is ignorable.
255 * @param order the collation order.
256 * @return TRUE if a character is ignorable, FALSE otherwise.
257 * @stable ICU 2.0
258 */
259 static inline UBool isIgnorable(int32_t order);
260
261 /**
262 * Gets the offset of the currently processed character in the source string.
263 * @return the offset of the character.
264 * @stable ICU 2.0
265 */
266 int32_t getOffset(void) const;
267
268 /**
269 * Sets the offset of the currently processed character in the source string.
270 * @param newOffset the new offset.
271 * @param status the error code status.
272 * @return the offset of the character.
273 * @stable ICU 2.0
274 */
275 void setOffset(int32_t newOffset, UErrorCode& status);
276
277 /**
278 * ICU "poor man's RTTI", returns a UClassID for the actual class.
279 *
280 * @stable ICU 2.2
281 */
282 virtual UClassID getDynamicClassID() const;
283
284 /**
285 * ICU "poor man's RTTI", returns a UClassID for this class.
286 *
287 * @stable ICU 2.2
288 */
289 static UClassID U_EXPORT2 getStaticClassID();
290
291 #ifndef U_HIDE_INTERNAL_API
292 /** @internal */
fromUCollationElements(UCollationElements * uc)293 static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
294 return reinterpret_cast<CollationElementIterator *>(uc);
295 }
296 /** @internal */
fromUCollationElements(const UCollationElements * uc)297 static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
298 return reinterpret_cast<const CollationElementIterator *>(uc);
299 }
300 /** @internal */
toUCollationElements()301 inline UCollationElements *toUCollationElements() {
302 return reinterpret_cast<UCollationElements *>(this);
303 }
304 /** @internal */
toUCollationElements()305 inline const UCollationElements *toUCollationElements() const {
306 return reinterpret_cast<const UCollationElements *>(this);
307 }
308 #endif // U_HIDE_INTERNAL_API
309
310 private:
311 friend class RuleBasedCollator;
312 friend class UCollationPCE;
313
314 /**
315 * CollationElementIterator constructor. This takes the source string and the
316 * collation object. The cursor will walk thru the source string based on the
317 * predefined collation rules. If the source string is empty, NULLORDER will
318 * be returned on the calls to next().
319 * @param sourceText the source string.
320 * @param order the collation object.
321 * @param status the error code status.
322 */
323 CollationElementIterator(const UnicodeString& sourceText,
324 const RuleBasedCollator* order, UErrorCode& status);
325 // Note: The constructors should take settings & tailoring, not a collator,
326 // to avoid circular dependencies.
327 // However, for operator==() we would need to be able to compare tailoring data for equality
328 // without making CollationData or CollationTailoring depend on TailoredSet.
329 // (See the implementation of RuleBasedCollator::operator==().)
330 // That might require creating an intermediate class that would be used
331 // by both CollationElementIterator and RuleBasedCollator
332 // but only contain the part of RBC== related to data and rules.
333
334 /**
335 * CollationElementIterator constructor. This takes the source string and the
336 * collation object. The cursor will walk thru the source string based on the
337 * predefined collation rules. If the source string is empty, NULLORDER will
338 * be returned on the calls to next().
339 * @param sourceText the source string.
340 * @param order the collation object.
341 * @param status the error code status.
342 */
343 CollationElementIterator(const CharacterIterator& sourceText,
344 const RuleBasedCollator* order, UErrorCode& status);
345
346 /**
347 * Assignment operator
348 *
349 * @param other the object to be copied
350 */
351 const CollationElementIterator&
352 operator=(const CollationElementIterator& other);
353
354 CollationElementIterator(); // default constructor not implemented
355
356 /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
normalizeDir()357 inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
358
359 static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
360
361 static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
362
363 // CollationElementIterator private data members ----------------------------
364
365 CollationIterator *iter_; // owned
366 const RuleBasedCollator *rbc_; // aliased
367 uint32_t otherHalf_;
368 /**
369 * <0: backwards; 0: just after reset() (previous() begins from end);
370 * 1: just after setOffset(); >1: forward
371 */
372 int8_t dir_;
373 /**
374 * Stores offsets from expansions and from unsafe-backwards iteration,
375 * so that getOffset() returns intermediate offsets for the CEs
376 * that are consistent with forward iteration.
377 */
378 UVector32 *offsets_;
379
380 UnicodeString string_;
381 };
382
383 // CollationElementIterator inline method definitions --------------------------
384
primaryOrder(int32_t order)385 inline int32_t CollationElementIterator::primaryOrder(int32_t order)
386 {
387 return (order >> 16) & 0xffff;
388 }
389
secondaryOrder(int32_t order)390 inline int32_t CollationElementIterator::secondaryOrder(int32_t order)
391 {
392 return (order >> 8) & 0xff;
393 }
394
tertiaryOrder(int32_t order)395 inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
396 {
397 return order & 0xff;
398 }
399
isIgnorable(int32_t order)400 inline UBool CollationElementIterator::isIgnorable(int32_t order)
401 {
402 return (order & 0xffff0000) == 0;
403 }
404
405 U_NAMESPACE_END
406
407 #endif /* #if !UCONFIG_NO_COLLATION */
408
409 #endif /* U_SHOW_CPLUSPLUS_API */
410
411 #endif
412