• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 */
7 
8 /**
9  * \file
10  * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class.
11  */
12 
13 /**
14 * File tblcoll.h
15 *
16 * Created by: Helena Shih
17 *
18 * Modification History:
19 *
20 *  Date        Name        Description
21 *  2/5/97      aliu        Added streamIn and streamOut methods.  Added
22 *                          constructor which reads RuleBasedCollator object from
23 *                          a binary file.  Added writeToFile method which streams
24 *                          RuleBasedCollator out to a binary file.  The streamIn
25 *                          and streamOut methods use istream and ostream objects
26 *                          in binary mode.
27 *  2/12/97     aliu        Modified to use TableCollationData sub-object to
28 *                          hold invariant data.
29 *  2/13/97     aliu        Moved several methods into this class from Collation.
30 *                          Added a private RuleBasedCollator(Locale&) constructor,
31 *                          to be used by Collator::createDefault().  General
32 *                          clean up.
33 *  2/20/97     helena      Added clone, operator==, operator!=, operator=, and copy
34 *                          constructor and getDynamicClassID.
35 *  3/5/97      aliu        Modified constructFromFile() to add parameter
36 *                          specifying whether or not binary loading is to be
37 *                          attempted.  This is required for dynamic rule loading.
38 * 05/07/97     helena      Added memory allocation error detection.
39 *  6/17/97     helena      Added IDENTICAL strength for compare, changed getRules to
40 *                          use MergeCollation::getPattern.
41 *  6/20/97     helena      Java class name change.
42 *  8/18/97     helena      Added internal API documentation.
43 * 09/03/97     helena      Added createCollationKeyValues().
44 * 02/10/98     damiba      Added compare with "length" parameter
45 * 08/05/98     erm         Synched with 1.2 version of RuleBasedCollator.java
46 * 04/23/99     stephen     Removed EDecompositionMode, merged with
47 *                          Normalizer::EMode
48 * 06/14/99     stephen     Removed kResourceBundleSuffix
49 * 11/02/99     helena      Collator performance enhancements.  Eliminates the
50 *                          UnicodeString construction and special case for NO_OP.
51 * 11/23/99     srl         More performance enhancements. Updates to NormalizerIterator
52 *                          internal state management.
53 * 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
54 *                          to implementation file.
55 * 01/29/01     synwee      Modified into a C++ wrapper which calls C API
56 *                          (ucol.h)
57 * 2012-2014    markus      Rewritten in C++ again.
58 */
59 
60 #ifndef TBLCOLL_H
61 #define TBLCOLL_H
62 
63 #include "unicode/utypes.h"
64 
65 #if !UCONFIG_NO_COLLATION
66 
67 #include "unicode/coll.h"
68 #include "unicode/locid.h"
69 #include "unicode/uiter.h"
70 #include "unicode/ucol.h"
71 
72 U_NAMESPACE_BEGIN
73 
74 struct CollationData;
75 struct CollationSettings;
76 struct CollationTailoring;
77 /**
78 * @stable ICU 2.0
79 */
80 class StringSearch;
81 /**
82 * @stable ICU 2.0
83 */
84 class CollationElementIterator;
85 class CollationKey;
86 class SortKeyByteSink;
87 class UnicodeSet;
88 class UnicodeString;
89 class UVector64;
90 
91 /**
92  * The RuleBasedCollator class provides the implementation of
93  * Collator, using data-driven tables. The user can create a customized
94  * table-based collation.
95  * <p>
96  * For more information about the collation service see
97  * <a href="http://userguide.icu-project.org/collation">the User Guide</a>.
98  * <p>
99  * Collation service provides correct sorting orders for most locales supported in ICU.
100  * If specific data for a locale is not available, the order eventually falls back
101  * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
102  * <p>
103  * Sort ordering may be customized by providing your own set of rules. For more on
104  * this subject see the <a href="http://userguide.icu-project.org/collation/customization">
105  * Collation Customization</a> section of the User Guide.
106  * <p>
107  * Note, RuleBasedCollator is not to be subclassed.
108  * @see        Collator
109  */
110 class U_I18N_API RuleBasedCollator : public Collator {
111 public:
112     /**
113      * RuleBasedCollator constructor. This takes the table rules and builds a
114      * collation table out of them. Please see RuleBasedCollator class
115      * description for more details on the collation rule syntax.
116      * @param rules the collation rules to build the collation table from.
117      * @param status reporting a success or an error.
118      * @stable ICU 2.0
119      */
120     RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
121 
122     /**
123      * RuleBasedCollator constructor. This takes the table rules and builds a
124      * collation table out of them. Please see RuleBasedCollator class
125      * description for more details on the collation rule syntax.
126      * @param rules the collation rules to build the collation table from.
127      * @param collationStrength strength for comparison
128      * @param status reporting a success or an error.
129      * @stable ICU 2.0
130      */
131     RuleBasedCollator(const UnicodeString& rules,
132                        ECollationStrength collationStrength,
133                        UErrorCode& status);
134 
135     /**
136      * RuleBasedCollator constructor. This takes the table rules and builds a
137      * collation table out of them. Please see RuleBasedCollator class
138      * description for more details on the collation rule syntax.
139      * @param rules the collation rules to build the collation table from.
140      * @param decompositionMode the normalisation mode
141      * @param status reporting a success or an error.
142      * @stable ICU 2.0
143      */
144     RuleBasedCollator(const UnicodeString& rules,
145                     UColAttributeValue decompositionMode,
146                     UErrorCode& status);
147 
148     /**
149      * RuleBasedCollator constructor. This takes the table rules and builds a
150      * collation table out of them. Please see RuleBasedCollator class
151      * description for more details on the collation rule syntax.
152      * @param rules the collation rules to build the collation table from.
153      * @param collationStrength strength for comparison
154      * @param decompositionMode the normalisation mode
155      * @param status reporting a success or an error.
156      * @stable ICU 2.0
157      */
158     RuleBasedCollator(const UnicodeString& rules,
159                     ECollationStrength collationStrength,
160                     UColAttributeValue decompositionMode,
161                     UErrorCode& status);
162 
163 #ifndef U_HIDE_INTERNAL_API
164     /**
165      * TODO: document & propose as public API
166      * @internal
167      */
168     RuleBasedCollator(const UnicodeString &rules,
169                       UParseError &parseError, UnicodeString &reason,
170                       UErrorCode &errorCode);
171 #endif  /* U_HIDE_INTERNAL_API */
172 
173     /**
174      * Copy constructor.
175      * @param other the RuleBasedCollator object to be copied
176      * @stable ICU 2.0
177      */
178     RuleBasedCollator(const RuleBasedCollator& other);
179 
180 
181     /** Opens a collator from a collator binary image created using
182     *  cloneBinary. Binary image used in instantiation of the
183     *  collator remains owned by the user and should stay around for
184     *  the lifetime of the collator. The API also takes a base collator
185     *  which usually should be the root collator.
186     *  @param bin binary image owned by the user and required through the
187     *             lifetime of the collator
188     *  @param length size of the image. If negative, the API will try to
189     *                figure out the length of the image
190     *  @param base fallback collator, usually root. The base is required to be
191     *              present through the lifetime of the collator. Currently
192     *              it cannot be NULL.
193     *  @param status for catching errors
194     *  @return newly created collator
195     *  @see cloneBinary
196     *  @stable ICU 3.4
197     */
198     RuleBasedCollator(const uint8_t *bin, int32_t length,
199                     const RuleBasedCollator *base,
200                     UErrorCode &status);
201 
202     /**
203      * Destructor.
204      * @stable ICU 2.0
205      */
206     virtual ~RuleBasedCollator();
207 
208     /**
209      * Assignment operator.
210      * @param other other RuleBasedCollator object to copy from.
211      * @stable ICU 2.0
212      */
213     RuleBasedCollator& operator=(const RuleBasedCollator& other);
214 
215     /**
216      * Returns true if argument is the same as this object.
217      * @param other Collator object to be compared.
218      * @return true if the argument is the same as this object.
219      * @stable ICU 2.0
220      */
221     virtual UBool operator==(const Collator& other) const;
222 
223     /**
224      * Makes a copy of this object.
225      * @return a copy of this object, owned by the caller
226      * @stable ICU 2.0
227      */
228     virtual Collator* clone(void) const;
229 
230     /**
231      * Creates a collation element iterator for the source string. The caller of
232      * this method is responsible for the memory management of the return
233      * pointer.
234      * @param source the string over which the CollationElementIterator will
235      *        iterate.
236      * @return the collation element iterator of the source string using this as
237      *         the base Collator.
238      * @stable ICU 2.2
239      */
240     virtual CollationElementIterator* createCollationElementIterator(
241                                            const UnicodeString& source) const;
242 
243     /**
244      * Creates a collation element iterator for the source. The caller of this
245      * method is responsible for the memory management of the returned pointer.
246      * @param source the CharacterIterator which produces the characters over
247      *        which the CollationElementIterator will iterate.
248      * @return the collation element iterator of the source using this as the
249      *         base Collator.
250      * @stable ICU 2.2
251      */
252     virtual CollationElementIterator* createCollationElementIterator(
253                                          const CharacterIterator& source) const;
254 
255     // Make deprecated versions of Collator::compare() visible.
256     using Collator::compare;
257 
258     /**
259     * The comparison function compares the character data stored in two
260     * different strings. Returns information about whether a string is less
261     * than, greater than or equal to another string.
262     * @param source the source string to be compared with.
263     * @param target the string that is to be compared with the source string.
264     * @param status possible error code
265     * @return Returns an enum value. UCOL_GREATER if source is greater
266     * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
267     * than target
268     * @stable ICU 2.6
269     **/
270     virtual UCollationResult compare(const UnicodeString& source,
271                                      const UnicodeString& target,
272                                      UErrorCode &status) const;
273 
274     /**
275     * Does the same thing as compare but limits the comparison to a specified
276     * length
277     * @param source the source string to be compared with.
278     * @param target the string that is to be compared with the source string.
279     * @param length the length the comparison is limited to
280     * @param status possible error code
281     * @return Returns an enum value. UCOL_GREATER if source (up to the specified
282     *         length) is greater than target; UCOL_EQUAL if source (up to specified
283     *         length) is equal to target; UCOL_LESS if source (up to the specified
284     *         length) is less  than target.
285     * @stable ICU 2.6
286     */
287     virtual UCollationResult compare(const UnicodeString& source,
288                                      const UnicodeString& target,
289                                      int32_t length,
290                                      UErrorCode &status) const;
291 
292     /**
293     * The comparison function compares the character data stored in two
294     * different string arrays. Returns information about whether a string array
295     * is less than, greater than or equal to another string array.
296     * @param source the source string array to be compared with.
297     * @param sourceLength the length of the source string array.  If this value
298     *        is equal to -1, the string array is null-terminated.
299     * @param target the string that is to be compared with the source string.
300     * @param targetLength the length of the target string array.  If this value
301     *        is equal to -1, the string array is null-terminated.
302     * @param status possible error code
303     * @return Returns an enum value. UCOL_GREATER if source is greater
304     * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
305     * than target
306     * @stable ICU 2.6
307     */
308     virtual UCollationResult compare(const UChar* source, int32_t sourceLength,
309                                      const UChar* target, int32_t targetLength,
310                                      UErrorCode &status) const;
311 
312     /**
313      * Compares two strings using the Collator.
314      * Returns whether the first one compares less than/equal to/greater than
315      * the second one.
316      * This version takes UCharIterator input.
317      * @param sIter the first ("source") string iterator
318      * @param tIter the second ("target") string iterator
319      * @param status ICU status
320      * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
321      * @stable ICU 4.2
322      */
323     virtual UCollationResult compare(UCharIterator &sIter,
324                                      UCharIterator &tIter,
325                                      UErrorCode &status) const;
326 
327     /**
328      * Compares two UTF-8 strings using the Collator.
329      * Returns whether the first one compares less than/equal to/greater than
330      * the second one.
331      * This version takes UTF-8 input.
332      * Note that a StringPiece can be implicitly constructed
333      * from a std::string or a NUL-terminated const char * string.
334      * @param source the first UTF-8 string
335      * @param target the second UTF-8 string
336      * @param status ICU status
337      * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
338      * @stable ICU 51
339      */
340     virtual UCollationResult compareUTF8(const StringPiece &source,
341                                          const StringPiece &target,
342                                          UErrorCode &status) const;
343 
344     /**
345     * Transforms a specified region of the string into a series of characters
346     * that can be compared with CollationKey.compare. Use a CollationKey when
347     * you need to do repeated comparisons on the same string. For a single
348     * comparison the compare method will be faster.
349     * @param source the source string.
350     * @param key the transformed key of the source string.
351     * @param status the error code status.
352     * @return the transformed key.
353     * @see CollationKey
354     * @stable ICU 2.0
355     */
356     virtual CollationKey& getCollationKey(const UnicodeString& source,
357                                           CollationKey& key,
358                                           UErrorCode& status) const;
359 
360     /**
361     * Transforms a specified region of the string into a series of characters
362     * that can be compared with CollationKey.compare. Use a CollationKey when
363     * you need to do repeated comparisons on the same string. For a single
364     * comparison the compare method will be faster.
365     * @param source the source string.
366     * @param sourceLength the length of the source string.
367     * @param key the transformed key of the source string.
368     * @param status the error code status.
369     * @return the transformed key.
370     * @see CollationKey
371     * @stable ICU 2.0
372     */
373     virtual CollationKey& getCollationKey(const UChar *source,
374                                           int32_t sourceLength,
375                                           CollationKey& key,
376                                           UErrorCode& status) const;
377 
378     /**
379      * Generates the hash code for the rule-based collation object.
380      * @return the hash code.
381      * @stable ICU 2.0
382      */
383     virtual int32_t hashCode() const;
384 
385     /**
386     * Gets the locale of the Collator
387     * @param type can be either requested, valid or actual locale. For more
388     *             information see the definition of ULocDataLocaleType in
389     *             uloc.h
390     * @param status the error code status.
391     * @return locale where the collation data lives. If the collator
392     *         was instantiated from rules, locale is empty.
393     * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback
394     */
395     virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
396 
397     /**
398      * Gets the tailoring rules for this collator.
399      * @return the collation tailoring from which this collator was created
400      * @stable ICU 2.0
401      */
402     const UnicodeString& getRules() const;
403 
404     /**
405      * Gets the version information for a Collator.
406      * @param info the version # information, the result will be filled in
407      * @stable ICU 2.0
408      */
409     virtual void getVersion(UVersionInfo info) const;
410 
411 #ifndef U_HIDE_DEPRECATED_API
412     /**
413      * Returns the maximum length of any expansion sequences that end with the
414      * specified comparison order.
415      *
416      * This is specific to the kind of collation element values and sequences
417      * returned by the CollationElementIterator.
418      * Call CollationElementIterator::getMaxExpansion() instead.
419      *
420      * @param order a collation order returned by CollationElementIterator::previous
421      *              or CollationElementIterator::next.
422      * @return maximum size of the expansion sequences ending with the collation
423      *         element, or 1 if the collation element does not occur at the end of
424      *         any expansion sequence
425      * @see CollationElementIterator#getMaxExpansion
426      * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead.
427      */
428     int32_t getMaxExpansion(int32_t order) const;
429 #endif  /* U_HIDE_DEPRECATED_API */
430 
431     /**
432      * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
433      * method is to implement a simple version of RTTI, since not all C++
434      * compilers support genuine RTTI. Polymorphic operator==() and clone()
435      * methods call this method.
436      * @return The class ID for this object. All objects of a given class have
437      *         the same class ID. Objects of other classes have different class
438      *         IDs.
439      * @stable ICU 2.0
440      */
441     virtual UClassID getDynamicClassID(void) const;
442 
443     /**
444      * Returns the class ID for this class. This is useful only for comparing to
445      * a return value from getDynamicClassID(). For example:
446      * <pre>
447      * Base* polymorphic_pointer = createPolymorphicObject();
448      * if (polymorphic_pointer->getDynamicClassID() ==
449      *                                          Derived::getStaticClassID()) ...
450      * </pre>
451      * @return The class ID for all objects of this class.
452      * @stable ICU 2.0
453      */
454     static UClassID U_EXPORT2 getStaticClassID(void);
455 
456 #ifndef U_HIDE_DEPRECATED_API
457     /**
458      * Do not use this method: The caller and the ICU library might use different heaps.
459      * Use cloneBinary() instead which writes to caller-provided memory.
460      *
461      * Returns a binary format of this collator.
462      * @param length Returns the length of the data, in bytes
463      * @param status the error code status.
464      * @return memory, owned by the caller, of size 'length' bytes.
465      * @deprecated ICU 52. Use cloneBinary() instead.
466      */
467     uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const;
468 #endif  /* U_HIDE_DEPRECATED_API */
469 
470     /** Creates a binary image of a collator. This binary image can be stored and
471     *  later used to instantiate a collator using ucol_openBinary.
472     *  This API supports preflighting.
473     *  @param buffer a fill-in buffer to receive the binary image
474     *  @param capacity capacity of the destination buffer
475     *  @param status for catching errors
476     *  @return size of the image
477     *  @see ucol_openBinary
478     *  @stable ICU 3.4
479     */
480     int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const;
481 
482     /**
483      * Returns current rules. Delta defines whether full rules are returned or
484      * just the tailoring.
485      *
486      * getRules(void) should normally be used instead.
487      * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales
488      * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
489      * @param buffer UnicodeString to store the result rules
490      * @stable ICU 2.2
491      * @see UCOL_FULL_RULES
492      */
493     void getRules(UColRuleOption delta, UnicodeString &buffer) const;
494 
495     /**
496      * Universal attribute setter
497      * @param attr attribute type
498      * @param value attribute value
499      * @param status to indicate whether the operation went on smoothly or there were errors
500      * @stable ICU 2.2
501      */
502     virtual void setAttribute(UColAttribute attr, UColAttributeValue value,
503                               UErrorCode &status);
504 
505     /**
506      * Universal attribute getter.
507      * @param attr attribute type
508      * @param status to indicate whether the operation went on smoothly or there were errors
509      * @return attribute value
510      * @stable ICU 2.2
511      */
512     virtual UColAttributeValue getAttribute(UColAttribute attr,
513                                             UErrorCode &status) const;
514 
515     /**
516      * Sets the variable top to the top of the specified reordering group.
517      * The variable top determines the highest-sorting character
518      * which is affected by UCOL_ALTERNATE_HANDLING.
519      * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
520      * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
521      *              UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
522      *              or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
523      * @param errorCode Standard ICU error code. Its input value must
524      *                  pass the U_SUCCESS() test, or else the function returns
525      *                  immediately. Check for U_FAILURE() on output or use with
526      *                  function chaining. (See User Guide for details.)
527      * @return *this
528      * @see getMaxVariable
529      * @draft ICU 53
530      */
531     virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode);
532 
533     /**
534      * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
535      * @return the maximum variable reordering group.
536      * @see setMaxVariable
537      * @draft ICU 53
538      */
539     virtual UColReorderCode getMaxVariable() const;
540 
541     /**
542      * Sets the variable top to the primary weight of the specified string.
543      *
544      * Beginning with ICU 53, the variable top is pinned to
545      * the top of one of the supported reordering groups,
546      * and it must not be beyond the last of those groups.
547      * See setMaxVariable().
548      * @param varTop one or more (if contraction) UChars to which the variable top should be set
549      * @param len length of variable top string. If -1 it is considered to be zero terminated.
550      * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
551      *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
552      *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
553      *    the last reordering group supported by setMaxVariable()
554      * @return variable top primary weight
555      * @deprecated ICU 53 Call setMaxVariable() instead.
556      */
557     virtual uint32_t setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status);
558 
559     /**
560      * Sets the variable top to the primary weight of the specified string.
561      *
562      * Beginning with ICU 53, the variable top is pinned to
563      * the top of one of the supported reordering groups,
564      * and it must not be beyond the last of those groups.
565      * See setMaxVariable().
566      * @param varTop a UnicodeString size 1 or more (if contraction) of UChars to which the variable top should be set
567      * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
568      *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
569      *    U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
570      *    the last reordering group supported by setMaxVariable()
571      * @return variable top primary weight
572      * @deprecated ICU 53 Call setMaxVariable() instead.
573      */
574     virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status);
575 
576     /**
577      * Sets the variable top to the specified primary weight.
578      *
579      * Beginning with ICU 53, the variable top is pinned to
580      * the top of one of the supported reordering groups,
581      * and it must not be beyond the last of those groups.
582      * See setMaxVariable().
583      * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop
584      * @param status error code
585      * @deprecated ICU 53 Call setMaxVariable() instead.
586      */
587     virtual void setVariableTop(uint32_t varTop, UErrorCode &status);
588 
589     /**
590      * Gets the variable top value of a Collator.
591      * @param status error code (not changed by function). If error code is set, the return value is undefined.
592      * @return the variable top primary weight
593      * @see getMaxVariable
594      * @stable ICU 2.0
595      */
596     virtual uint32_t getVariableTop(UErrorCode &status) const;
597 
598     /**
599      * Get a UnicodeSet that contains all the characters and sequences tailored in
600      * this collator.
601      * @param status      error code of the operation
602      * @return a pointer to a UnicodeSet object containing all the
603      *         code points and sequences that may sort differently than
604      *         in the root collator. The object must be disposed of by using delete
605      * @stable ICU 2.4
606      */
607     virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
608 
609     /**
610      * Get the sort key as an array of bytes from a UnicodeString.
611      * @param source string to be processed.
612      * @param result buffer to store result in. If NULL, number of bytes needed
613      *        will be returned.
614      * @param resultLength length of the result buffer. If it is not large enough, the
615      *        buffer will be filled to capacity.
616      * @return Number of bytes needed for storing the sort key
617      * @stable ICU 2.0
618      */
619     virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result,
620                                int32_t resultLength) const;
621 
622     /**
623      * Get the sort key as an array of bytes from a UChar buffer.
624      * @param source string to be processed.
625      * @param sourceLength length of string to be processed. If -1, the string
626      *        is 0 terminated and length will be decided by the function.
627      * @param result buffer to store result in. If NULL, number of bytes needed
628      *        will be returned.
629      * @param resultLength length of the result buffer. If it is not large enough, the
630      *        buffer will be filled to capacity.
631      * @return Number of bytes needed for storing the sort key
632      * @stable ICU 2.2
633      */
634     virtual int32_t getSortKey(const UChar *source, int32_t sourceLength,
635                                uint8_t *result, int32_t resultLength) const;
636 
637     /**
638      * Retrieves the reordering codes for this collator.
639      * @param dest The array to fill with the script ordering.
640      * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function
641      *  will only return the length of the result without writing any of the result string (pre-flighting).
642      * @param status A reference to an error code value, which must not indicate
643      * a failure before the function call.
644      * @return The length of the script ordering array.
645      * @see ucol_setReorderCodes
646      * @see Collator#getEquivalentReorderCodes
647      * @see Collator#setReorderCodes
648      * @stable ICU 4.8
649      */
650      virtual int32_t getReorderCodes(int32_t *dest,
651                                      int32_t destCapacity,
652                                      UErrorCode& status) const;
653 
654     /**
655      * Sets the ordering of scripts for this collator.
656      * @param reorderCodes An array of script codes in the new order. This can be NULL if the
657      * length is also set to 0. An empty array will clear any reordering codes on the collator.
658      * @param reorderCodesLength The length of reorderCodes.
659      * @param status error code
660      * @see Collator#getReorderCodes
661      * @see Collator#getEquivalentReorderCodes
662      * @stable ICU 4.8
663      */
664      virtual void setReorderCodes(const int32_t* reorderCodes,
665                                   int32_t reorderCodesLength,
666                                   UErrorCode& status) ;
667 
668     /**
669      * Implements ucol_strcollUTF8().
670      * @internal
671      */
672     virtual UCollationResult internalCompareUTF8(
673             const char *left, int32_t leftLength,
674             const char *right, int32_t rightLength,
675             UErrorCode &errorCode) const;
676 
677     /** Get the short definition string for a collator. This internal API harvests the collator's
678      *  locale and the attribute set and produces a string that can be used for opening
679      *  a collator with the same attributes using the ucol_openFromShortString API.
680      *  This string will be normalized.
681      *  The structure and the syntax of the string is defined in the "Naming collators"
682      *  section of the users guide:
683      *  http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme
684      *  This function supports preflighting.
685      *
686      *  This is internal, and intended to be used with delegate converters.
687      *
688      *  @param locale a locale that will appear as a collators locale in the resulting
689      *                short string definition. If NULL, the locale will be harvested
690      *                from the collator.
691      *  @param buffer space to hold the resulting string
692      *  @param capacity capacity of the buffer
693      *  @param status for returning errors. All the preflighting errors are featured
694      *  @return length of the resulting string
695      *  @see ucol_openFromShortString
696      *  @see ucol_normalizeShortDefinitionString
697      *  @see ucol_getShortDefinitionString
698      *  @internal
699      */
700     virtual int32_t internalGetShortDefinitionString(const char *locale,
701                                                      char *buffer,
702                                                      int32_t capacity,
703                                                      UErrorCode &status) const;
704 
705     /**
706      * Implements ucol_nextSortKeyPart().
707      * @internal
708      */
709     virtual int32_t internalNextSortKeyPart(
710             UCharIterator *iter, uint32_t state[2],
711             uint8_t *dest, int32_t count, UErrorCode &errorCode) const;
712 
713 #ifndef U_HIDE_INTERNAL_API
714     /**
715      * Only for use in ucol_openRules().
716      * @internal
717      */
718     RuleBasedCollator();
719 
720     /**
721      * Implements ucol_getLocaleByType().
722      * Needed because the lifetime of the locale ID string must match that of the collator.
723      * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper.
724      * @internal
725      */
726     const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const;
727 
728     /**
729      * Implements ucol_getContractionsAndExpansions().
730      * Gets this collator's sets of contraction strings and/or
731      * characters and strings that map to multiple collation elements (expansions).
732      * If addPrefixes is TRUE, then contractions that are expressed as
733      * prefix/pre-context rules are included.
734      * @param contractions if not NULL, the set to hold the contractions
735      * @param expansions if not NULL, the set to hold the expansions
736      * @param addPrefixes include prefix contextual mappings
737      * @param errorCode in/out ICU error code
738      * @internal
739      */
740     void internalGetContractionsAndExpansions(
741             UnicodeSet *contractions, UnicodeSet *expansions,
742             UBool addPrefixes, UErrorCode &errorCode) const;
743 
744     /**
745      * Adds the contractions that start with character c to the set.
746      * Ignores prefixes. Used by AlphabeticIndex.
747      * @internal
748      */
749     void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const;
750 
751     /**
752      * Implements from-rule constructors, and ucol_openRules().
753      * @internal
754      */
755     void internalBuildTailoring(
756             const UnicodeString &rules,
757             int32_t strength,
758             UColAttributeValue decompositionMode,
759             UParseError *outParseError, UnicodeString *outReason,
760             UErrorCode &errorCode);
761 
762     /** @internal */
rbcFromUCollator(UCollator * uc)763     static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) {
764         return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc));
765     }
766     /** @internal */
rbcFromUCollator(const UCollator * uc)767     static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) {
768         return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc));
769     }
770 
771     /**
772      * Appends the CEs for the string to the vector.
773      * @internal for tests & tools
774      */
775     void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const;
776 #endif  // U_HIDE_INTERNAL_API
777 
778 protected:
779    /**
780     * Used internally by registration to define the requested and valid locales.
781     * @param requestedLocale the requested locale
782     * @param validLocale the valid locale
783     * @param actualLocale the actual locale
784     * @internal
785     */
786     virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale);
787 
788 private:
789     friend class CollationElementIterator;
790     friend class Collator;
791 
792     RuleBasedCollator(const CollationTailoring *t, const Locale &vl);
793 
794     /**
795      * Enumeration of attributes that are relevant for short definition strings
796      * (e.g., ucol_getShortDefinitionString()).
797      * Effectively extends UColAttribute.
798      */
799     enum Attributes {
800         ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT,
801         ATTR_LIMIT
802     };
803 
804     void adoptTailoring(CollationTailoring *t);
805 
806     // Both lengths must be <0 or else both must be >=0.
807     UCollationResult doCompare(const UChar *left, int32_t leftLength,
808                                const UChar *right, int32_t rightLength,
809                                UErrorCode &errorCode) const;
810     UCollationResult doCompare(const uint8_t *left, int32_t leftLength,
811                                const uint8_t *right, int32_t rightLength,
812                                UErrorCode &errorCode) const;
813 
814     void writeSortKey(const UChar *s, int32_t length,
815                       SortKeyByteSink &sink, UErrorCode &errorCode) const;
816 
817     void writeIdenticalLevel(const UChar *s, const UChar *limit,
818                              SortKeyByteSink &sink, UErrorCode &errorCode) const;
819 
820     const CollationSettings &getDefaultSettings() const;
821 
setAttributeDefault(int32_t attribute)822     void setAttributeDefault(int32_t attribute) {
823         explicitlySetAttributes &= ~((uint32_t)1 << attribute);
824     }
setAttributeExplicitly(int32_t attribute)825     void setAttributeExplicitly(int32_t attribute) {
826         explicitlySetAttributes |= (uint32_t)1 << attribute;
827     }
attributeHasBeenSetExplicitly(int32_t attribute)828     UBool attributeHasBeenSetExplicitly(int32_t attribute) const {
829         // assert(0 <= attribute < ATTR_LIMIT);
830         return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0);
831     }
832 
833     /**
834      * Tests whether a character is "unsafe" for use as a collation starting point.
835      *
836      * @param c code point or code unit
837      * @return TRUE if c is unsafe
838      * @see CollationElementIterator#setOffset(int)
839      */
840     UBool isUnsafe(UChar32 c) const;
841 
842     static void computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode);
843     UBool initMaxExpansions(UErrorCode &errorCode) const;
844 
845     void setFastLatinOptions(CollationSettings &ownedSettings) const;
846 
847     const CollationData *data;
848     const CollationSettings *settings;  // reference-counted
849     const CollationTailoring *tailoring;  // reference-counted
850     Locale validLocale;
851     uint32_t explicitlySetAttributes;
852 
853     UBool actualLocaleIsSameAsValid;
854 };
855 
856 U_NAMESPACE_END
857 
858 #endif  // !UCONFIG_NO_COLLATION
859 #endif  // TBLCOLL_H
860