1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 */ 7 8 /** 9 * \file 10 * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class. 11 */ 12 13 /** 14 * File tblcoll.h 15 * 16 * Created by: Helena Shih 17 * 18 * Modification History: 19 * 20 * Date Name Description 21 * 2/5/97 aliu Added streamIn and streamOut methods. Added 22 * constructor which reads RuleBasedCollator object from 23 * a binary file. Added writeToFile method which streams 24 * RuleBasedCollator out to a binary file. The streamIn 25 * and streamOut methods use istream and ostream objects 26 * in binary mode. 27 * 2/12/97 aliu Modified to use TableCollationData sub-object to 28 * hold invariant data. 29 * 2/13/97 aliu Moved several methods into this class from Collation. 30 * Added a private RuleBasedCollator(Locale&) constructor, 31 * to be used by Collator::createDefault(). General 32 * clean up. 33 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 34 * constructor and getDynamicClassID. 35 * 3/5/97 aliu Modified constructFromFile() to add parameter 36 * specifying whether or not binary loading is to be 37 * attempted. This is required for dynamic rule loading. 38 * 05/07/97 helena Added memory allocation error detection. 39 * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to 40 * use MergeCollation::getPattern. 41 * 6/20/97 helena Java class name change. 42 * 8/18/97 helena Added internal API documentation. 43 * 09/03/97 helena Added createCollationKeyValues(). 44 * 02/10/98 damiba Added compare with "length" parameter 45 * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java 46 * 04/23/99 stephen Removed EDecompositionMode, merged with 47 * Normalizer::EMode 48 * 06/14/99 stephen Removed kResourceBundleSuffix 49 * 11/02/99 helena Collator performance enhancements. Eliminates the 50 * UnicodeString construction and special case for NO_OP. 51 * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator 52 * internal state management. 53 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 54 * to implementation file. 55 * 01/29/01 synwee Modified into a C++ wrapper which calls C API 56 * (ucol.h) 57 * 2012-2014 markus Rewritten in C++ again. 58 */ 59 60 #ifndef TBLCOLL_H 61 #define TBLCOLL_H 62 63 #include "unicode/utypes.h" 64 65 #if !UCONFIG_NO_COLLATION 66 67 #include "unicode/coll.h" 68 #include "unicode/locid.h" 69 #include "unicode/uiter.h" 70 #include "unicode/ucol.h" 71 72 U_NAMESPACE_BEGIN 73 74 struct CollationData; 75 struct CollationSettings; 76 struct CollationTailoring; 77 /** 78 * @stable ICU 2.0 79 */ 80 class StringSearch; 81 /** 82 * @stable ICU 2.0 83 */ 84 class CollationElementIterator; 85 class CollationKey; 86 class SortKeyByteSink; 87 class UnicodeSet; 88 class UnicodeString; 89 class UVector64; 90 91 /** 92 * The RuleBasedCollator class provides the implementation of 93 * Collator, using data-driven tables. The user can create a customized 94 * table-based collation. 95 * <p> 96 * For more information about the collation service see 97 * <a href="http://userguide.icu-project.org/collation">the User Guide</a>. 98 * <p> 99 * Collation service provides correct sorting orders for most locales supported in ICU. 100 * If specific data for a locale is not available, the orders eventually falls back 101 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>. 102 * <p> 103 * Sort ordering may be customized by providing your own set of rules. For more on 104 * this subject see the <a href="http://userguide.icu-project.org/collation/customization"> 105 * Collation Customization</a> section of the User Guide. 106 * <p> 107 * Note, RuleBasedCollator is not to be subclassed. 108 * @see Collator 109 */ 110 class U_I18N_API RuleBasedCollator : public Collator { 111 public: 112 /** 113 * RuleBasedCollator constructor. This takes the table rules and builds a 114 * collation table out of them. Please see RuleBasedCollator class 115 * description for more details on the collation rule syntax. 116 * @param rules the collation rules to build the collation table from. 117 * @param status reporting a success or an error. 118 * @stable ICU 2.0 119 */ 120 RuleBasedCollator(const UnicodeString& rules, UErrorCode& status); 121 122 /** 123 * RuleBasedCollator constructor. This takes the table rules and builds a 124 * collation table out of them. Please see RuleBasedCollator class 125 * description for more details on the collation rule syntax. 126 * @param rules the collation rules to build the collation table from. 127 * @param collationStrength strength for comparison 128 * @param status reporting a success or an error. 129 * @stable ICU 2.0 130 */ 131 RuleBasedCollator(const UnicodeString& rules, 132 ECollationStrength collationStrength, 133 UErrorCode& status); 134 135 /** 136 * RuleBasedCollator constructor. This takes the table rules and builds a 137 * collation table out of them. Please see RuleBasedCollator class 138 * description for more details on the collation rule syntax. 139 * @param rules the collation rules to build the collation table from. 140 * @param decompositionMode the normalisation mode 141 * @param status reporting a success or an error. 142 * @stable ICU 2.0 143 */ 144 RuleBasedCollator(const UnicodeString& rules, 145 UColAttributeValue decompositionMode, 146 UErrorCode& status); 147 148 /** 149 * RuleBasedCollator constructor. This takes the table rules and builds a 150 * collation table out of them. Please see RuleBasedCollator class 151 * description for more details on the collation rule syntax. 152 * @param rules the collation rules to build the collation table from. 153 * @param collationStrength strength for comparison 154 * @param decompositionMode the normalisation mode 155 * @param status reporting a success or an error. 156 * @stable ICU 2.0 157 */ 158 RuleBasedCollator(const UnicodeString& rules, 159 ECollationStrength collationStrength, 160 UColAttributeValue decompositionMode, 161 UErrorCode& status); 162 163 #ifndef U_HIDE_INTERNAL_API 164 /** 165 * TODO: document & propose as public API 166 * @internal 167 */ 168 RuleBasedCollator(const UnicodeString &rules, 169 UParseError &parseError, UnicodeString &reason, 170 UErrorCode &errorCode); 171 #endif /* U_HIDE_INTERNAL_API */ 172 173 /** 174 * Copy constructor. 175 * @param other the RuleBasedCollator object to be copied 176 * @stable ICU 2.0 177 */ 178 RuleBasedCollator(const RuleBasedCollator& other); 179 180 181 /** Opens a collator from a collator binary image created using 182 * cloneBinary. Binary image used in instantiation of the 183 * collator remains owned by the user and should stay around for 184 * the lifetime of the collator. The API also takes a base collator 185 * which usually should be the root collator. 186 * @param bin binary image owned by the user and required through the 187 * lifetime of the collator 188 * @param length size of the image. If negative, the API will try to 189 * figure out the length of the image 190 * @param base fallback collator, usually root. The base is required to be 191 * present through the lifetime of the collator. Currently 192 * it cannot be NULL. 193 * @param status for catching errors 194 * @return newly created collator 195 * @see cloneBinary 196 * @stable ICU 3.4 197 */ 198 RuleBasedCollator(const uint8_t *bin, int32_t length, 199 const RuleBasedCollator *base, 200 UErrorCode &status); 201 202 /** 203 * Destructor. 204 * @stable ICU 2.0 205 */ 206 virtual ~RuleBasedCollator(); 207 208 /** 209 * Assignment operator. 210 * @param other other RuleBasedCollator object to copy from. 211 * @stable ICU 2.0 212 */ 213 RuleBasedCollator& operator=(const RuleBasedCollator& other); 214 215 /** 216 * Returns true if argument is the same as this object. 217 * @param other Collator object to be compared. 218 * @return true if arguments is the same as this object. 219 * @stable ICU 2.0 220 */ 221 virtual UBool operator==(const Collator& other) const; 222 223 /** 224 * Makes a copy of this object. 225 * @return a copy of this object, owned by the caller 226 * @stable ICU 2.0 227 */ 228 virtual Collator* clone(void) const; 229 230 /** 231 * Creates a collation element iterator for the source string. The caller of 232 * this method is responsible for the memory management of the return 233 * pointer. 234 * @param source the string over which the CollationElementIterator will 235 * iterate. 236 * @return the collation element iterator of the source string using this as 237 * the based Collator. 238 * @stable ICU 2.2 239 */ 240 virtual CollationElementIterator* createCollationElementIterator( 241 const UnicodeString& source) const; 242 243 /** 244 * Creates a collation element iterator for the source. The caller of this 245 * method is responsible for the memory management of the returned pointer. 246 * @param source the CharacterIterator which produces the characters over 247 * which the CollationElementItgerator will iterate. 248 * @return the collation element iterator of the source using this as the 249 * based Collator. 250 * @stable ICU 2.2 251 */ 252 virtual CollationElementIterator* createCollationElementIterator( 253 const CharacterIterator& source) const; 254 255 // Make deprecated versions of Collator::compare() visible. 256 using Collator::compare; 257 258 /** 259 * The comparison function compares the character data stored in two 260 * different strings. Returns information about whether a string is less 261 * than, greater than or equal to another string. 262 * @param source the source string to be compared with. 263 * @param target the string that is to be compared with the source string. 264 * @param status possible error code 265 * @return Returns an enum value. UCOL_GREATER if source is greater 266 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 267 * than target 268 * @stable ICU 2.6 269 **/ 270 virtual UCollationResult compare(const UnicodeString& source, 271 const UnicodeString& target, 272 UErrorCode &status) const; 273 274 /** 275 * Does the same thing as compare but limits the comparison to a specified 276 * length 277 * @param source the source string to be compared with. 278 * @param target the string that is to be compared with the source string. 279 * @param length the length the comparison is limited to 280 * @param status possible error code 281 * @return Returns an enum value. UCOL_GREATER if source (up to the specified 282 * length) is greater than target; UCOL_EQUAL if source (up to specified 283 * length) is equal to target; UCOL_LESS if source (up to the specified 284 * length) is less than target. 285 * @stable ICU 2.6 286 */ 287 virtual UCollationResult compare(const UnicodeString& source, 288 const UnicodeString& target, 289 int32_t length, 290 UErrorCode &status) const; 291 292 /** 293 * The comparison function compares the character data stored in two 294 * different string arrays. Returns information about whether a string array 295 * is less than, greater than or equal to another string array. 296 * @param source the source string array to be compared with. 297 * @param sourceLength the length of the source string array. If this value 298 * is equal to -1, the string array is null-terminated. 299 * @param target the string that is to be compared with the source string. 300 * @param targetLength the length of the target string array. If this value 301 * is equal to -1, the string array is null-terminated. 302 * @param status possible error code 303 * @return Returns an enum value. UCOL_GREATER if source is greater 304 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 305 * than target 306 * @stable ICU 2.6 307 */ 308 virtual UCollationResult compare(const UChar* source, int32_t sourceLength, 309 const UChar* target, int32_t targetLength, 310 UErrorCode &status) const; 311 312 /** 313 * Compares two strings using the Collator. 314 * Returns whether the first one compares less than/equal to/greater than 315 * the second one. 316 * This version takes UCharIterator input. 317 * @param sIter the first ("source") string iterator 318 * @param tIter the second ("target") string iterator 319 * @param status ICU status 320 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 321 * @stable ICU 4.2 322 */ 323 virtual UCollationResult compare(UCharIterator &sIter, 324 UCharIterator &tIter, 325 UErrorCode &status) const; 326 327 /** 328 * Compares two UTF-8 strings using the Collator. 329 * Returns whether the first one compares less than/equal to/greater than 330 * the second one. 331 * This version takes UTF-8 input. 332 * Note that a StringPiece can be implicitly constructed 333 * from a std::string or a NUL-terminated const char * string. 334 * @param source the first UTF-8 string 335 * @param target the second UTF-8 string 336 * @param status ICU status 337 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 338 * @stable ICU 51 339 */ 340 virtual UCollationResult compareUTF8(const StringPiece &source, 341 const StringPiece &target, 342 UErrorCode &status) const; 343 344 /** 345 * Transforms a specified region of the string into a series of characters 346 * that can be compared with CollationKey.compare. Use a CollationKey when 347 * you need to do repeated comparisions on the same string. For a single 348 * comparison the compare method will be faster. 349 * @param source the source string. 350 * @param key the transformed key of the source string. 351 * @param status the error code status. 352 * @return the transformed key. 353 * @see CollationKey 354 * @stable ICU 2.0 355 */ 356 virtual CollationKey& getCollationKey(const UnicodeString& source, 357 CollationKey& key, 358 UErrorCode& status) const; 359 360 /** 361 * Transforms a specified region of the string into a series of characters 362 * that can be compared with CollationKey.compare. Use a CollationKey when 363 * you need to do repeated comparisions on the same string. For a single 364 * comparison the compare method will be faster. 365 * @param source the source string. 366 * @param sourceLength the length of the source string. 367 * @param key the transformed key of the source string. 368 * @param status the error code status. 369 * @return the transformed key. 370 * @see CollationKey 371 * @stable ICU 2.0 372 */ 373 virtual CollationKey& getCollationKey(const UChar *source, 374 int32_t sourceLength, 375 CollationKey& key, 376 UErrorCode& status) const; 377 378 /** 379 * Generates the hash code for the rule-based collation object. 380 * @return the hash code. 381 * @stable ICU 2.0 382 */ 383 virtual int32_t hashCode() const; 384 385 /** 386 * Gets the locale of the Collator 387 * @param type can be either requested, valid or actual locale. For more 388 * information see the definition of ULocDataLocaleType in 389 * uloc.h 390 * @param status the error code status. 391 * @return locale where the collation data lives. If the collator 392 * was instantiated from rules, locale is empty. 393 * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback 394 */ 395 virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const; 396 397 /** 398 * Gets the tailoring rules for this collator. 399 * @return the collation tailoring from which this collator was created 400 * @stable ICU 2.0 401 */ 402 const UnicodeString& getRules() const; 403 404 /** 405 * Gets the version information for a Collator. 406 * @param info the version # information, the result will be filled in 407 * @stable ICU 2.0 408 */ 409 virtual void getVersion(UVersionInfo info) const; 410 411 #ifndef U_HIDE_DEPRECATED_API 412 /** 413 * Returns the maximum length of any expansion sequences that end with the 414 * specified comparison order. 415 * 416 * This is specific to the kind of collation element values and sequences 417 * returned by the CollationElementIterator. 418 * Call CollationElementIterator::getMaxExpansion() instead. 419 * 420 * @param order a collation order returned by CollationElementIterator::previous 421 * or CollationElementIterator::next. 422 * @return maximum size of the expansion sequences ending with the collation 423 * element, or 1 if the collation element does not occur at the end of 424 * any expansion sequence 425 * @see CollationElementIterator#getMaxExpansion 426 * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead. 427 */ 428 int32_t getMaxExpansion(int32_t order) const; 429 #endif /* U_HIDE_DEPRECATED_API */ 430 431 /** 432 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This 433 * method is to implement a simple version of RTTI, since not all C++ 434 * compilers support genuine RTTI. Polymorphic operator==() and clone() 435 * methods call this method. 436 * @return The class ID for this object. All objects of a given class have 437 * the same class ID. Objects of other classes have different class 438 * IDs. 439 * @stable ICU 2.0 440 */ 441 virtual UClassID getDynamicClassID(void) const; 442 443 /** 444 * Returns the class ID for this class. This is useful only for comparing to 445 * a return value from getDynamicClassID(). For example: 446 * <pre> 447 * Base* polymorphic_pointer = createPolymorphicObject(); 448 * if (polymorphic_pointer->getDynamicClassID() == 449 * Derived::getStaticClassID()) ... 450 * </pre> 451 * @return The class ID for all objects of this class. 452 * @stable ICU 2.0 453 */ 454 static UClassID U_EXPORT2 getStaticClassID(void); 455 456 #ifndef U_HIDE_DEPRECATED_API 457 /** 458 * Do not use this method: The caller and the ICU library might use different heaps. 459 * Use cloneBinary() instead which writes to caller-provided memory. 460 * 461 * Returns a binary format of this collator. 462 * @param length Returns the length of the data, in bytes 463 * @param status the error code status. 464 * @return memory, owned by the caller, of size 'length' bytes. 465 * @deprecated ICU 52. Use cloneBinary() instead. 466 */ 467 uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const; 468 #endif /* U_HIDE_DEPRECATED_API */ 469 470 /** Creates a binary image of a collator. This binary image can be stored and 471 * later used to instantiate a collator using ucol_openBinary. 472 * This API supports preflighting. 473 * @param buffer a fill-in buffer to receive the binary image 474 * @param capacity capacity of the destination buffer 475 * @param status for catching errors 476 * @return size of the image 477 * @see ucol_openBinary 478 * @stable ICU 3.4 479 */ 480 int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const; 481 482 /** 483 * Returns current rules. Delta defines whether full rules are returned or 484 * just the tailoring. 485 * 486 * getRules(void) should normally be used instead. 487 * See http://userguide.icu-project.org/collation/customization#TOC-Building-on-Existing-Locales 488 * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES. 489 * @param buffer UnicodeString to store the result rules 490 * @stable ICU 2.2 491 * @see UCOL_FULL_RULES 492 */ 493 void getRules(UColRuleOption delta, UnicodeString &buffer) const; 494 495 /** 496 * Universal attribute setter 497 * @param attr attribute type 498 * @param value attribute value 499 * @param status to indicate whether the operation went on smoothly or there were errors 500 * @stable ICU 2.2 501 */ 502 virtual void setAttribute(UColAttribute attr, UColAttributeValue value, 503 UErrorCode &status); 504 505 /** 506 * Universal attribute getter. 507 * @param attr attribute type 508 * @param status to indicate whether the operation went on smoothly or there were errors 509 * @return attribute value 510 * @stable ICU 2.2 511 */ 512 virtual UColAttributeValue getAttribute(UColAttribute attr, 513 UErrorCode &status) const; 514 515 /** 516 * Sets the variable top to the top of the specified reordering group. 517 * The variable top determines the highest-sorting character 518 * which is affected by UCOL_ALTERNATE_HANDLING. 519 * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect. 520 * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION, 521 * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY; 522 * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group 523 * @param errorCode Standard ICU error code. Its input value must 524 * pass the U_SUCCESS() test, or else the function returns 525 * immediately. Check for U_FAILURE() on output or use with 526 * function chaining. (See User Guide for details.) 527 * @return *this 528 * @see getMaxVariable 529 * @draft ICU 53 530 */ 531 virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode); 532 533 /** 534 * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING. 535 * @return the maximum variable reordering group. 536 * @see setMaxVariable 537 * @draft ICU 53 538 */ 539 virtual UColReorderCode getMaxVariable() const; 540 541 /** 542 * Sets the variable top to the primary weight of the specified string. 543 * 544 * Beginning with ICU 53, the variable top is pinned to 545 * the top of one of the supported reordering groups, 546 * and it must not be beyond the last of those groups. 547 * See setMaxVariable(). 548 * @param varTop one or more (if contraction) UChars to which the variable top should be set 549 * @param len length of variable top string. If -1 it is considered to be zero terminated. 550 * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> 551 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 552 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 553 * the last reordering group supported by setMaxVariable() 554 * @return variable top primary weight 555 * @deprecated ICU 53 Call setMaxVariable() instead. 556 */ 557 virtual uint32_t setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status); 558 559 /** 560 * Sets the variable top to the primary weight of the specified string. 561 * 562 * Beginning with ICU 53, the variable top is pinned to 563 * the top of one of the supported reordering groups, 564 * and it must not be beyond the last of those groups. 565 * See setMaxVariable(). 566 * @param varTop a UnicodeString size 1 or more (if contraction) of UChars to which the variable top should be set 567 * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> 568 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 569 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 570 * the last reordering group supported by setMaxVariable() 571 * @return variable top primary weight 572 * @deprecated ICU 53 Call setMaxVariable() instead. 573 */ 574 virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status); 575 576 /** 577 * Sets the variable top to the specified primary weight. 578 * 579 * Beginning with ICU 53, the variable top is pinned to 580 * the top of one of the supported reordering groups, 581 * and it must not be beyond the last of those groups. 582 * See setMaxVariable(). 583 * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop 584 * @param status error code 585 * @deprecated ICU 53 Call setMaxVariable() instead. 586 */ 587 virtual void setVariableTop(uint32_t varTop, UErrorCode &status); 588 589 /** 590 * Gets the variable top value of a Collator. 591 * @param status error code (not changed by function). If error code is set, the return value is undefined. 592 * @return the variable top primary weight 593 * @see getMaxVariable 594 * @stable ICU 2.0 595 */ 596 virtual uint32_t getVariableTop(UErrorCode &status) const; 597 598 /** 599 * Get a UnicodeSet that contains all the characters and sequences tailored in 600 * this collator. 601 * @param status error code of the operation 602 * @return a pointer to a UnicodeSet object containing all the 603 * code points and sequences that may sort differently than 604 * in the root collator. The object must be disposed of by using delete 605 * @stable ICU 2.4 606 */ 607 virtual UnicodeSet *getTailoredSet(UErrorCode &status) const; 608 609 /** 610 * Get the sort key as an array of bytes from a UnicodeString. 611 * @param source string to be processed. 612 * @param result buffer to store result in. If NULL, number of bytes needed 613 * will be returned. 614 * @param resultLength length of the result buffer. If if not enough the 615 * buffer will be filled to capacity. 616 * @return Number of bytes needed for storing the sort key 617 * @stable ICU 2.0 618 */ 619 virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result, 620 int32_t resultLength) const; 621 622 /** 623 * Get the sort key as an array of bytes from a UChar buffer. 624 * @param source string to be processed. 625 * @param sourceLength length of string to be processed. If -1, the string 626 * is 0 terminated and length will be decided by the function. 627 * @param result buffer to store result in. If NULL, number of bytes needed 628 * will be returned. 629 * @param resultLength length of the result buffer. If if not enough the 630 * buffer will be filled to capacity. 631 * @return Number of bytes needed for storing the sort key 632 * @stable ICU 2.2 633 */ 634 virtual int32_t getSortKey(const UChar *source, int32_t sourceLength, 635 uint8_t *result, int32_t resultLength) const; 636 637 /** 638 * Retrieves the reordering codes for this collator. 639 * @param dest The array to fill with the script ordering. 640 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function 641 * will only return the length of the result without writing any of the result string (pre-flighting). 642 * @param status A reference to an error code value, which must not indicate 643 * a failure before the function call. 644 * @return The length of the script ordering array. 645 * @see ucol_setReorderCodes 646 * @see Collator#getEquivalentReorderCodes 647 * @see Collator#setReorderCodes 648 * @stable ICU 4.8 649 */ 650 virtual int32_t getReorderCodes(int32_t *dest, 651 int32_t destCapacity, 652 UErrorCode& status) const; 653 654 /** 655 * Sets the ordering of scripts for this collator. 656 * @param reorderCodes An array of script codes in the new order. This can be NULL if the 657 * length is also set to 0. An empty array will clear any reordering codes on the collator. 658 * @param reorderCodesLength The length of reorderCodes. 659 * @param status error code 660 * @see Collator#getReorderCodes 661 * @see Collator#getEquivalentReorderCodes 662 * @stable ICU 4.8 663 */ 664 virtual void setReorderCodes(const int32_t* reorderCodes, 665 int32_t reorderCodesLength, 666 UErrorCode& status) ; 667 668 /** 669 * Implements ucol_strcollUTF8(). 670 * @internal 671 */ 672 virtual UCollationResult internalCompareUTF8( 673 const char *left, int32_t leftLength, 674 const char *right, int32_t rightLength, 675 UErrorCode &errorCode) const; 676 677 /** Get the short definition string for a collator. This internal API harvests the collator's 678 * locale and the attribute set and produces a string that can be used for opening 679 * a collator with the same attributes using the ucol_openFromShortString API. 680 * This string will be normalized. 681 * The structure and the syntax of the string is defined in the "Naming collators" 682 * section of the users guide: 683 * http://userguide.icu-project.org/collation/concepts#TOC-Collator-naming-scheme 684 * This function supports preflighting. 685 * 686 * This is internal, and intended to be used with delegate converters. 687 * 688 * @param locale a locale that will appear as a collators locale in the resulting 689 * short string definition. If NULL, the locale will be harvested 690 * from the collator. 691 * @param buffer space to hold the resulting string 692 * @param capacity capacity of the buffer 693 * @param status for returning errors. All the preflighting errors are featured 694 * @return length of the resulting string 695 * @see ucol_openFromShortString 696 * @see ucol_normalizeShortDefinitionString 697 * @see ucol_getShortDefinitionString 698 * @internal 699 */ 700 virtual int32_t internalGetShortDefinitionString(const char *locale, 701 char *buffer, 702 int32_t capacity, 703 UErrorCode &status) const; 704 705 /** 706 * Implements ucol_nextSortKeyPart(). 707 * @internal 708 */ 709 virtual int32_t internalNextSortKeyPart( 710 UCharIterator *iter, uint32_t state[2], 711 uint8_t *dest, int32_t count, UErrorCode &errorCode) const; 712 713 #ifndef U_HIDE_INTERNAL_API 714 /** 715 * Only for use in ucol_openRules(). 716 * @internal 717 */ 718 RuleBasedCollator(); 719 720 /** 721 * Implements ucol_getLocaleByType(). 722 * Needed because the lifetime of the locale ID string must match that of the collator. 723 * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper. 724 * @internal 725 */ 726 const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const; 727 728 /** 729 * Implements ucol_getContractionsAndExpansions(). 730 * Gets this collator's sets of contraction strings and/or 731 * characters and strings that map to multiple collation elements (expansions). 732 * If addPrefixes is TRUE, then contractions that are expressed as 733 * prefix/pre-context rules are included. 734 * @param contractions if not NULL, the set to hold the contractions 735 * @param expansions if not NULL, the set to hold the expansions 736 * @param addPrefixes include prefix contextual mappings 737 * @param errorCode in/out ICU error code 738 * @internal 739 */ 740 void internalGetContractionsAndExpansions( 741 UnicodeSet *contractions, UnicodeSet *expansions, 742 UBool addPrefixes, UErrorCode &errorCode) const; 743 744 /** 745 * Adds the contractions that start with character c to the set. 746 * Ignores prefixes. Used by AlphabeticIndex. 747 * @internal 748 */ 749 void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const; 750 751 /** 752 * Implements from-rule constructors, and ucol_openRules(). 753 * @internal 754 */ 755 void internalBuildTailoring( 756 const UnicodeString &rules, 757 int32_t strength, 758 UColAttributeValue decompositionMode, 759 UParseError *outParseError, UnicodeString *outReason, 760 UErrorCode &errorCode); 761 762 /** @internal */ rbcFromUCollator(UCollator * uc)763 static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) { 764 return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc)); 765 } 766 /** @internal */ rbcFromUCollator(const UCollator * uc)767 static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) { 768 return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc)); 769 } 770 771 /** 772 * Appends the CEs for the string to the vector. 773 * @internal for tests & tools 774 */ 775 void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const; 776 #endif // U_HIDE_INTERNAL_API 777 778 protected: 779 /** 780 * Used internally by registration to define the requested and valid locales. 781 * @param requestedLocale the requested locale 782 * @param validLocale the valid locale 783 * @param actualLocale the actual locale 784 * @internal 785 */ 786 virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale); 787 788 private: 789 friend class CollationElementIterator; 790 friend class Collator; 791 792 RuleBasedCollator(const CollationTailoring *t, const Locale &vl); 793 794 /** 795 * Enumeration of attributes that are relevant for short definition strings 796 * (e.g., ucol_getShortDefinitionString()). 797 * Effectively extends UColAttribute. 798 */ 799 enum Attributes { 800 ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT, 801 ATTR_LIMIT 802 }; 803 804 void adoptTailoring(CollationTailoring *t); 805 806 // Both lengths must be <0 or else both must be >=0. 807 UCollationResult doCompare(const UChar *left, int32_t leftLength, 808 const UChar *right, int32_t rightLength, 809 UErrorCode &errorCode) const; 810 UCollationResult doCompare(const uint8_t *left, int32_t leftLength, 811 const uint8_t *right, int32_t rightLength, 812 UErrorCode &errorCode) const; 813 814 void writeSortKey(const UChar *s, int32_t length, 815 SortKeyByteSink &sink, UErrorCode &errorCode) const; 816 817 void writeIdenticalLevel(const UChar *s, const UChar *limit, 818 SortKeyByteSink &sink, UErrorCode &errorCode) const; 819 820 const CollationSettings &getDefaultSettings() const; 821 setAttributeDefault(int32_t attribute)822 void setAttributeDefault(int32_t attribute) { 823 explicitlySetAttributes &= ~((uint32_t)1 << attribute); 824 } setAttributeExplicitly(int32_t attribute)825 void setAttributeExplicitly(int32_t attribute) { 826 explicitlySetAttributes |= (uint32_t)1 << attribute; 827 } attributeHasBeenSetExplicitly(int32_t attribute)828 UBool attributeHasBeenSetExplicitly(int32_t attribute) const { 829 // assert(0 <= attribute < ATTR_LIMIT); 830 return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0); 831 } 832 833 /** 834 * Tests whether a character is "unsafe" for use as a collation starting point. 835 * 836 * @param c code point or code unit 837 * @return TRUE if c is unsafe 838 * @see CollationElementIterator#setOffset(int) 839 */ 840 UBool isUnsafe(UChar32 c) const; 841 842 static void computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode); 843 UBool initMaxExpansions(UErrorCode &errorCode) const; 844 845 void setFastLatinOptions(CollationSettings &ownedSettings) const; 846 847 const CollationData *data; 848 const CollationSettings *settings; // reference-counted 849 const CollationTailoring *tailoring; // reference-counted 850 Locale validLocale; 851 uint32_t explicitlySetAttributes; 852 853 UBool actualLocaleIsSameAsValid; 854 }; 855 856 U_NAMESPACE_END 857 858 #endif // !UCONFIG_NO_COLLATION 859 #endif // TBLCOLL_H 860