1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ****************************************************************************** 8 */ 9 10 /** 11 * \file 12 * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class. 13 */ 14 15 /** 16 * File tblcoll.h 17 * 18 * Created by: Helena Shih 19 * 20 * Modification History: 21 * 22 * Date Name Description 23 * 2/5/97 aliu Added streamIn and streamOut methods. Added 24 * constructor which reads RuleBasedCollator object from 25 * a binary file. Added writeToFile method which streams 26 * RuleBasedCollator out to a binary file. The streamIn 27 * and streamOut methods use istream and ostream objects 28 * in binary mode. 29 * 2/12/97 aliu Modified to use TableCollationData sub-object to 30 * hold invariant data. 31 * 2/13/97 aliu Moved several methods into this class from Collation. 32 * Added a private RuleBasedCollator(Locale&) constructor, 33 * to be used by Collator::createDefault(). General 34 * clean up. 35 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 36 * constructor and getDynamicClassID. 37 * 3/5/97 aliu Modified constructFromFile() to add parameter 38 * specifying whether or not binary loading is to be 39 * attempted. This is required for dynamic rule loading. 40 * 05/07/97 helena Added memory allocation error detection. 41 * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to 42 * use MergeCollation::getPattern. 43 * 6/20/97 helena Java class name change. 44 * 8/18/97 helena Added internal API documentation. 45 * 09/03/97 helena Added createCollationKeyValues(). 46 * 02/10/98 damiba Added compare with "length" parameter 47 * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java 48 * 04/23/99 stephen Removed EDecompositionMode, merged with 49 * Normalizer::EMode 50 * 06/14/99 stephen Removed kResourceBundleSuffix 51 * 11/02/99 helena Collator performance enhancements. Eliminates the 52 * UnicodeString construction and special case for NO_OP. 53 * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator 54 * internal state management. 55 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 56 * to implementation file. 57 * 01/29/01 synwee Modified into a C++ wrapper which calls C API 58 * (ucol.h) 59 * 2012-2014 markus Rewritten in C++ again. 60 */ 61 62 #ifndef TBLCOLL_H 63 #define TBLCOLL_H 64 65 #include "unicode/utypes.h" 66 67 #if U_SHOW_CPLUSPLUS_API 68 69 #if !UCONFIG_NO_COLLATION 70 71 #include "unicode/coll.h" 72 #include "unicode/locid.h" 73 #include "unicode/uiter.h" 74 #include "unicode/ucol.h" 75 76 U_NAMESPACE_BEGIN 77 78 struct CollationCacheEntry; 79 struct CollationData; 80 struct CollationSettings; 81 struct CollationTailoring; 82 /** 83 * @stable ICU 2.0 84 */ 85 class StringSearch; 86 /** 87 * @stable ICU 2.0 88 */ 89 class CollationElementIterator; 90 class CollationKey; 91 class SortKeyByteSink; 92 class UnicodeSet; 93 class UnicodeString; 94 class UVector64; 95 96 /** 97 * The RuleBasedCollator class provides the implementation of 98 * Collator, using data-driven tables. The user can create a customized 99 * table-based collation. 100 * <p> 101 * For more information about the collation service see 102 * <a href="https://unicode-org.github.io/icu/userguide/collation">the User Guide</a>. 103 * <p> 104 * Collation service provides correct sorting orders for most locales supported in ICU. 105 * If specific data for a locale is not available, the orders eventually falls back 106 * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>. 107 * <p> 108 * Sort ordering may be customized by providing your own set of rules. For more on 109 * this subject see the <a href="https://unicode-org.github.io/icu/userguide/collation/customization"> 110 * Collation Customization</a> section of the User Guide. 111 * <p> 112 * Note, RuleBasedCollator is not to be subclassed. 113 * @see Collator 114 */ 115 class U_I18N_API RuleBasedCollator U_FINAL : public Collator { 116 public: 117 /** 118 * RuleBasedCollator constructor. This takes the table rules and builds a 119 * collation table out of them. Please see RuleBasedCollator class 120 * description for more details on the collation rule syntax. 121 * @param rules the collation rules to build the collation table from. 122 * @param status reporting a success or an error. 123 * @stable ICU 2.0 124 */ 125 RuleBasedCollator(const UnicodeString& rules, UErrorCode& status); 126 127 /** 128 * RuleBasedCollator constructor. This takes the table rules and builds a 129 * collation table out of them. Please see RuleBasedCollator class 130 * description for more details on the collation rule syntax. 131 * @param rules the collation rules to build the collation table from. 132 * @param collationStrength strength for comparison 133 * @param status reporting a success or an error. 134 * @stable ICU 2.0 135 */ 136 RuleBasedCollator(const UnicodeString& rules, 137 ECollationStrength collationStrength, 138 UErrorCode& status); 139 140 /** 141 * RuleBasedCollator constructor. This takes the table rules and builds a 142 * collation table out of them. Please see RuleBasedCollator class 143 * description for more details on the collation rule syntax. 144 * @param rules the collation rules to build the collation table from. 145 * @param decompositionMode the normalisation mode 146 * @param status reporting a success or an error. 147 * @stable ICU 2.0 148 */ 149 RuleBasedCollator(const UnicodeString& rules, 150 UColAttributeValue decompositionMode, 151 UErrorCode& status); 152 153 /** 154 * RuleBasedCollator constructor. This takes the table rules and builds a 155 * collation table out of them. Please see RuleBasedCollator class 156 * description for more details on the collation rule syntax. 157 * @param rules the collation rules to build the collation table from. 158 * @param collationStrength strength for comparison 159 * @param decompositionMode the normalisation mode 160 * @param status reporting a success or an error. 161 * @stable ICU 2.0 162 */ 163 RuleBasedCollator(const UnicodeString& rules, 164 ECollationStrength collationStrength, 165 UColAttributeValue decompositionMode, 166 UErrorCode& status); 167 168 #ifndef U_HIDE_INTERNAL_API 169 /** 170 * TODO: document & propose as public API 171 * @internal 172 */ 173 RuleBasedCollator(const UnicodeString &rules, 174 UParseError &parseError, UnicodeString &reason, 175 UErrorCode &errorCode); 176 #endif /* U_HIDE_INTERNAL_API */ 177 178 /** 179 * Copy constructor. 180 * @param other the RuleBasedCollator object to be copied 181 * @stable ICU 2.0 182 */ 183 RuleBasedCollator(const RuleBasedCollator& other); 184 185 186 /** Opens a collator from a collator binary image created using 187 * cloneBinary. Binary image used in instantiation of the 188 * collator remains owned by the user and should stay around for 189 * the lifetime of the collator. The API also takes a base collator 190 * which must be the root collator. 191 * @param bin binary image owned by the user and required through the 192 * lifetime of the collator 193 * @param length size of the image. If negative, the API will try to 194 * figure out the length of the image 195 * @param base Base collator, for lookup of untailored characters. 196 * Must be the root collator, must not be NULL. 197 * The base is required to be present through the lifetime of the collator. 198 * @param status for catching errors 199 * @return newly created collator 200 * @see cloneBinary 201 * @stable ICU 3.4 202 */ 203 RuleBasedCollator(const uint8_t *bin, int32_t length, 204 const RuleBasedCollator *base, 205 UErrorCode &status); 206 207 /** 208 * Destructor. 209 * @stable ICU 2.0 210 */ 211 virtual ~RuleBasedCollator(); 212 213 /** 214 * Assignment operator. 215 * @param other other RuleBasedCollator object to copy from. 216 * @stable ICU 2.0 217 */ 218 RuleBasedCollator& operator=(const RuleBasedCollator& other); 219 220 /** 221 * Returns true if argument is the same as this object. 222 * @param other Collator object to be compared. 223 * @return true if arguments is the same as this object. 224 * @stable ICU 2.0 225 */ 226 virtual bool operator==(const Collator& other) const override; 227 228 /** 229 * Makes a copy of this object. 230 * @return a copy of this object, owned by the caller 231 * @stable ICU 2.0 232 */ 233 virtual RuleBasedCollator* clone() const override; 234 235 /** 236 * Creates a collation element iterator for the source string. The caller of 237 * this method is responsible for the memory management of the return 238 * pointer. 239 * @param source the string over which the CollationElementIterator will 240 * iterate. 241 * @return the collation element iterator of the source string using this as 242 * the based Collator. 243 * @stable ICU 2.2 244 */ 245 virtual CollationElementIterator* createCollationElementIterator( 246 const UnicodeString& source) const; 247 248 /** 249 * Creates a collation element iterator for the source. The caller of this 250 * method is responsible for the memory management of the returned pointer. 251 * @param source the CharacterIterator which produces the characters over 252 * which the CollationElementItgerator will iterate. 253 * @return the collation element iterator of the source using this as the 254 * based Collator. 255 * @stable ICU 2.2 256 */ 257 virtual CollationElementIterator* createCollationElementIterator( 258 const CharacterIterator& source) const; 259 260 // Make deprecated versions of Collator::compare() visible. 261 using Collator::compare; 262 263 /** 264 * The comparison function compares the character data stored in two 265 * different strings. Returns information about whether a string is less 266 * than, greater than or equal to another string. 267 * @param source the source string to be compared with. 268 * @param target the string that is to be compared with the source string. 269 * @param status possible error code 270 * @return Returns an enum value. UCOL_GREATER if source is greater 271 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 272 * than target 273 * @stable ICU 2.6 274 **/ 275 virtual UCollationResult compare(const UnicodeString& source, 276 const UnicodeString& target, 277 UErrorCode &status) const override; 278 279 /** 280 * Does the same thing as compare but limits the comparison to a specified 281 * length 282 * @param source the source string to be compared with. 283 * @param target the string that is to be compared with the source string. 284 * @param length the length the comparison is limited to 285 * @param status possible error code 286 * @return Returns an enum value. UCOL_GREATER if source (up to the specified 287 * length) is greater than target; UCOL_EQUAL if source (up to specified 288 * length) is equal to target; UCOL_LESS if source (up to the specified 289 * length) is less than target. 290 * @stable ICU 2.6 291 */ 292 virtual UCollationResult compare(const UnicodeString& source, 293 const UnicodeString& target, 294 int32_t length, 295 UErrorCode &status) const override; 296 297 /** 298 * The comparison function compares the character data stored in two 299 * different string arrays. Returns information about whether a string array 300 * is less than, greater than or equal to another string array. 301 * @param source the source string array to be compared with. 302 * @param sourceLength the length of the source string array. If this value 303 * is equal to -1, the string array is null-terminated. 304 * @param target the string that is to be compared with the source string. 305 * @param targetLength the length of the target string array. If this value 306 * is equal to -1, the string array is null-terminated. 307 * @param status possible error code 308 * @return Returns an enum value. UCOL_GREATER if source is greater 309 * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less 310 * than target 311 * @stable ICU 2.6 312 */ 313 virtual UCollationResult compare(const char16_t* source, int32_t sourceLength, 314 const char16_t* target, int32_t targetLength, 315 UErrorCode &status) const override; 316 317 /** 318 * Compares two strings using the Collator. 319 * Returns whether the first one compares less than/equal to/greater than 320 * the second one. 321 * This version takes UCharIterator input. 322 * @param sIter the first ("source") string iterator 323 * @param tIter the second ("target") string iterator 324 * @param status ICU status 325 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 326 * @stable ICU 4.2 327 */ 328 virtual UCollationResult compare(UCharIterator &sIter, 329 UCharIterator &tIter, 330 UErrorCode &status) const override; 331 332 /** 333 * Compares two UTF-8 strings using the Collator. 334 * Returns whether the first one compares less than/equal to/greater than 335 * the second one. 336 * This version takes UTF-8 input. 337 * Note that a StringPiece can be implicitly constructed 338 * from a std::string or a NUL-terminated const char * string. 339 * @param source the first UTF-8 string 340 * @param target the second UTF-8 string 341 * @param status ICU status 342 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER 343 * @stable ICU 51 344 */ 345 virtual UCollationResult compareUTF8(const StringPiece &source, 346 const StringPiece &target, 347 UErrorCode &status) const override; 348 349 /** 350 * Transforms the string into a series of characters 351 * that can be compared with CollationKey.compare(). 352 * 353 * Note that sort keys are often less efficient than simply doing comparison. 354 * For more details, see the ICU User Guide. 355 * 356 * @param source the source string. 357 * @param key the transformed key of the source string. 358 * @param status the error code status. 359 * @return the transformed key. 360 * @see CollationKey 361 * @stable ICU 2.0 362 */ 363 virtual CollationKey& getCollationKey(const UnicodeString& source, 364 CollationKey& key, 365 UErrorCode& status) const override; 366 367 /** 368 * Transforms a specified region of the string into a series of characters 369 * that can be compared with CollationKey.compare. 370 * 371 * Note that sort keys are often less efficient than simply doing comparison. 372 * For more details, see the ICU User Guide. 373 * 374 * @param source the source string. 375 * @param sourceLength the length of the source string. 376 * @param key the transformed key of the source string. 377 * @param status the error code status. 378 * @return the transformed key. 379 * @see CollationKey 380 * @stable ICU 2.0 381 */ 382 virtual CollationKey& getCollationKey(const char16_t *source, 383 int32_t sourceLength, 384 CollationKey& key, 385 UErrorCode& status) const override; 386 387 /** 388 * Generates the hash code for the rule-based collation object. 389 * @return the hash code. 390 * @stable ICU 2.0 391 */ 392 virtual int32_t hashCode() const override; 393 394 #ifndef U_FORCE_HIDE_DEPRECATED_API 395 /** 396 * Gets the locale of the Collator 397 * @param type can be either requested, valid or actual locale. For more 398 * information see the definition of ULocDataLocaleType in 399 * uloc.h 400 * @param status the error code status. 401 * @return locale where the collation data lives. If the collator 402 * was instantiated from rules, locale is empty. 403 * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback 404 */ 405 virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const override; 406 #endif // U_FORCE_HIDE_DEPRECATED_API 407 408 /** 409 * Gets the tailoring rules for this collator. 410 * @return the collation tailoring from which this collator was created 411 * @stable ICU 2.0 412 */ 413 const UnicodeString& getRules() const; 414 415 /** 416 * Gets the version information for a Collator. 417 * @param info the version # information, the result will be filled in 418 * @stable ICU 2.0 419 */ 420 virtual void getVersion(UVersionInfo info) const override; 421 422 #ifndef U_HIDE_DEPRECATED_API 423 /** 424 * Returns the maximum length of any expansion sequences that end with the 425 * specified comparison order. 426 * 427 * This is specific to the kind of collation element values and sequences 428 * returned by the CollationElementIterator. 429 * Call CollationElementIterator::getMaxExpansion() instead. 430 * 431 * @param order a collation order returned by CollationElementIterator::previous 432 * or CollationElementIterator::next. 433 * @return maximum size of the expansion sequences ending with the collation 434 * element, or 1 if the collation element does not occur at the end of 435 * any expansion sequence 436 * @see CollationElementIterator#getMaxExpansion 437 * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead. 438 */ 439 int32_t getMaxExpansion(int32_t order) const; 440 #endif /* U_HIDE_DEPRECATED_API */ 441 442 /** 443 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This 444 * method is to implement a simple version of RTTI, since not all C++ 445 * compilers support genuine RTTI. Polymorphic operator==() and clone() 446 * methods call this method. 447 * @return The class ID for this object. All objects of a given class have 448 * the same class ID. Objects of other classes have different class 449 * IDs. 450 * @stable ICU 2.0 451 */ 452 virtual UClassID getDynamicClassID(void) const override; 453 454 /** 455 * Returns the class ID for this class. This is useful only for comparing to 456 * a return value from getDynamicClassID(). For example: 457 * <pre> 458 * Base* polymorphic_pointer = createPolymorphicObject(); 459 * if (polymorphic_pointer->getDynamicClassID() == 460 * Derived::getStaticClassID()) ... 461 * </pre> 462 * @return The class ID for all objects of this class. 463 * @stable ICU 2.0 464 */ 465 static UClassID U_EXPORT2 getStaticClassID(void); 466 467 #ifndef U_HIDE_DEPRECATED_API 468 /** 469 * Do not use this method: The caller and the ICU library might use different heaps. 470 * Use cloneBinary() instead which writes to caller-provided memory. 471 * 472 * Returns a binary format of this collator. 473 * @param length Returns the length of the data, in bytes 474 * @param status the error code status. 475 * @return memory, owned by the caller, of size 'length' bytes. 476 * @deprecated ICU 52. Use cloneBinary() instead. 477 */ 478 uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const; 479 #endif /* U_HIDE_DEPRECATED_API */ 480 481 /** Creates a binary image of a collator. This binary image can be stored and 482 * later used to instantiate a collator using ucol_openBinary. 483 * This API supports preflighting. 484 * @param buffer a fill-in buffer to receive the binary image 485 * @param capacity capacity of the destination buffer 486 * @param status for catching errors 487 * @return size of the image 488 * @see ucol_openBinary 489 * @stable ICU 3.4 490 */ 491 int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const; 492 493 /** 494 * Returns current rules. Delta defines whether full rules are returned or 495 * just the tailoring. 496 * 497 * getRules(void) should normally be used instead. 498 * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales 499 * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES. 500 * @param buffer UnicodeString to store the result rules 501 * @stable ICU 2.2 502 * @see UCOL_FULL_RULES 503 */ 504 void getRules(UColRuleOption delta, UnicodeString &buffer) const; 505 506 /** 507 * Universal attribute setter 508 * @param attr attribute type 509 * @param value attribute value 510 * @param status to indicate whether the operation went on smoothly or there were errors 511 * @stable ICU 2.2 512 */ 513 virtual void setAttribute(UColAttribute attr, UColAttributeValue value, 514 UErrorCode &status) override; 515 516 /** 517 * Universal attribute getter. 518 * @param attr attribute type 519 * @param status to indicate whether the operation went on smoothly or there were errors 520 * @return attribute value 521 * @stable ICU 2.2 522 */ 523 virtual UColAttributeValue getAttribute(UColAttribute attr, 524 UErrorCode &status) const override; 525 526 /** 527 * Sets the variable top to the top of the specified reordering group. 528 * The variable top determines the highest-sorting character 529 * which is affected by UCOL_ALTERNATE_HANDLING. 530 * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect. 531 * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION, 532 * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY; 533 * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group 534 * @param errorCode Standard ICU error code. Its input value must 535 * pass the U_SUCCESS() test, or else the function returns 536 * immediately. Check for U_FAILURE() on output or use with 537 * function chaining. (See User Guide for details.) 538 * @return *this 539 * @see getMaxVariable 540 * @stable ICU 53 541 */ 542 virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode) override; 543 544 /** 545 * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING. 546 * @return the maximum variable reordering group. 547 * @see setMaxVariable 548 * @stable ICU 53 549 */ 550 virtual UColReorderCode getMaxVariable() const override; 551 552 #ifndef U_FORCE_HIDE_DEPRECATED_API 553 /** 554 * Sets the variable top to the primary weight of the specified string. 555 * 556 * Beginning with ICU 53, the variable top is pinned to 557 * the top of one of the supported reordering groups, 558 * and it must not be beyond the last of those groups. 559 * See setMaxVariable(). 560 * @param varTop one or more (if contraction) char16_ts to which the variable top should be set 561 * @param len length of variable top string. If -1 it is considered to be zero terminated. 562 * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> 563 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 564 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 565 * the last reordering group supported by setMaxVariable() 566 * @return variable top primary weight 567 * @deprecated ICU 53 Call setMaxVariable() instead. 568 */ 569 virtual uint32_t setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &status) override; 570 571 /** 572 * Sets the variable top to the primary weight of the specified string. 573 * 574 * Beginning with ICU 53, the variable top is pinned to 575 * the top of one of the supported reordering groups, 576 * and it must not be beyond the last of those groups. 577 * See setMaxVariable(). 578 * @param varTop a UnicodeString size 1 or more (if contraction) of char16_ts to which the variable top should be set 579 * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br> 580 * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br> 581 * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond 582 * the last reordering group supported by setMaxVariable() 583 * @return variable top primary weight 584 * @deprecated ICU 53 Call setMaxVariable() instead. 585 */ 586 virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status) override; 587 588 /** 589 * Sets the variable top to the specified primary weight. 590 * 591 * Beginning with ICU 53, the variable top is pinned to 592 * the top of one of the supported reordering groups, 593 * and it must not be beyond the last of those groups. 594 * See setMaxVariable(). 595 * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop 596 * @param status error code 597 * @deprecated ICU 53 Call setMaxVariable() instead. 598 */ 599 virtual void setVariableTop(uint32_t varTop, UErrorCode &status) override; 600 #endif // U_FORCE_HIDE_DEPRECATED_API 601 602 /** 603 * Gets the variable top value of a Collator. 604 * @param status error code (not changed by function). If error code is set, the return value is undefined. 605 * @return the variable top primary weight 606 * @see getMaxVariable 607 * @stable ICU 2.0 608 */ 609 virtual uint32_t getVariableTop(UErrorCode &status) const override; 610 611 /** 612 * Get a UnicodeSet that contains all the characters and sequences tailored in 613 * this collator. 614 * @param status error code of the operation 615 * @return a pointer to a UnicodeSet object containing all the 616 * code points and sequences that may sort differently than 617 * in the root collator. The object must be disposed of by using delete 618 * @stable ICU 2.4 619 */ 620 virtual UnicodeSet *getTailoredSet(UErrorCode &status) const override; 621 622 /** 623 * Get the sort key as an array of bytes from a UnicodeString. 624 * 625 * Note that sort keys are often less efficient than simply doing comparison. 626 * For more details, see the ICU User Guide. 627 * 628 * @param source string to be processed. 629 * @param result buffer to store result in. If NULL, number of bytes needed 630 * will be returned. 631 * @param resultLength length of the result buffer. If if not enough the 632 * buffer will be filled to capacity. 633 * @return Number of bytes needed for storing the sort key 634 * @stable ICU 2.0 635 */ 636 virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result, 637 int32_t resultLength) const override; 638 639 /** 640 * Get the sort key as an array of bytes from a char16_t buffer. 641 * 642 * Note that sort keys are often less efficient than simply doing comparison. 643 * For more details, see the ICU User Guide. 644 * 645 * @param source string to be processed. 646 * @param sourceLength length of string to be processed. If -1, the string 647 * is 0 terminated and length will be decided by the function. 648 * @param result buffer to store result in. If NULL, number of bytes needed 649 * will be returned. 650 * @param resultLength length of the result buffer. If if not enough the 651 * buffer will be filled to capacity. 652 * @return Number of bytes needed for storing the sort key 653 * @stable ICU 2.2 654 */ 655 virtual int32_t getSortKey(const char16_t *source, int32_t sourceLength, 656 uint8_t *result, int32_t resultLength) const override; 657 658 /** 659 * Retrieves the reordering codes for this collator. 660 * @param dest The array to fill with the script ordering. 661 * @param destCapacity The length of dest. If it is 0, then dest may be NULL and the function 662 * will only return the length of the result without writing any codes (pre-flighting). 663 * @param status A reference to an error code value, which must not indicate 664 * a failure before the function call. 665 * @return The length of the script ordering array. 666 * @see ucol_setReorderCodes 667 * @see Collator#getEquivalentReorderCodes 668 * @see Collator#setReorderCodes 669 * @stable ICU 4.8 670 */ 671 virtual int32_t getReorderCodes(int32_t *dest, 672 int32_t destCapacity, 673 UErrorCode& status) const override; 674 675 /** 676 * Sets the ordering of scripts for this collator. 677 * @param reorderCodes An array of script codes in the new order. This can be NULL if the 678 * length is also set to 0. An empty array will clear any reordering codes on the collator. 679 * @param reorderCodesLength The length of reorderCodes. 680 * @param status error code 681 * @see ucol_setReorderCodes 682 * @see Collator#getReorderCodes 683 * @see Collator#getEquivalentReorderCodes 684 * @stable ICU 4.8 685 */ 686 virtual void setReorderCodes(const int32_t* reorderCodes, 687 int32_t reorderCodesLength, 688 UErrorCode& status) override; 689 690 /** 691 * Implements ucol_strcollUTF8(). 692 * @internal 693 */ 694 virtual UCollationResult internalCompareUTF8( 695 const char *left, int32_t leftLength, 696 const char *right, int32_t rightLength, 697 UErrorCode &errorCode) const override; 698 699 /** Get the short definition string for a collator. This internal API harvests the collator's 700 * locale and the attribute set and produces a string that can be used for opening 701 * a collator with the same attributes using the ucol_openFromShortString API. 702 * This string will be normalized. 703 * The structure and the syntax of the string is defined in the "Naming collators" 704 * section of the users guide: 705 * https://unicode-org.github.io/icu/userguide/collation/concepts#collator-naming-scheme 706 * This function supports preflighting. 707 * 708 * This is internal, and intended to be used with delegate converters. 709 * 710 * @param locale a locale that will appear as a collators locale in the resulting 711 * short string definition. If NULL, the locale will be harvested 712 * from the collator. 713 * @param buffer space to hold the resulting string 714 * @param capacity capacity of the buffer 715 * @param status for returning errors. All the preflighting errors are featured 716 * @return length of the resulting string 717 * @see ucol_openFromShortString 718 * @see ucol_normalizeShortDefinitionString 719 * @see ucol_getShortDefinitionString 720 * @internal 721 */ 722 virtual int32_t internalGetShortDefinitionString(const char *locale, 723 char *buffer, 724 int32_t capacity, 725 UErrorCode &status) const override; 726 727 /** 728 * Implements ucol_nextSortKeyPart(). 729 * @internal 730 */ 731 virtual int32_t internalNextSortKeyPart( 732 UCharIterator *iter, uint32_t state[2], 733 uint8_t *dest, int32_t count, UErrorCode &errorCode) const override; 734 735 // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API 736 /** 737 * Only for use in ucol_openRules(). 738 * @internal 739 */ 740 RuleBasedCollator(); 741 742 #ifndef U_HIDE_INTERNAL_API 743 /** 744 * Implements ucol_getLocaleByType(). 745 * Needed because the lifetime of the locale ID string must match that of the collator. 746 * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper. 747 * @internal 748 */ 749 const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const; 750 751 /** 752 * Implements ucol_getContractionsAndExpansions(). 753 * Gets this collator's sets of contraction strings and/or 754 * characters and strings that map to multiple collation elements (expansions). 755 * If addPrefixes is true, then contractions that are expressed as 756 * prefix/pre-context rules are included. 757 * @param contractions if not NULL, the set to hold the contractions 758 * @param expansions if not NULL, the set to hold the expansions 759 * @param addPrefixes include prefix contextual mappings 760 * @param errorCode in/out ICU error code 761 * @internal 762 */ 763 void internalGetContractionsAndExpansions( 764 UnicodeSet *contractions, UnicodeSet *expansions, 765 UBool addPrefixes, UErrorCode &errorCode) const; 766 767 /** 768 * Adds the contractions that start with character c to the set. 769 * Ignores prefixes. Used by AlphabeticIndex. 770 * @internal 771 */ 772 void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const; 773 774 /** 775 * Implements from-rule constructors, and ucol_openRules(). 776 * @internal 777 */ 778 void internalBuildTailoring( 779 const UnicodeString &rules, 780 int32_t strength, 781 UColAttributeValue decompositionMode, 782 UParseError *outParseError, UnicodeString *outReason, 783 UErrorCode &errorCode); 784 785 /** @internal */ rbcFromUCollator(UCollator * uc)786 static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) { 787 return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc)); 788 } 789 /** @internal */ rbcFromUCollator(const UCollator * uc)790 static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) { 791 return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc)); 792 } 793 794 /** 795 * Appends the CEs for the string to the vector. 796 * @internal for tests & tools 797 */ 798 void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const; 799 #endif // U_HIDE_INTERNAL_API 800 801 protected: 802 /** 803 * Used internally by registration to define the requested and valid locales. 804 * @param requestedLocale the requested locale 805 * @param validLocale the valid locale 806 * @param actualLocale the actual locale 807 * @internal 808 */ 809 virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) override; 810 811 private: 812 friend class CollationElementIterator; 813 friend class Collator; 814 815 RuleBasedCollator(const CollationCacheEntry *entry); 816 817 /** 818 * Enumeration of attributes that are relevant for short definition strings 819 * (e.g., ucol_getShortDefinitionString()). 820 * Effectively extends UColAttribute. 821 */ 822 enum Attributes { 823 ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT, 824 ATTR_LIMIT 825 }; 826 827 void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode); 828 829 // Both lengths must be <0 or else both must be >=0. 830 UCollationResult doCompare(const char16_t *left, int32_t leftLength, 831 const char16_t *right, int32_t rightLength, 832 UErrorCode &errorCode) const; 833 UCollationResult doCompare(const uint8_t *left, int32_t leftLength, 834 const uint8_t *right, int32_t rightLength, 835 UErrorCode &errorCode) const; 836 837 void writeSortKey(const char16_t *s, int32_t length, 838 SortKeyByteSink &sink, UErrorCode &errorCode) const; 839 840 void writeIdenticalLevel(const char16_t *s, const char16_t *limit, 841 SortKeyByteSink &sink, UErrorCode &errorCode) const; 842 843 const CollationSettings &getDefaultSettings() const; 844 setAttributeDefault(int32_t attribute)845 void setAttributeDefault(int32_t attribute) { 846 explicitlySetAttributes &= ~((uint32_t)1 << attribute); 847 } setAttributeExplicitly(int32_t attribute)848 void setAttributeExplicitly(int32_t attribute) { 849 explicitlySetAttributes |= (uint32_t)1 << attribute; 850 } attributeHasBeenSetExplicitly(int32_t attribute)851 UBool attributeHasBeenSetExplicitly(int32_t attribute) const { 852 // assert(0 <= attribute < ATTR_LIMIT); 853 return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0); 854 } 855 856 /** 857 * Tests whether a character is "unsafe" for use as a collation starting point. 858 * 859 * @param c code point or code unit 860 * @return true if c is unsafe 861 * @see CollationElementIterator#setOffset(int) 862 */ 863 UBool isUnsafe(UChar32 c) const; 864 865 static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode); 866 UBool initMaxExpansions(UErrorCode &errorCode) const; 867 868 void setFastLatinOptions(CollationSettings &ownedSettings) const; 869 870 const CollationData *data; 871 const CollationSettings *settings; // reference-counted 872 const CollationTailoring *tailoring; // alias of cacheEntry->tailoring 873 const CollationCacheEntry *cacheEntry; // reference-counted 874 Locale validLocale; 875 uint32_t explicitlySetAttributes; 876 877 UBool actualLocaleIsSameAsValid; 878 }; 879 880 U_NAMESPACE_END 881 882 #endif // !UCONFIG_NO_COLLATION 883 884 #endif /* U_SHOW_CPLUSPLUS_API */ 885 886 #endif // TBLCOLL_H 887