1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2001-2014 IBM and others. All rights reserved. 6 ********************************************************************** 7 * Date Name Description 8 * 03/22/2000 helena Creation. 9 ********************************************************************** 10 */ 11 12 #ifndef STSEARCH_H 13 #define STSEARCH_H 14 15 #include "unicode/utypes.h" 16 17 #if U_SHOW_CPLUSPLUS_API 18 19 /** 20 * \file 21 * \brief C++ API: Service for searching text based on RuleBasedCollator. 22 */ 23 24 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 25 26 #include "unicode/tblcoll.h" 27 #include "unicode/coleitr.h" 28 #include "unicode/search.h" 29 30 U_NAMESPACE_BEGIN 31 32 /** 33 * 34 * <tt>StringSearch</tt> is a <tt>SearchIterator</tt> that provides 35 * language-sensitive text searching based on the comparison rules defined 36 * in a {@link RuleBasedCollator} object. 37 * StringSearch ensures that language eccentricity can be 38 * handled, e.g. for the German collator, characters ß and SS will be matched 39 * if case is chosen to be ignored. 40 * See the <a href="https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm"> 41 * "ICU Collation Design Document"</a> for more information. 42 * <p> 43 * There are 2 match options for selection:<br> 44 * Let S' be the sub-string of a text string S between the offsets start and 45 * end [start, end]. 46 * <br> 47 * A pattern string P matches a text string S at the offsets [start, end] 48 * if 49 * <pre> 50 * option 1. Some canonical equivalent of P matches some canonical equivalent 51 * of S' 52 * option 2. P matches S' and if P starts or ends with a combining mark, 53 * there exists no non-ignorable combining mark before or after S? 54 * in S respectively. 55 * </pre> 56 * Option 2. will be the default. 57 * <p> 58 * This search has APIs similar to that of other text iteration mechanisms 59 * such as the break iterators in <tt>BreakIterator</tt>. Using these 60 * APIs, it is easy to scan through text looking for all occurrences of 61 * a given pattern. This search iterator allows changing of direction by 62 * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>. 63 * Though a direction change can occur without calling <tt>reset</tt> first, 64 * this operation comes with some speed penalty. 65 * Match results in the forward direction will match the result matches in 66 * the backwards direction in the reverse order 67 * <p> 68 * <tt>SearchIterator</tt> provides APIs to specify the starting position 69 * within the text string to be searched, e.g. <tt>setOffset</tt>, 70 * <tt>preceding</tt> and <tt>following</tt>. Since the 71 * starting position will be set as it is specified, please take note that 72 * there are some danger points which the search may render incorrect 73 * results: 74 * <ul> 75 * <li> The midst of a substring that requires normalization. 76 * <li> If the following match is to be found, the position should not be the 77 * second character which requires to be swapped with the preceding 78 * character. Vice versa, if the preceding match is to be found, 79 * position to search from should not be the first character which 80 * requires to be swapped with the next character. E.g certain Thai and 81 * Lao characters require swapping. 82 * <li> If a following pattern match is to be found, any position within a 83 * contracting sequence except the first will fail. Vice versa if a 84 * preceding pattern match is to be found, a invalid starting point 85 * would be any character within a contracting sequence except the last. 86 * </ul> 87 * <p> 88 * A <tt>BreakIterator</tt> can be used if only matches at logical breaks are desired. 89 * Using a <tt>BreakIterator</tt> will only give you results that exactly matches the 90 * boundaries given by the breakiterator. For instance the pattern "e" will 91 * not be found in the string "\u00e9" if a character break iterator is used. 92 * <p> 93 * Options are provided to handle overlapping matches. 94 * E.g. In English, overlapping matches produces the result 0 and 2 95 * for the pattern "abab" in the text "ababab", where else mutually 96 * exclusive matches only produce the result of 0. 97 * <p> 98 * Though collator attributes will be taken into consideration while 99 * performing matches, there are no APIs here for setting and getting the 100 * attributes. These attributes can be set by getting the collator 101 * from <tt>getCollator</tt> and using the APIs in <tt>coll.h</tt>. 102 * Lastly to update <tt>StringSearch</tt> to the new collator attributes, 103 * <tt>reset</tt> has to be called. 104 * <p> 105 * Restriction: <br> 106 * Currently there are no composite characters that consists of a 107 * character with combining class > 0 before a character with combining 108 * class == 0. However, if such a character exists in the future, 109 * <tt>StringSearch</tt> does not guarantee the results for option 1. 110 * <p> 111 * Consult the <tt>SearchIterator</tt> documentation for information on 112 * and examples of how to use instances of this class to implement text 113 * searching. 114 * <pre><code> 115 * UnicodeString target("The quick brown fox jumps over the lazy dog."); 116 * UnicodeString pattern("fox"); 117 * 118 * UErrorCode error = U_ZERO_ERROR; 119 * StringSearch iter(pattern, target, Locale::getUS(), NULL, status); 120 * for (int pos = iter.first(error); 121 * pos != USEARCH_DONE; 122 * pos = iter.next(error)) 123 * { 124 * printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength()); 125 * } 126 * </code></pre> 127 * <p> 128 * Note, <tt>StringSearch</tt> is not to be subclassed. 129 * </p> 130 * @see SearchIterator 131 * @see RuleBasedCollator 132 * @since ICU 2.0 133 */ 134 135 class U_I18N_API StringSearch U_FINAL : public SearchIterator 136 { 137 public: 138 139 // public constructors and destructors -------------------------------- 140 141 /** 142 * Creating a <tt>StringSearch</tt> instance using the argument locale 143 * language rule set. A collator will be created in the process, which 144 * will be owned by this instance and will be deleted during 145 * destruction 146 * @param pattern The text for which this object will search. 147 * @param text The text in which to search for the pattern. 148 * @param locale A locale which defines the language-sensitive 149 * comparison rules used to determine whether text in the 150 * pattern and target matches. 151 * @param breakiter A <tt>BreakIterator</tt> object used to constrain 152 * the matches that are found. Matches whose start and end 153 * indices in the target text are not boundaries as 154 * determined by the <tt>BreakIterator</tt> are 155 * ignored. If this behavior is not desired, 156 * <tt>NULL</tt> can be passed in instead. 157 * @param status for errors if any. If pattern or text is NULL, or if 158 * either the length of pattern or text is 0 then an 159 * U_ILLEGAL_ARGUMENT_ERROR is returned. 160 * @stable ICU 2.0 161 */ 162 StringSearch(const UnicodeString &pattern, const UnicodeString &text, 163 const Locale &locale, 164 BreakIterator *breakiter, 165 UErrorCode &status); 166 167 /** 168 * Creating a <tt>StringSearch</tt> instance using the argument collator 169 * language rule set. Note, user retains the ownership of this collator, 170 * it does not get destroyed during this instance's destruction. 171 * @param pattern The text for which this object will search. 172 * @param text The text in which to search for the pattern. 173 * @param coll A <tt>RuleBasedCollator</tt> object which defines 174 * the language-sensitive comparison rules used to 175 * determine whether text in the pattern and target 176 * matches. User is responsible for the clearing of this 177 * object. 178 * @param breakiter A <tt>BreakIterator</tt> object used to constrain 179 * the matches that are found. Matches whose start and end 180 * indices in the target text are not boundaries as 181 * determined by the <tt>BreakIterator</tt> are 182 * ignored. If this behavior is not desired, 183 * <tt>NULL</tt> can be passed in instead. 184 * @param status for errors if any. If either the length of pattern or 185 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. 186 * @stable ICU 2.0 187 */ 188 StringSearch(const UnicodeString &pattern, 189 const UnicodeString &text, 190 RuleBasedCollator *coll, 191 BreakIterator *breakiter, 192 UErrorCode &status); 193 194 /** 195 * Creating a <tt>StringSearch</tt> instance using the argument locale 196 * language rule set. A collator will be created in the process, which 197 * will be owned by this instance and will be deleted during 198 * destruction 199 * <p> 200 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 201 * will be done during searching for this version. The block of text 202 * in <tt>CharacterIterator</tt> will be used as it is. 203 * @param pattern The text for which this object will search. 204 * @param text The text iterator in which to search for the pattern. 205 * @param locale A locale which defines the language-sensitive 206 * comparison rules used to determine whether text in the 207 * pattern and target matches. User is responsible for 208 * the clearing of this object. 209 * @param breakiter A <tt>BreakIterator</tt> object used to constrain 210 * the matches that are found. Matches whose start and end 211 * indices in the target text are not boundaries as 212 * determined by the <tt>BreakIterator</tt> are 213 * ignored. If this behavior is not desired, 214 * <tt>NULL</tt> can be passed in instead. 215 * @param status for errors if any. If either the length of pattern or 216 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. 217 * @stable ICU 2.0 218 */ 219 StringSearch(const UnicodeString &pattern, CharacterIterator &text, 220 const Locale &locale, 221 BreakIterator *breakiter, 222 UErrorCode &status); 223 224 /** 225 * Creating a <tt>StringSearch</tt> instance using the argument collator 226 * language rule set. Note, user retains the ownership of this collator, 227 * it does not get destroyed during this instance's destruction. 228 * <p> 229 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 230 * will be done during searching for this version. The block of text 231 * in <tt>CharacterIterator</tt> will be used as it is. 232 * @param pattern The text for which this object will search. 233 * @param text The text in which to search for the pattern. 234 * @param coll A <tt>RuleBasedCollator</tt> object which defines 235 * the language-sensitive comparison rules used to 236 * determine whether text in the pattern and target 237 * matches. User is responsible for the clearing of this 238 * object. 239 * @param breakiter A <tt>BreakIterator</tt> object used to constrain 240 * the matches that are found. Matches whose start and end 241 * indices in the target text are not boundaries as 242 * determined by the <tt>BreakIterator</tt> are 243 * ignored. If this behavior is not desired, 244 * <tt>NULL</tt> can be passed in instead. 245 * @param status for errors if any. If either the length of pattern or 246 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. 247 * @stable ICU 2.0 248 */ 249 StringSearch(const UnicodeString &pattern, CharacterIterator &text, 250 RuleBasedCollator *coll, 251 BreakIterator *breakiter, 252 UErrorCode &status); 253 254 /** 255 * Copy constructor that creates a StringSearch instance with the same 256 * behavior, and iterating over the same text. 257 * @param that StringSearch instance to be copied. 258 * @stable ICU 2.0 259 */ 260 StringSearch(const StringSearch &that); 261 262 /** 263 * Destructor. Cleans up the search iterator data struct. 264 * If a collator is created in the constructor, it will be destroyed here. 265 * @stable ICU 2.0 266 */ 267 virtual ~StringSearch(void); 268 269 /** 270 * Clone this object. 271 * Clones can be used concurrently in multiple threads. 272 * If an error occurs, then NULL is returned. 273 * The caller must delete the clone. 274 * 275 * @return a clone of this object 276 * 277 * @see getDynamicClassID 278 * @stable ICU 2.8 279 */ 280 StringSearch *clone() const; 281 282 // operator overloading --------------------------------------------- 283 284 /** 285 * Assignment operator. Sets this iterator to have the same behavior, 286 * and iterate over the same text, as the one passed in. 287 * @param that instance to be copied. 288 * @stable ICU 2.0 289 */ 290 StringSearch & operator=(const StringSearch &that); 291 292 /** 293 * Equality operator. 294 * @param that instance to be compared. 295 * @return true if both instances have the same attributes, 296 * breakiterators, collators and iterate over the same text 297 * while looking for the same pattern. 298 * @stable ICU 2.0 299 */ 300 virtual bool operator==(const SearchIterator &that) const override; 301 302 // public get and set methods ---------------------------------------- 303 304 /** 305 * Sets the index to point to the given position, and clears any state 306 * that's affected. 307 * <p> 308 * This method takes the argument index and sets the position in the text 309 * string accordingly without checking if the index is pointing to a 310 * valid starting point to begin searching. 311 * @param position within the text to be set. If position is less 312 * than or greater than the text range for searching, 313 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 314 * @param status for errors if it occurs 315 * @stable ICU 2.0 316 */ 317 virtual void setOffset(int32_t position, UErrorCode &status) override; 318 319 /** 320 * Return the current index in the text being searched. 321 * If the iteration has gone past the end of the text 322 * (or past the beginning for a backwards search), USEARCH_DONE 323 * is returned. 324 * @return current index in the text being searched. 325 * @stable ICU 2.0 326 */ 327 virtual int32_t getOffset(void) const override; 328 329 /** 330 * Set the target text to be searched. 331 * Text iteration will hence begin at the start of the text string. 332 * This method is 333 * useful if you want to re-use an iterator to search for the same 334 * pattern within a different body of text. 335 * @param text text string to be searched 336 * @param status for errors if any. If the text length is 0 then an 337 * U_ILLEGAL_ARGUMENT_ERROR is returned. 338 * @stable ICU 2.0 339 */ 340 virtual void setText(const UnicodeString &text, UErrorCode &status) override; 341 342 /** 343 * Set the target text to be searched. 344 * Text iteration will hence begin at the start of the text string. 345 * This method is 346 * useful if you want to re-use an iterator to search for the same 347 * pattern within a different body of text. 348 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 349 * will be done during searching for this version. The block of text 350 * in <tt>CharacterIterator</tt> will be used as it is. 351 * @param text text string to be searched 352 * @param status for errors if any. If the text length is 0 then an 353 * U_ILLEGAL_ARGUMENT_ERROR is returned. 354 * @stable ICU 2.0 355 */ 356 virtual void setText(CharacterIterator &text, UErrorCode &status) override; 357 358 /** 359 * Gets the collator used for the language rules. 360 * <p> 361 * Caller may modify but <b>must not</b> delete the <tt>RuleBasedCollator</tt>! 362 * Modifications to this collator will affect the original collator passed in to 363 * the <tt>StringSearch></tt> constructor or to setCollator, if any. 364 * @return collator used for string search 365 * @stable ICU 2.0 366 */ 367 RuleBasedCollator * getCollator() const; 368 369 /** 370 * Sets the collator used for the language rules. User retains the 371 * ownership of this collator, thus the responsibility of deletion lies 372 * with the user. The iterator's position will not be changed by this method. 373 * @param coll collator 374 * @param status for errors if any 375 * @stable ICU 2.0 376 */ 377 void setCollator(RuleBasedCollator *coll, UErrorCode &status); 378 379 /** 380 * Sets the pattern used for matching. 381 * The iterator's position will not be changed by this method. 382 * @param pattern search pattern to be found 383 * @param status for errors if any. If the pattern length is 0 then an 384 * U_ILLEGAL_ARGUMENT_ERROR is returned. 385 * @stable ICU 2.0 386 */ 387 void setPattern(const UnicodeString &pattern, UErrorCode &status); 388 389 /** 390 * Gets the search pattern. 391 * @return pattern used for matching 392 * @stable ICU 2.0 393 */ 394 const UnicodeString & getPattern() const; 395 396 // public methods ---------------------------------------------------- 397 398 /** 399 * Reset the iteration. 400 * Search will begin at the start of the text string if a forward 401 * iteration is initiated before a backwards iteration. Otherwise if 402 * a backwards iteration is initiated before a forwards iteration, the 403 * search will begin at the end of the text string. 404 * @stable ICU 2.0 405 */ 406 virtual void reset() override; 407 408 /** 409 * Returns a copy of StringSearch with the same behavior, and 410 * iterating over the same text, as this one. Note that all data will be 411 * replicated, except for the user-specified collator and the 412 * breakiterator. 413 * @return cloned object 414 * @stable ICU 2.0 415 */ 416 virtual StringSearch * safeClone() const override; 417 418 /** 419 * ICU "poor man's RTTI", returns a UClassID for the actual class. 420 * 421 * @stable ICU 2.2 422 */ 423 virtual UClassID getDynamicClassID() const override; 424 425 /** 426 * ICU "poor man's RTTI", returns a UClassID for this class. 427 * 428 * @stable ICU 2.2 429 */ 430 static UClassID U_EXPORT2 getStaticClassID(); 431 432 protected: 433 434 // protected method ------------------------------------------------- 435 436 /** 437 * Search forward for matching text, starting at a given location. 438 * Clients should not call this method directly; instead they should 439 * call {@link SearchIterator#next }. 440 * <p> 441 * If a match is found, this method returns the index at which the match 442 * starts and calls {@link SearchIterator#setMatchLength } with the number 443 * of characters in the target text that make up the match. If no match 444 * is found, the method returns <tt>USEARCH_DONE</tt>. 445 * <p> 446 * The <tt>StringSearch</tt> is adjusted so that its current index 447 * (as returned by {@link #getOffset }) is the match position if one was 448 * found. 449 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 450 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE. 451 * @param position The index in the target text at which the search 452 * starts 453 * @param status for errors if any occurs 454 * @return The index at which the matched text in the target starts, or 455 * USEARCH_DONE if no match was found. 456 * @stable ICU 2.0 457 */ 458 virtual int32_t handleNext(int32_t position, UErrorCode &status) override; 459 460 /** 461 * Search backward for matching text, starting at a given location. 462 * Clients should not call this method directly; instead they should call 463 * <tt>SearchIterator.previous()</tt>, which this method overrides. 464 * <p> 465 * If a match is found, this method returns the index at which the match 466 * starts and calls {@link SearchIterator#setMatchLength } with the number 467 * of characters in the target text that make up the match. If no match 468 * is found, the method returns <tt>USEARCH_DONE</tt>. 469 * <p> 470 * The <tt>StringSearch</tt> is adjusted so that its current index 471 * (as returned by {@link #getOffset }) is the match position if one was 472 * found. 473 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 474 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE. 475 * @param position The index in the target text at which the search 476 * starts. 477 * @param status for errors if any occurs 478 * @return The index at which the matched text in the target starts, or 479 * USEARCH_DONE if no match was found. 480 * @stable ICU 2.0 481 */ 482 virtual int32_t handlePrev(int32_t position, UErrorCode &status) override; 483 484 private : 485 StringSearch() = delete; // default constructor not implemented 486 487 // private data members ---------------------------------------------- 488 489 /** 490 * Pattern text 491 * @stable ICU 2.0 492 */ 493 UnicodeString m_pattern_; 494 /** 495 * String search struct data 496 * @stable ICU 2.0 497 */ 498 UStringSearch *m_strsrch_; 499 500 }; 501 502 U_NAMESPACE_END 503 504 #endif /* #if !UCONFIG_NO_COLLATION */ 505 506 #endif /* U_SHOW_CPLUSPLUS_API */ 507 508 #endif 509 510