1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2001-2011 IBM and others. All rights reserved. 6 ********************************************************************** 7 * Date Name Description 8 * 03/22/2000 helena Creation. 9 ********************************************************************** 10 */ 11 12 #ifndef SEARCH_H 13 #define SEARCH_H 14 15 #include "unicode/utypes.h" 16 17 /** 18 * \file 19 * \brief C++ API: SearchIterator object. 20 */ 21 22 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 23 24 #include "unicode/uobject.h" 25 #include "unicode/unistr.h" 26 #include "unicode/chariter.h" 27 #include "unicode/brkiter.h" 28 #include "unicode/usearch.h" 29 30 /** 31 * @stable ICU 2.0 32 */ 33 struct USearch; 34 /** 35 * @stable ICU 2.0 36 */ 37 typedef struct USearch USearch; 38 39 U_NAMESPACE_BEGIN 40 41 /** 42 * 43 * <tt>SearchIterator</tt> is an abstract base class that provides 44 * methods to search for a pattern within a text string. Instances of 45 * <tt>SearchIterator</tt> maintain a current position and scans over the 46 * target text, returning the indices the pattern is matched and the length 47 * of each match. 48 * <p> 49 * <tt>SearchIterator</tt> defines a protocol for text searching. 50 * Subclasses provide concrete implementations of various search algorithms. 51 * For example, <tt>StringSearch</tt> implements language-sensitive pattern 52 * matching based on the comparison rules defined in a 53 * <tt>RuleBasedCollator</tt> object. 54 * <p> 55 * Other options for searching includes using a BreakIterator to restrict 56 * the points at which matches are detected. 57 * <p> 58 * <tt>SearchIterator</tt> provides an API that is similar to that of 59 * other text iteration classes such as <tt>BreakIterator</tt>. Using 60 * this class, it is easy to scan through text looking for all occurances of 61 * a given pattern. The following example uses a <tt>StringSearch</tt> 62 * object to find all instances of "fox" in the target string. Any other 63 * subclass of <tt>SearchIterator</tt> can be used in an identical 64 * manner. 65 * <pre><code> 66 * UnicodeString target("The quick brown fox jumped over the lazy fox"); 67 * UnicodeString pattern("fox"); 68 * 69 * SearchIterator *iter = new StringSearch(pattern, target); 70 * UErrorCode error = U_ZERO_ERROR; 71 * for (int pos = iter->first(error); pos != USEARCH_DONE; 72 * pos = iter->next(error)) { 73 * printf("Found match at %d pos, length is %d\n", pos, 74 * iter.getMatchLength()); 75 * } 76 * </code></pre> 77 * 78 * @see StringSearch 79 * @see RuleBasedCollator 80 */ 81 class U_I18N_API SearchIterator : public UObject { 82 83 public: 84 85 // public constructors and destructors ------------------------------- 86 87 /** 88 * Copy constructor that creates a SearchIterator instance with the same 89 * behavior, and iterating over the same text. 90 * @param other the SearchIterator instance to be copied. 91 * @stable ICU 2.0 92 */ 93 SearchIterator(const SearchIterator &other); 94 95 /** 96 * Destructor. Cleans up the search iterator data struct. 97 * @stable ICU 2.0 98 */ 99 virtual ~SearchIterator(); 100 101 // public get and set methods ---------------------------------------- 102 103 /** 104 * Sets the index to point to the given position, and clears any state 105 * that's affected. 106 * <p> 107 * This method takes the argument index and sets the position in the text 108 * string accordingly without checking if the index is pointing to a 109 * valid starting point to begin searching. 110 * @param position within the text to be set. If position is less 111 * than or greater than the text range for searching, 112 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 113 * @param status for errors if it occurs 114 * @stable ICU 2.0 115 */ 116 virtual void setOffset(int32_t position, UErrorCode &status) = 0; 117 118 /** 119 * Return the current index in the text being searched. 120 * If the iteration has gone past the end of the text 121 * (or past the beginning for a backwards search), USEARCH_DONE 122 * is returned. 123 * @return current index in the text being searched. 124 * @stable ICU 2.0 125 */ 126 virtual int32_t getOffset(void) const = 0; 127 128 /** 129 * Sets the text searching attributes located in the enum 130 * USearchAttribute with values from the enum USearchAttributeValue. 131 * USEARCH_DEFAULT can be used for all attributes for resetting. 132 * @param attribute text attribute (enum USearchAttribute) to be set 133 * @param value text attribute value 134 * @param status for errors if it occurs 135 * @stable ICU 2.0 136 */ 137 void setAttribute(USearchAttribute attribute, 138 USearchAttributeValue value, 139 UErrorCode &status); 140 141 /** 142 * Gets the text searching attributes 143 * @param attribute text attribute (enum USearchAttribute) to be retrieve 144 * @return text attribute value 145 * @stable ICU 2.0 146 */ 147 USearchAttributeValue getAttribute(USearchAttribute attribute) const; 148 149 /** 150 * Returns the index to the match in the text string that was searched. 151 * This call returns a valid result only after a successful call to 152 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 153 * Just after construction, or after a searching method returns 154 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. 155 * <p> 156 * Use getMatchedLength to get the matched string length. 157 * @return index of a substring within the text string that is being 158 * searched. 159 * @see #first 160 * @see #next 161 * @see #previous 162 * @see #last 163 * @stable ICU 2.0 164 */ 165 int32_t getMatchedStart(void) const; 166 167 /** 168 * Returns the length of text in the string which matches the search 169 * pattern. This call returns a valid result only after a successful call 170 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 171 * Just after construction, or after a searching method returns 172 * <tt>USEARCH_DONE</tt>, this method will return 0. 173 * @return The length of the match in the target text, or 0 if there 174 * is no match currently. 175 * @see #first 176 * @see #next 177 * @see #previous 178 * @see #last 179 * @stable ICU 2.0 180 */ 181 int32_t getMatchedLength(void) const; 182 183 /** 184 * Returns the text that was matched by the most recent call to 185 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 186 * If the iterator is not pointing at a valid match (e.g. just after 187 * construction or after <tt>USEARCH_DONE</tt> has been returned, 188 * returns an empty string. 189 * @param result stores the matched string or an empty string if a match 190 * is not found. 191 * @see #first 192 * @see #next 193 * @see #previous 194 * @see #last 195 * @stable ICU 2.0 196 */ 197 void getMatchedText(UnicodeString &result) const; 198 199 /** 200 * Set the BreakIterator that will be used to restrict the points 201 * at which matches are detected. The user is responsible for deleting 202 * the breakiterator. 203 * @param breakiter A BreakIterator that will be used to restrict the 204 * points at which matches are detected. If a match is 205 * found, but the match's start or end index is not a 206 * boundary as determined by the <tt>BreakIterator</tt>, 207 * the match will be rejected and another will be searched 208 * for. If this parameter is <tt>NULL</tt>, no break 209 * detection is attempted. 210 * @param status for errors if it occurs 211 * @see BreakIterator 212 * @stable ICU 2.0 213 */ 214 void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); 215 216 /** 217 * Returns the BreakIterator that is used to restrict the points at 218 * which matches are detected. This will be the same object that was 219 * passed to the constructor or to <tt>setBreakIterator</tt>. 220 * Note that <tt>NULL</tt> is a legal value; it means that break 221 * detection should not be attempted. 222 * @return BreakIterator used to restrict matchings. 223 * @see #setBreakIterator 224 * @stable ICU 2.0 225 */ 226 const BreakIterator * getBreakIterator(void) const; 227 228 /** 229 * Set the string text to be searched. Text iteration will hence begin at 230 * the start of the text string. This method is useful if you want to 231 * re-use an iterator to search for the same pattern within a different 232 * body of text. The user is responsible for deleting the text. 233 * @param text string to be searched. 234 * @param status for errors. If the text length is 0, 235 * an U_ILLEGAL_ARGUMENT_ERROR is returned. 236 * @stable ICU 2.0 237 */ 238 virtual void setText(const UnicodeString &text, UErrorCode &status); 239 240 /** 241 * Set the string text to be searched. Text iteration will hence begin at 242 * the start of the text string. This method is useful if you want to 243 * re-use an iterator to search for the same pattern within a different 244 * body of text. 245 * <p> 246 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 247 * will be done during searching for this version. The block of text 248 * in <tt>CharacterIterator</tt> will be used as it is. 249 * The user is responsible for deleting the text. 250 * @param text string iterator to be searched. 251 * @param status for errors if any. If the text length is 0 then an 252 * U_ILLEGAL_ARGUMENT_ERROR is returned. 253 * @stable ICU 2.0 254 */ 255 virtual void setText(CharacterIterator &text, UErrorCode &status); 256 257 /** 258 * Return the string text to be searched. 259 * @return text string to be searched. 260 * @stable ICU 2.0 261 */ 262 const UnicodeString & getText(void) const; 263 264 // operator overloading ---------------------------------------------- 265 266 /** 267 * Equality operator. 268 * @param that SearchIterator instance to be compared. 269 * @return TRUE if both BreakIterators are of the same class, have the 270 * same behavior, terates over the same text and have the same 271 * attributes. FALSE otherwise. 272 * @stable ICU 2.0 273 */ 274 virtual UBool operator==(const SearchIterator &that) const; 275 276 /** 277 * Not-equal operator. 278 * @param that SearchIterator instance to be compared. 279 * @return FALSE if operator== returns TRUE, and vice versa. 280 * @stable ICU 2.0 281 */ 282 UBool operator!=(const SearchIterator &that) const; 283 284 // public methods ---------------------------------------------------- 285 286 /** 287 * Returns a copy of SearchIterator with the same behavior, and 288 * iterating over the same text, as this one. Note that all data will be 289 * replicated, except for the text string to be searched. 290 * @return cloned object 291 * @stable ICU 2.0 292 */ 293 virtual SearchIterator* safeClone(void) const = 0; 294 295 /** 296 * Returns the first index at which the string text matches the search 297 * pattern. The iterator is adjusted so that its current index (as 298 * returned by <tt>getOffset</tt>) is the match position if one 299 * was found. 300 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 301 * the iterator will be adjusted to the index USEARCH_DONE 302 * @param status for errors if it occurs 303 * @return The character index of the first match, or 304 * <tt>USEARCH_DONE</tt> if there are no matches. 305 * @see #getOffset 306 * @stable ICU 2.0 307 */ 308 int32_t first(UErrorCode &status); 309 310 /** 311 * Returns the first index equal or greater than <tt>position</tt> at which the 312 * string text matches the search pattern. The iterator is adjusted so 313 * that its current index (as returned by <tt>getOffset</tt>) is the 314 * match position if one was found. 315 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the 316 * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 317 * @param position where search if to start from. If position is less 318 * than or greater than the text range for searching, 319 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 320 * @param status for errors if it occurs 321 * @return The character index of the first match following 322 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 323 * matches. 324 * @see #getOffset 325 * @stable ICU 2.0 326 */ 327 int32_t following(int32_t position, UErrorCode &status); 328 329 /** 330 * Returns the last index in the target text at which it matches the 331 * search pattern. The iterator is adjusted so that its current index 332 * (as returned by <tt>getOffset</tt>) is the match position if one was 333 * found. 334 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 335 * the iterator will be adjusted to the index USEARCH_DONE. 336 * @param status for errors if it occurs 337 * @return The index of the first match, or <tt>USEARCH_DONE</tt> if 338 * there are no matches. 339 * @see #getOffset 340 * @stable ICU 2.0 341 */ 342 int32_t last(UErrorCode &status); 343 344 /** 345 * Returns the first index less than <tt>position</tt> at which the string 346 * text matches the search pattern. The iterator is adjusted so that its 347 * current index (as returned by <tt>getOffset</tt>) is the match 348 * position if one was found. If a match is not found, 349 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 350 * adjusted to the index USEARCH_DONE 351 * <p> 352 * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the 353 * result match is always less than <tt>position</tt>. 354 * When <tt>USERARCH_OVERLAP</tt> is on, the result match may span across 355 * <tt>position</tt>. 356 * 357 * @param position where search is to start from. If position is less 358 * than or greater than the text range for searching, 359 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 360 * @param status for errors if it occurs 361 * @return The character index of the first match preceding 362 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 363 * no matches. 364 * @see #getOffset 365 * @stable ICU 2.0 366 */ 367 int32_t preceding(int32_t position, UErrorCode &status); 368 369 /** 370 * Returns the index of the next point at which the text matches the 371 * search pattern, starting from the current position 372 * The iterator is adjusted so that its current index (as returned by 373 * <tt>getOffset</tt>) is the match position if one was found. 374 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 375 * the iterator will be adjusted to a position after the end of the text 376 * string. 377 * @param status for errors if it occurs 378 * @return The index of the next match after the current position, 379 * or <tt>USEARCH_DONE</tt> if there are no more matches. 380 * @see #getOffset 381 * @stable ICU 2.0 382 */ 383 int32_t next(UErrorCode &status); 384 385 /** 386 * Returns the index of the previous point at which the string text 387 * matches the search pattern, starting at the current position. 388 * The iterator is adjusted so that its current index (as returned by 389 * <tt>getOffset</tt>) is the match position if one was found. 390 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 391 * the iterator will be adjusted to the index USEARCH_DONE 392 * @param status for errors if it occurs 393 * @return The index of the previous match before the current position, 394 * or <tt>USEARCH_DONE</tt> if there are no more matches. 395 * @see #getOffset 396 * @stable ICU 2.0 397 */ 398 int32_t previous(UErrorCode &status); 399 400 /** 401 * Resets the iteration. 402 * Search will begin at the start of the text string if a forward 403 * iteration is initiated before a backwards iteration. Otherwise if a 404 * backwards iteration is initiated before a forwards iteration, the 405 * search will begin at the end of the text string. 406 * @stable ICU 2.0 407 */ 408 virtual void reset(); 409 410 protected: 411 // protected data members --------------------------------------------- 412 413 /** 414 * C search data struct 415 * @stable ICU 2.0 416 */ 417 USearch *m_search_; 418 419 /** 420 * Break iterator. 421 * Currently the C++ breakiterator does not have getRules etc to reproduce 422 * another in C. Hence we keep the original around and do the verification 423 * at the end of the match. The user is responsible for deleting this 424 * break iterator. 425 * @stable ICU 2.0 426 */ 427 BreakIterator *m_breakiterator_; 428 429 /** 430 * Unicode string version of the search text 431 * @stable ICU 2.0 432 */ 433 UnicodeString m_text_; 434 435 // protected constructors and destructors ----------------------------- 436 437 /** 438 * Default constructor. 439 * Initializes data to the default values. 440 * @stable ICU 2.0 441 */ 442 SearchIterator(); 443 444 /** 445 * Constructor for use by subclasses. 446 * @param text The target text to be searched. 447 * @param breakiter A {@link BreakIterator} that is used to restrict the 448 * points at which matches are detected. If 449 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 450 * match, but the match's start or end index is not a 451 * boundary as determined by the <tt>BreakIterator</tt>, 452 * the match is rejected and <tt>handleNext</tt> or 453 * <tt>handlePrev</tt> is called again. If this parameter 454 * is <tt>NULL</tt>, no break detection is attempted. 455 * @see #handleNext 456 * @see #handlePrev 457 * @stable ICU 2.0 458 */ 459 SearchIterator(const UnicodeString &text, 460 BreakIterator *breakiter = NULL); 461 462 /** 463 * Constructor for use by subclasses. 464 * <p> 465 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 466 * will be done during searching for this version. The block of text 467 * in <tt>CharacterIterator</tt> will be used as it is. 468 * @param text The target text to be searched. 469 * @param breakiter A {@link BreakIterator} that is used to restrict the 470 * points at which matches are detected. If 471 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 472 * match, but the match's start or end index is not a 473 * boundary as determined by the <tt>BreakIterator</tt>, 474 * the match is rejected and <tt>handleNext</tt> or 475 * <tt>handlePrev</tt> is called again. If this parameter 476 * is <tt>NULL</tt>, no break detection is attempted. 477 * @see #handleNext 478 * @see #handlePrev 479 * @stable ICU 2.0 480 */ 481 SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); 482 483 // protected methods -------------------------------------------------- 484 485 /** 486 * Assignment operator. Sets this iterator to have the same behavior, 487 * and iterate over the same text, as the one passed in. 488 * @param that instance to be copied. 489 * @stable ICU 2.0 490 */ 491 SearchIterator & operator=(const SearchIterator &that); 492 493 /** 494 * Abstract method which subclasses override to provide the mechanism 495 * for finding the next match in the target text. This allows different 496 * subclasses to provide different search algorithms. 497 * <p> 498 * If a match is found, the implementation should return the index at 499 * which the match starts and should call 500 * <tt>setMatchLength</tt> with the number of characters 501 * in the target text that make up the match. If no match is found, the 502 * method should return USEARCH_DONE. 503 * <p> 504 * @param position The index in the target text at which the search 505 * should start. 506 * @param status for error codes if it occurs. 507 * @return index at which the match starts, else if match is not found 508 * USEARCH_DONE is returned 509 * @see #setMatchLength 510 * @stable ICU 2.0 511 */ 512 virtual int32_t handleNext(int32_t position, UErrorCode &status) 513 = 0; 514 515 /** 516 * Abstract method which subclasses override to provide the mechanism for 517 * finding the previous match in the target text. This allows different 518 * subclasses to provide different search algorithms. 519 * <p> 520 * If a match is found, the implementation should return the index at 521 * which the match starts and should call 522 * <tt>setMatchLength</tt> with the number of characters 523 * in the target text that make up the match. If no match is found, the 524 * method should return USEARCH_DONE. 525 * <p> 526 * @param position The index in the target text at which the search 527 * should start. 528 * @param status for error codes if it occurs. 529 * @return index at which the match starts, else if match is not found 530 * USEARCH_DONE is returned 531 * @see #setMatchLength 532 * @stable ICU 2.0 533 */ 534 virtual int32_t handlePrev(int32_t position, UErrorCode &status) 535 = 0; 536 537 /** 538 * Sets the length of the currently matched string in the text string to 539 * be searched. 540 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 541 * methods should call this when they find a match in the target text. 542 * @param length length of the matched text. 543 * @see #handleNext 544 * @see #handlePrev 545 * @stable ICU 2.0 546 */ 547 virtual void setMatchLength(int32_t length); 548 549 /** 550 * Sets the offset of the currently matched string in the text string to 551 * be searched. 552 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 553 * methods should call this when they find a match in the target text. 554 * @param position start offset of the matched text. 555 * @see #handleNext 556 * @see #handlePrev 557 * @stable ICU 2.0 558 */ 559 virtual void setMatchStart(int32_t position); 560 561 /** 562 * sets match not found 563 * @stable ICU 2.0 564 */ 565 void setMatchNotFound(); 566 }; 567 568 inline UBool SearchIterator::operator!=(const SearchIterator &that) const 569 { 570 return !operator==(that); 571 } 572 U_NAMESPACE_END 573 574 #endif /* #if !UCONFIG_NO_COLLATION */ 575 576 #endif 577 578