1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2001-2011 IBM and others. All rights reserved. 6 ********************************************************************** 7 * Date Name Description 8 * 03/22/2000 helena Creation. 9 ********************************************************************** 10 */ 11 12 #ifndef SEARCH_H 13 #define SEARCH_H 14 15 #include "unicode/utypes.h" 16 17 #if U_SHOW_CPLUSPLUS_API 18 19 /** 20 * \file 21 * \brief C++ API: SearchIterator object. 22 */ 23 24 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 25 26 #include "unicode/uobject.h" 27 #include "unicode/unistr.h" 28 #include "unicode/chariter.h" 29 #include "unicode/brkiter.h" 30 #include "unicode/usearch.h" 31 32 /** 33 * @stable ICU 2.0 34 */ 35 struct USearch; 36 /** 37 * @stable ICU 2.0 38 */ 39 typedef struct USearch USearch; 40 41 U_NAMESPACE_BEGIN 42 43 /** 44 * 45 * <tt>SearchIterator</tt> is an abstract base class that provides 46 * methods to search for a pattern within a text string. Instances of 47 * <tt>SearchIterator</tt> maintain a current position and scans over the 48 * target text, returning the indices the pattern is matched and the length 49 * of each match. 50 * <p> 51 * <tt>SearchIterator</tt> defines a protocol for text searching. 52 * Subclasses provide concrete implementations of various search algorithms. 53 * For example, <tt>StringSearch</tt> implements language-sensitive pattern 54 * matching based on the comparison rules defined in a 55 * <tt>RuleBasedCollator</tt> object. 56 * <p> 57 * Other options for searching includes using a BreakIterator to restrict 58 * the points at which matches are detected. 59 * <p> 60 * <tt>SearchIterator</tt> provides an API that is similar to that of 61 * other text iteration classes such as <tt>BreakIterator</tt>. Using 62 * this class, it is easy to scan through text looking for all occurances of 63 * a given pattern. The following example uses a <tt>StringSearch</tt> 64 * object to find all instances of "fox" in the target string. Any other 65 * subclass of <tt>SearchIterator</tt> can be used in an identical 66 * manner. 67 * <pre><code> 68 * UnicodeString target("The quick brown fox jumped over the lazy fox"); 69 * UnicodeString pattern("fox"); 70 * 71 * SearchIterator *iter = new StringSearch(pattern, target); 72 * UErrorCode error = U_ZERO_ERROR; 73 * for (int pos = iter->first(error); pos != USEARCH_DONE; 74 * pos = iter->next(error)) { 75 * printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength()); 76 * } 77 * </code></pre> 78 * 79 * @see StringSearch 80 * @see RuleBasedCollator 81 */ 82 class U_I18N_API SearchIterator : public UObject { 83 84 public: 85 86 // public constructors and destructors ------------------------------- 87 88 /** 89 * Copy constructor that creates a SearchIterator instance with the same 90 * behavior, and iterating over the same text. 91 * @param other the SearchIterator instance to be copied. 92 * @stable ICU 2.0 93 */ 94 SearchIterator(const SearchIterator &other); 95 96 /** 97 * Destructor. Cleans up the search iterator data struct. 98 * @stable ICU 2.0 99 */ 100 virtual ~SearchIterator(); 101 102 // public get and set methods ---------------------------------------- 103 104 /** 105 * Sets the index to point to the given position, and clears any state 106 * that's affected. 107 * <p> 108 * This method takes the argument index and sets the position in the text 109 * string accordingly without checking if the index is pointing to a 110 * valid starting point to begin searching. 111 * @param position within the text to be set. If position is less 112 * than or greater than the text range for searching, 113 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 114 * @param status for errors if it occurs 115 * @stable ICU 2.0 116 */ 117 virtual void setOffset(int32_t position, UErrorCode &status) = 0; 118 119 /** 120 * Return the current index in the text being searched. 121 * If the iteration has gone past the end of the text 122 * (or past the beginning for a backwards search), USEARCH_DONE 123 * is returned. 124 * @return current index in the text being searched. 125 * @stable ICU 2.0 126 */ 127 virtual int32_t getOffset(void) const = 0; 128 129 /** 130 * Sets the text searching attributes located in the enum 131 * USearchAttribute with values from the enum USearchAttributeValue. 132 * USEARCH_DEFAULT can be used for all attributes for resetting. 133 * @param attribute text attribute (enum USearchAttribute) to be set 134 * @param value text attribute value 135 * @param status for errors if it occurs 136 * @stable ICU 2.0 137 */ 138 void setAttribute(USearchAttribute attribute, 139 USearchAttributeValue value, 140 UErrorCode &status); 141 142 /** 143 * Gets the text searching attributes 144 * @param attribute text attribute (enum USearchAttribute) to be retrieve 145 * @return text attribute value 146 * @stable ICU 2.0 147 */ 148 USearchAttributeValue getAttribute(USearchAttribute attribute) const; 149 150 /** 151 * Returns the index to the match in the text string that was searched. 152 * This call returns a valid result only after a successful call to 153 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 154 * Just after construction, or after a searching method returns 155 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. 156 * <p> 157 * Use getMatchedLength to get the matched string length. 158 * @return index of a substring within the text string that is being 159 * searched. 160 * @see #first 161 * @see #next 162 * @see #previous 163 * @see #last 164 * @stable ICU 2.0 165 */ 166 int32_t getMatchedStart(void) const; 167 168 /** 169 * Returns the length of text in the string which matches the search 170 * pattern. This call returns a valid result only after a successful call 171 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 172 * Just after construction, or after a searching method returns 173 * <tt>USEARCH_DONE</tt>, this method will return 0. 174 * @return The length of the match in the target text, or 0 if there 175 * is no match currently. 176 * @see #first 177 * @see #next 178 * @see #previous 179 * @see #last 180 * @stable ICU 2.0 181 */ 182 int32_t getMatchedLength(void) const; 183 184 /** 185 * Returns the text that was matched by the most recent call to 186 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 187 * If the iterator is not pointing at a valid match (e.g. just after 188 * construction or after <tt>USEARCH_DONE</tt> has been returned, 189 * returns an empty string. 190 * @param result stores the matched string or an empty string if a match 191 * is not found. 192 * @see #first 193 * @see #next 194 * @see #previous 195 * @see #last 196 * @stable ICU 2.0 197 */ 198 void getMatchedText(UnicodeString &result) const; 199 200 /** 201 * Set the BreakIterator that will be used to restrict the points 202 * at which matches are detected. The user is responsible for deleting 203 * the breakiterator. 204 * @param breakiter A BreakIterator that will be used to restrict the 205 * points at which matches are detected. If a match is 206 * found, but the match's start or end index is not a 207 * boundary as determined by the <tt>BreakIterator</tt>, 208 * the match will be rejected and another will be searched 209 * for. If this parameter is <tt>NULL</tt>, no break 210 * detection is attempted. 211 * @param status for errors if it occurs 212 * @see BreakIterator 213 * @stable ICU 2.0 214 */ 215 void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); 216 217 /** 218 * Returns the BreakIterator that is used to restrict the points at 219 * which matches are detected. This will be the same object that was 220 * passed to the constructor or to <tt>setBreakIterator</tt>. 221 * Note that <tt>NULL</tt> is a legal value; it means that break 222 * detection should not be attempted. 223 * @return BreakIterator used to restrict matchings. 224 * @see #setBreakIterator 225 * @stable ICU 2.0 226 */ 227 const BreakIterator * getBreakIterator(void) const; 228 229 /** 230 * Set the string text to be searched. Text iteration will hence begin at 231 * the start of the text string. This method is useful if you want to 232 * re-use an iterator to search for the same pattern within a different 233 * body of text. The user is responsible for deleting the text. 234 * @param text string to be searched. 235 * @param status for errors. If the text length is 0, 236 * an U_ILLEGAL_ARGUMENT_ERROR is returned. 237 * @stable ICU 2.0 238 */ 239 virtual void setText(const UnicodeString &text, UErrorCode &status); 240 241 /** 242 * Set the string text to be searched. Text iteration will hence begin at 243 * the start of the text string. This method is useful if you want to 244 * re-use an iterator to search for the same pattern within a different 245 * body of text. 246 * <p> 247 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 248 * will be done during searching for this version. The block of text 249 * in <tt>CharacterIterator</tt> will be used as it is. 250 * The user is responsible for deleting the text. 251 * @param text string iterator to be searched. 252 * @param status for errors if any. If the text length is 0 then an 253 * U_ILLEGAL_ARGUMENT_ERROR is returned. 254 * @stable ICU 2.0 255 */ 256 virtual void setText(CharacterIterator &text, UErrorCode &status); 257 258 /** 259 * Return the string text to be searched. 260 * @return text string to be searched. 261 * @stable ICU 2.0 262 */ 263 const UnicodeString & getText(void) const; 264 265 // operator overloading ---------------------------------------------- 266 267 /** 268 * Equality operator. 269 * @param that SearchIterator instance to be compared. 270 * @return true if both BreakIterators are of the same class, have the 271 * same behavior, terates over the same text and have the same 272 * attributes. false otherwise. 273 * @stable ICU 2.0 274 */ 275 virtual UBool operator==(const SearchIterator &that) const; 276 277 /** 278 * Not-equal operator. 279 * @param that SearchIterator instance to be compared. 280 * @return false if operator== returns true, and vice versa. 281 * @stable ICU 2.0 282 */ 283 UBool operator!=(const SearchIterator &that) const; 284 285 // public methods ---------------------------------------------------- 286 287 /** 288 * Returns a copy of SearchIterator with the same behavior, and 289 * iterating over the same text, as this one. Note that all data will be 290 * replicated, except for the text string to be searched. 291 * @return cloned object 292 * @stable ICU 2.0 293 */ 294 virtual SearchIterator* safeClone(void) const = 0; 295 296 /** 297 * Returns the first index at which the string text matches the search 298 * pattern. The iterator is adjusted so that its current index (as 299 * returned by <tt>getOffset</tt>) is the match position if one 300 * was found. 301 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 302 * the iterator will be adjusted to the index USEARCH_DONE 303 * @param status for errors if it occurs 304 * @return The character index of the first match, or 305 * <tt>USEARCH_DONE</tt> if there are no matches. 306 * @see #getOffset 307 * @stable ICU 2.0 308 */ 309 int32_t first(UErrorCode &status); 310 311 /** 312 * Returns the first index equal or greater than <tt>position</tt> at which the 313 * string text matches the search pattern. The iterator is adjusted so 314 * that its current index (as returned by <tt>getOffset</tt>) is the 315 * match position if one was found. 316 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the 317 * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 318 * @param position where search if to start from. If position is less 319 * than or greater than the text range for searching, 320 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 321 * @param status for errors if it occurs 322 * @return The character index of the first match following 323 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 324 * matches. 325 * @see #getOffset 326 * @stable ICU 2.0 327 */ 328 int32_t following(int32_t position, UErrorCode &status); 329 330 /** 331 * Returns the last index in the target text at which it matches the 332 * search pattern. The iterator is adjusted so that its current index 333 * (as returned by <tt>getOffset</tt>) is the match position if one was 334 * found. 335 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 336 * the iterator will be adjusted to the index USEARCH_DONE. 337 * @param status for errors if it occurs 338 * @return The index of the first match, or <tt>USEARCH_DONE</tt> if 339 * there are no matches. 340 * @see #getOffset 341 * @stable ICU 2.0 342 */ 343 int32_t last(UErrorCode &status); 344 345 /** 346 * Returns the first index less than <tt>position</tt> at which the string 347 * text matches the search pattern. The iterator is adjusted so that its 348 * current index (as returned by <tt>getOffset</tt>) is the match 349 * position if one was found. If a match is not found, 350 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 351 * adjusted to the index USEARCH_DONE 352 * <p> 353 * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the 354 * result match is always less than <tt>position</tt>. 355 * When <tt>USERARCH_OVERLAP</tt> is on, the result match may span across 356 * <tt>position</tt>. 357 * 358 * @param position where search is to start from. If position is less 359 * than or greater than the text range for searching, 360 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 361 * @param status for errors if it occurs 362 * @return The character index of the first match preceding 363 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 364 * no matches. 365 * @see #getOffset 366 * @stable ICU 2.0 367 */ 368 int32_t preceding(int32_t position, UErrorCode &status); 369 370 /** 371 * Returns the index of the next point at which the text matches the 372 * search pattern, starting from the current position 373 * The iterator is adjusted so that its current index (as returned by 374 * <tt>getOffset</tt>) is the match position if one was found. 375 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 376 * the iterator will be adjusted to a position after the end of the text 377 * string. 378 * @param status for errors if it occurs 379 * @return The index of the next match after the current position, 380 * or <tt>USEARCH_DONE</tt> if there are no more matches. 381 * @see #getOffset 382 * @stable ICU 2.0 383 */ 384 int32_t next(UErrorCode &status); 385 386 /** 387 * Returns the index of the previous point at which the string text 388 * matches the search pattern, starting at the current position. 389 * The iterator is adjusted so that its current index (as returned by 390 * <tt>getOffset</tt>) is the match position if one was found. 391 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 392 * the iterator will be adjusted to the index USEARCH_DONE 393 * @param status for errors if it occurs 394 * @return The index of the previous match before the current position, 395 * or <tt>USEARCH_DONE</tt> if there are no more matches. 396 * @see #getOffset 397 * @stable ICU 2.0 398 */ 399 int32_t previous(UErrorCode &status); 400 401 /** 402 * Resets the iteration. 403 * Search will begin at the start of the text string if a forward 404 * iteration is initiated before a backwards iteration. Otherwise if a 405 * backwards iteration is initiated before a forwards iteration, the 406 * search will begin at the end of the text string. 407 * @stable ICU 2.0 408 */ 409 virtual void reset(); 410 411 protected: 412 // protected data members --------------------------------------------- 413 414 /** 415 * C search data struct 416 * @stable ICU 2.0 417 */ 418 USearch *m_search_; 419 420 /** 421 * Break iterator. 422 * Currently the C++ breakiterator does not have getRules etc to reproduce 423 * another in C. Hence we keep the original around and do the verification 424 * at the end of the match. The user is responsible for deleting this 425 * break iterator. 426 * @stable ICU 2.0 427 */ 428 BreakIterator *m_breakiterator_; 429 430 /** 431 * Unicode string version of the search text 432 * @stable ICU 2.0 433 */ 434 UnicodeString m_text_; 435 436 // protected constructors and destructors ----------------------------- 437 438 /** 439 * Default constructor. 440 * Initializes data to the default values. 441 * @stable ICU 2.0 442 */ 443 SearchIterator(); 444 445 /** 446 * Constructor for use by subclasses. 447 * @param text The target text to be searched. 448 * @param breakiter A {@link BreakIterator} that is used to restrict the 449 * points at which matches are detected. If 450 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 451 * match, but the match's start or end index is not a 452 * boundary as determined by the <tt>BreakIterator</tt>, 453 * the match is rejected and <tt>handleNext</tt> or 454 * <tt>handlePrev</tt> is called again. If this parameter 455 * is <tt>NULL</tt>, no break detection is attempted. 456 * @see #handleNext 457 * @see #handlePrev 458 * @stable ICU 2.0 459 */ 460 SearchIterator(const UnicodeString &text, 461 BreakIterator *breakiter = NULL); 462 463 /** 464 * Constructor for use by subclasses. 465 * <p> 466 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 467 * will be done during searching for this version. The block of text 468 * in <tt>CharacterIterator</tt> will be used as it is. 469 * @param text The target text to be searched. 470 * @param breakiter A {@link BreakIterator} that is used to restrict the 471 * points at which matches are detected. If 472 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 473 * match, but the match's start or end index is not a 474 * boundary as determined by the <tt>BreakIterator</tt>, 475 * the match is rejected and <tt>handleNext</tt> or 476 * <tt>handlePrev</tt> is called again. If this parameter 477 * is <tt>NULL</tt>, no break detection is attempted. 478 * @see #handleNext 479 * @see #handlePrev 480 * @stable ICU 2.0 481 */ 482 SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); 483 484 // protected methods -------------------------------------------------- 485 486 /** 487 * Assignment operator. Sets this iterator to have the same behavior, 488 * and iterate over the same text, as the one passed in. 489 * @param that instance to be copied. 490 * @stable ICU 2.0 491 */ 492 SearchIterator & operator=(const SearchIterator &that); 493 494 /** 495 * Abstract method which subclasses override to provide the mechanism 496 * for finding the next match in the target text. This allows different 497 * subclasses to provide different search algorithms. 498 * <p> 499 * If a match is found, the implementation should return the index at 500 * which the match starts and should call 501 * <tt>setMatchLength</tt> with the number of characters 502 * in the target text that make up the match. If no match is found, the 503 * method should return USEARCH_DONE. 504 * <p> 505 * @param position The index in the target text at which the search 506 * should start. 507 * @param status for error codes if it occurs. 508 * @return index at which the match starts, else if match is not found 509 * USEARCH_DONE is returned 510 * @see #setMatchLength 511 * @stable ICU 2.0 512 */ 513 virtual int32_t handleNext(int32_t position, UErrorCode &status) 514 = 0; 515 516 /** 517 * Abstract method which subclasses override to provide the mechanism for 518 * finding the previous match in the target text. This allows different 519 * subclasses to provide different search algorithms. 520 * <p> 521 * If a match is found, the implementation should return the index at 522 * which the match starts and should call 523 * <tt>setMatchLength</tt> with the number of characters 524 * in the target text that make up the match. If no match is found, the 525 * method should return USEARCH_DONE. 526 * <p> 527 * @param position The index in the target text at which the search 528 * should start. 529 * @param status for error codes if it occurs. 530 * @return index at which the match starts, else if match is not found 531 * USEARCH_DONE is returned 532 * @see #setMatchLength 533 * @stable ICU 2.0 534 */ 535 virtual int32_t handlePrev(int32_t position, UErrorCode &status) 536 = 0; 537 538 /** 539 * Sets the length of the currently matched string in the text string to 540 * be searched. 541 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 542 * methods should call this when they find a match in the target text. 543 * @param length length of the matched text. 544 * @see #handleNext 545 * @see #handlePrev 546 * @stable ICU 2.0 547 */ 548 virtual void setMatchLength(int32_t length); 549 550 /** 551 * Sets the offset of the currently matched string in the text string to 552 * be searched. 553 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 554 * methods should call this when they find a match in the target text. 555 * @param position start offset of the matched text. 556 * @see #handleNext 557 * @see #handlePrev 558 * @stable ICU 2.0 559 */ 560 virtual void setMatchStart(int32_t position); 561 562 /** 563 * sets match not found 564 * @stable ICU 2.0 565 */ 566 void setMatchNotFound(); 567 }; 568 569 inline UBool SearchIterator::operator!=(const SearchIterator &that) const 570 { 571 return !operator==(that); 572 } 573 U_NAMESPACE_END 574 575 #endif /* #if !UCONFIG_NO_COLLATION */ 576 577 #endif /* U_SHOW_CPLUSPLUS_API */ 578 579 #endif 580 581