• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ********************************************************************
5 *
6 *   Copyright (C) 1997-2011, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ********************************************************************
10 */
11 
12 #ifndef CHARITER_H
13 #define CHARITER_H
14 
15 #include "unicode/utypes.h"
16 
17 #if U_SHOW_CPLUSPLUS_API
18 
19 #include "unicode/uobject.h"
20 #include "unicode/unistr.h"
21 /**
22  * \file
23  * \brief C++ API: Character Iterator
24  */
25 
26 U_NAMESPACE_BEGIN
27 /**
28  * Abstract class that defines an API for forward-only iteration
29  * on text objects.
30  * This is a minimal interface for iteration without random access
31  * or backwards iteration. It is especially useful for wrapping
32  * streams with converters into an object for collation or
33  * normalization.
34  *
35  * <p>Characters can be accessed in two ways: as code units or as
36  * code points.
37  * Unicode code points are 21-bit integers and are the scalar values
38  * of Unicode characters. ICU uses the type UChar32 for them.
39  * Unicode code units are the storage units of a given
40  * Unicode/UCS Transformation Format (a character encoding scheme).
41  * With UTF-16, all code points can be represented with either one
42  * or two code units ("surrogates").
43  * String storage is typically based on code units, while properties
44  * of characters are typically determined using code point values.
45  * Some processes may be designed to work with sequences of code units,
46  * or it may be known that all characters that are important to an
47  * algorithm can be represented with single code units.
48  * Other processes will need to use the code point access functions.</p>
49  *
50  * <p>ForwardCharacterIterator provides nextPostInc() to access
51  * a code unit and advance an internal position into the text object,
52  * similar to a <code>return text[position++]</code>.<br>
53  * It provides next32PostInc() to access a code point and advance an internal
54  * position.</p>
55  *
56  * <p>next32PostInc() assumes that the current position is that of
57  * the beginning of a code point, i.e., of its first code unit.
58  * After next32PostInc(), this will be true again.
59  * In general, access to code units and code points in the same
60  * iteration loop should not be mixed. In UTF-16, if the current position
61  * is on a second code unit (Low Surrogate), then only that code unit
62  * is returned even by next32PostInc().</p>
63  *
64  * <p>For iteration with either function, there are two ways to
65  * check for the end of the iteration. When there are no more
66  * characters in the text object:
67  * <ul>
68  * <li>The hasNext() function returns FALSE.</li>
69  * <li>nextPostInc() and next32PostInc() return DONE
70  *     when one attempts to read beyond the end of the text object.</li>
71  * </ul>
72  *
73  * Example:
74  * \code
75  * void function1(ForwardCharacterIterator &it) {
76  *     UChar32 c;
77  *     while(it.hasNext()) {
78  *         c=it.next32PostInc();
79  *         // use c
80  *     }
81  * }
82  *
83  * void function2(ForwardCharacterIterator &it) {
84  *     char16_t c;
85  *     while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
86  *         // use c
87  *      }
88  *  }
89  * \endcode
90  * </p>
91  *
92  * @stable ICU 2.0
93  */
94 class U_COMMON_API ForwardCharacterIterator : public UObject {
95 public:
96     /**
97      * Value returned by most of ForwardCharacterIterator's functions
98      * when the iterator has reached the limits of its iteration.
99      * @stable ICU 2.0
100      */
101     enum { DONE = 0xffff };
102 
103     /**
104      * Destructor.
105      * @stable ICU 2.0
106      */
107     virtual ~ForwardCharacterIterator();
108 
109     /**
110      * Returns true when both iterators refer to the same
111      * character in the same character-storage object.
112      * @param that The ForwardCharacterIterator to be compared for equality
113      * @return true when both iterators refer to the same
114      * character in the same character-storage object
115      * @stable ICU 2.0
116      */
117     virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
118 
119     /**
120      * Returns true when the iterators refer to different
121      * text-storage objects, or to different characters in the
122      * same text-storage object.
123      * @param that The ForwardCharacterIterator to be compared for inequality
124      * @return true when the iterators refer to different
125      * text-storage objects, or to different characters in the
126      * same text-storage object
127      * @stable ICU 2.0
128      */
129     inline UBool operator!=(const ForwardCharacterIterator& that) const;
130 
131     /**
132      * Generates a hash code for this iterator.
133      * @return the hash code.
134      * @stable ICU 2.0
135      */
136     virtual int32_t hashCode(void) const = 0;
137 
138     /**
139      * Returns a UClassID for this ForwardCharacterIterator ("poor man's
140      * RTTI").<P> Despite the fact that this function is public,
141      * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
142      * @return a UClassID for this ForwardCharacterIterator
143      * @stable ICU 2.0
144      */
145     virtual UClassID getDynamicClassID(void) const = 0;
146 
147     /**
148      * Gets the current code unit for returning and advances to the next code unit
149      * in the iteration range
150      * (toward endIndex()).  If there are
151      * no more code units to return, returns DONE.
152      * @return the current code unit.
153      * @stable ICU 2.0
154      */
155     virtual char16_t         nextPostInc(void) = 0;
156 
157     /**
158      * Gets the current code point for returning and advances to the next code point
159      * in the iteration range
160      * (toward endIndex()).  If there are
161      * no more code points to return, returns DONE.
162      * @return the current code point.
163      * @stable ICU 2.0
164      */
165     virtual UChar32       next32PostInc(void) = 0;
166 
167     /**
168      * Returns FALSE if there are no more code units or code points
169      * at or after the current position in the iteration range.
170      * This is used with nextPostInc() or next32PostInc() in forward
171      * iteration.
172      * @returns FALSE if there are no more code units or code points
173      * at or after the current position in the iteration range.
174      * @stable ICU 2.0
175      */
176     virtual UBool        hasNext() = 0;
177 
178 protected:
179     /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
180     ForwardCharacterIterator();
181 
182     /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
183     ForwardCharacterIterator(const ForwardCharacterIterator &other);
184 
185     /**
186      * Assignment operator to be overridden in the implementing class.
187      * @stable ICU 2.0
188      */
189     ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
190 };
191 
192 /**
193  * Abstract class that defines an API for iteration
194  * on text objects.
195  * This is an interface for forward and backward iteration
196  * and random access into a text object.
197  *
198  * <p>The API provides backward compatibility to the Java and older ICU
199  * CharacterIterator classes but extends them significantly:
200  * <ol>
201  * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
202  * <li>While the old API functions provided forward iteration with
203  *     "pre-increment" semantics, the new one also provides functions
204  *     with "post-increment" semantics. They are more efficient and should
205  *     be the preferred iterator functions for new implementations.
206  *     The backward iteration always had "pre-decrement" semantics, which
207  *     are efficient.</li>
208  * <li>Just like ForwardCharacterIterator, it provides access to
209  *     both code units and code points. Code point access versions are available
210  *     for the old and the new iteration semantics.</li>
211  * <li>There are new functions for setting and moving the current position
212  *     without returning a character, for efficiency.</li>
213  * </ol>
214  *
215  * See ForwardCharacterIterator for examples for using the new forward iteration
216  * functions. For backward iteration, there is also a hasPrevious() function
217  * that can be used analogously to hasNext().
218  * The old functions work as before and are shown below.</p>
219  *
220  * <p>Examples for some of the new functions:</p>
221  *
222  * Forward iteration with hasNext():
223  * \code
224  * void forward1(CharacterIterator &it) {
225  *     UChar32 c;
226  *     for(it.setToStart(); it.hasNext();) {
227  *         c=it.next32PostInc();
228  *         // use c
229  *     }
230  *  }
231  * \endcode
232  * Forward iteration more similar to loops with the old forward iteration,
233  * showing a way to convert simple for() loops:
234  * \code
235  * void forward2(CharacterIterator &it) {
236  *     char16_t c;
237  *     for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
238  *          // use c
239  *      }
240  * }
241  * \endcode
242  * Backward iteration with setToEnd() and hasPrevious():
243  * \code
244  *  void backward1(CharacterIterator &it) {
245  *      UChar32 c;
246  *      for(it.setToEnd(); it.hasPrevious();) {
247  *         c=it.previous32();
248  *          // use c
249  *      }
250  *  }
251  * \endcode
252  * Backward iteration with a more traditional for() loop:
253  * \code
254  * void backward2(CharacterIterator &it) {
255  *     char16_t c;
256  *     for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
257  *         // use c
258  *      }
259  *  }
260  * \endcode
261  *
262  * Example for random access:
263  * \code
264  *  void random(CharacterIterator &it) {
265  *      // set to the third code point from the beginning
266  *      it.move32(3, CharacterIterator::kStart);
267  *      // get a code point from here without moving the position
268  *      UChar32 c=it.current32();
269  *      // get the position
270  *      int32_t pos=it.getIndex();
271  *      // get the previous code unit
272  *      char16_t u=it.previous();
273  *      // move back one more code unit
274  *      it.move(-1, CharacterIterator::kCurrent);
275  *      // set the position back to where it was
276  *      // and read the same code point c and move beyond it
277  *      it.setIndex(pos);
278  *      if(c!=it.next32PostInc()) {
279  *          exit(1); // CharacterIterator inconsistent
280  *      }
281  *  }
282  * \endcode
283  *
284  * <p>Examples, especially for the old API:</p>
285  *
286  * Function processing characters, in this example simple output
287  * <pre>
288  * \code
289  *  void processChar( char16_t c )
290  *  {
291  *      cout << " " << c;
292  *  }
293  * \endcode
294  * </pre>
295  * Traverse the text from start to finish
296  * <pre>
297  * \code
298  *  void traverseForward(CharacterIterator& iter)
299  *  {
300  *      for(char16_t c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
301  *          processChar(c);
302  *      }
303  *  }
304  * \endcode
305  * </pre>
306  * Traverse the text backwards, from end to start
307  * <pre>
308  * \code
309  *  void traverseBackward(CharacterIterator& iter)
310  *  {
311  *      for(char16_t c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
312  *          processChar(c);
313  *      }
314  *  }
315  * \endcode
316  * </pre>
317  * Traverse both forward and backward from a given position in the text.
318  * The Unicode::isLetter() and Unicode::isDigit() checks in this example represent some additional stopping criteria.
319  * <pre>
320  * \code
321  * void traverseOut(CharacterIterator& iter, int32_t pos)
322  * {
323  *      char16_t c;
324  *      for (c = iter.setIndex(pos);
325  *      c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
326  *          c = iter.next()) {}
327  *      int32_t end = iter.getIndex();
328  *      for (c = iter.setIndex(pos);
329  *          c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
330  *          c = iter.previous()) {}
331  *      int32_t start = iter.getIndex() + 1;
332  *
333  *      cout << "start: " << start << " end: " << end << endl;
334  *      for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
335  *          processChar(c);
336  *     }
337  *  }
338  * \endcode
339  * </pre>
340  * Creating a StringCharacterIterator and calling the test functions
341  * <pre>
342  * \code
343  *  void CharacterIterator_Example( void )
344  *   {
345  *       cout << endl << "===== CharacterIterator_Example: =====" << endl;
346  *       UnicodeString text("Ein kleiner Satz.");
347  *       StringCharacterIterator iterator(text);
348  *       cout << "----- traverseForward: -----------" << endl;
349  *       traverseForward( iterator );
350  *       cout << endl << endl << "----- traverseBackward: ----------" << endl;
351  *       traverseBackward( iterator );
352  *       cout << endl << endl << "----- traverseOut: ---------------" << endl;
353  *       traverseOut( iterator, 7 );
354  *       cout << endl << endl << "-----" << endl;
355  *   }
356  * \endcode
357  * </pre>
358  *
359  * @stable ICU 2.0
360  */
361 class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
362 public:
363     /**
364      * Origin enumeration for the move() and move32() functions.
365      * @stable ICU 2.0
366      */
367     enum EOrigin { kStart, kCurrent, kEnd };
368 
369     /**
370      * Destructor.
371      * @stable ICU 2.0
372      */
373     virtual ~CharacterIterator();
374 
375     /**
376      * Returns a pointer to a new CharacterIterator of the same
377      * concrete class as this one, and referring to the same
378      * character in the same text-storage object as this one.  The
379      * caller is responsible for deleting the new clone.
380      * @return a pointer to a new CharacterIterator
381      * @stable ICU 2.0
382      */
383     virtual CharacterIterator* clone() const = 0;
384 
385     /**
386      * Sets the iterator to refer to the first code unit in its
387      * iteration range, and returns that code unit.
388      * This can be used to begin an iteration with next().
389      * @return the first code unit in its iteration range.
390      * @stable ICU 2.0
391      */
392     virtual char16_t         first(void) = 0;
393 
394     /**
395      * Sets the iterator to refer to the first code unit in its
396      * iteration range, returns that code unit, and moves the position
397      * to the second code unit. This is an alternative to setToStart()
398      * for forward iteration with nextPostInc().
399      * @return the first code unit in its iteration range.
400      * @stable ICU 2.0
401      */
402     virtual char16_t         firstPostInc(void);
403 
404     /**
405      * Sets the iterator to refer to the first code point in its
406      * iteration range, and returns that code unit,
407      * This can be used to begin an iteration with next32().
408      * Note that an iteration with next32PostInc(), beginning with,
409      * e.g., setToStart() or firstPostInc(), is more efficient.
410      * @return the first code point in its iteration range.
411      * @stable ICU 2.0
412      */
413     virtual UChar32       first32(void) = 0;
414 
415     /**
416      * Sets the iterator to refer to the first code point in its
417      * iteration range, returns that code point, and moves the position
418      * to the second code point. This is an alternative to setToStart()
419      * for forward iteration with next32PostInc().
420      * @return the first code point in its iteration range.
421      * @stable ICU 2.0
422      */
423     virtual UChar32       first32PostInc(void);
424 
425     /**
426      * Sets the iterator to refer to the first code unit or code point in its
427      * iteration range. This can be used to begin a forward
428      * iteration with nextPostInc() or next32PostInc().
429      * @return the start position of the iteration range
430      * @stable ICU 2.0
431      */
432     inline int32_t    setToStart();
433 
434     /**
435      * Sets the iterator to refer to the last code unit in its
436      * iteration range, and returns that code unit.
437      * This can be used to begin an iteration with previous().
438      * @return the last code unit.
439      * @stable ICU 2.0
440      */
441     virtual char16_t         last(void) = 0;
442 
443     /**
444      * Sets the iterator to refer to the last code point in its
445      * iteration range, and returns that code unit.
446      * This can be used to begin an iteration with previous32().
447      * @return the last code point.
448      * @stable ICU 2.0
449      */
450     virtual UChar32       last32(void) = 0;
451 
452     /**
453      * Sets the iterator to the end of its iteration range, just behind
454      * the last code unit or code point. This can be used to begin a backward
455      * iteration with previous() or previous32().
456      * @return the end position of the iteration range
457      * @stable ICU 2.0
458      */
459     inline int32_t    setToEnd();
460 
461     /**
462      * Sets the iterator to refer to the "position"-th code unit
463      * in the text-storage object the iterator refers to, and
464      * returns that code unit.
465      * @param position the "position"-th code unit in the text-storage object
466      * @return the "position"-th code unit.
467      * @stable ICU 2.0
468      */
469     virtual char16_t         setIndex(int32_t position) = 0;
470 
471     /**
472      * Sets the iterator to refer to the beginning of the code point
473      * that contains the "position"-th code unit
474      * in the text-storage object the iterator refers to, and
475      * returns that code point.
476      * The current position is adjusted to the beginning of the code point
477      * (its first code unit).
478      * @param position the "position"-th code unit in the text-storage object
479      * @return the "position"-th code point.
480      * @stable ICU 2.0
481      */
482     virtual UChar32       setIndex32(int32_t position) = 0;
483 
484     /**
485      * Returns the code unit the iterator currently refers to.
486      * @return the current code unit.
487      * @stable ICU 2.0
488      */
489     virtual char16_t         current(void) const = 0;
490 
491     /**
492      * Returns the code point the iterator currently refers to.
493      * @return the current code point.
494      * @stable ICU 2.0
495      */
496     virtual UChar32       current32(void) const = 0;
497 
498     /**
499      * Advances to the next code unit in the iteration range
500      * (toward endIndex()), and returns that code unit.  If there are
501      * no more code units to return, returns DONE.
502      * @return the next code unit.
503      * @stable ICU 2.0
504      */
505     virtual char16_t         next(void) = 0;
506 
507     /**
508      * Advances to the next code point in the iteration range
509      * (toward endIndex()), and returns that code point.  If there are
510      * no more code points to return, returns DONE.
511      * Note that iteration with "pre-increment" semantics is less
512      * efficient than iteration with "post-increment" semantics
513      * that is provided by next32PostInc().
514      * @return the next code point.
515      * @stable ICU 2.0
516      */
517     virtual UChar32       next32(void) = 0;
518 
519     /**
520      * Advances to the previous code unit in the iteration range
521      * (toward startIndex()), and returns that code unit.  If there are
522      * no more code units to return, returns DONE.
523      * @return the previous code unit.
524      * @stable ICU 2.0
525      */
526     virtual char16_t         previous(void) = 0;
527 
528     /**
529      * Advances to the previous code point in the iteration range
530      * (toward startIndex()), and returns that code point.  If there are
531      * no more code points to return, returns DONE.
532      * @return the previous code point.
533      * @stable ICU 2.0
534      */
535     virtual UChar32       previous32(void) = 0;
536 
537     /**
538      * Returns FALSE if there are no more code units or code points
539      * before the current position in the iteration range.
540      * This is used with previous() or previous32() in backward
541      * iteration.
542      * @return FALSE if there are no more code units or code points
543      * before the current position in the iteration range, return TRUE otherwise.
544      * @stable ICU 2.0
545      */
546     virtual UBool        hasPrevious() = 0;
547 
548     /**
549      * Returns the numeric index in the underlying text-storage
550      * object of the character returned by first().  Since it's
551      * possible to create an iterator that iterates across only
552      * part of a text-storage object, this number isn't
553      * necessarily 0.
554      * @returns the numeric index in the underlying text-storage
555      * object of the character returned by first().
556      * @stable ICU 2.0
557      */
558     inline int32_t       startIndex(void) const;
559 
560     /**
561      * Returns the numeric index in the underlying text-storage
562      * object of the position immediately BEYOND the character
563      * returned by last().
564      * @return the numeric index in the underlying text-storage
565      * object of the position immediately BEYOND the character
566      * returned by last().
567      * @stable ICU 2.0
568      */
569     inline int32_t       endIndex(void) const;
570 
571     /**
572      * Returns the numeric index in the underlying text-storage
573      * object of the character the iterator currently refers to
574      * (i.e., the character returned by current()).
575      * @return the numeric index in the text-storage object of
576      * the character the iterator currently refers to
577      * @stable ICU 2.0
578      */
579     inline int32_t       getIndex(void) const;
580 
581     /**
582      * Returns the length of the entire text in the underlying
583      * text-storage object.
584      * @return the length of the entire text in the text-storage object
585      * @stable ICU 2.0
586      */
587     inline int32_t           getLength() const;
588 
589     /**
590      * Moves the current position relative to the start or end of the
591      * iteration range, or relative to the current position itself.
592      * The movement is expressed in numbers of code units forward
593      * or backward by specifying a positive or negative delta.
594      * @param delta the position relative to origin. A positive delta means forward;
595      * a negative delta means backward.
596      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
597      * @return the new position
598      * @stable ICU 2.0
599      */
600     virtual int32_t      move(int32_t delta, EOrigin origin) = 0;
601 
602     /**
603      * Moves the current position relative to the start or end of the
604      * iteration range, or relative to the current position itself.
605      * The movement is expressed in numbers of code points forward
606      * or backward by specifying a positive or negative delta.
607      * @param delta the position relative to origin. A positive delta means forward;
608      * a negative delta means backward.
609      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
610      * @return the new position
611      * @stable ICU 2.0
612      */
613 #ifdef move32
614      // One of the system headers right now is sometimes defining a conflicting macro we don't use
615 #undef move32
616 #endif
617     virtual int32_t      move32(int32_t delta, EOrigin origin) = 0;
618 
619     /**
620      * Copies the text under iteration into the UnicodeString
621      * referred to by "result".
622      * @param result Receives a copy of the text under iteration.
623      * @stable ICU 2.0
624      */
625     virtual void            getText(UnicodeString&  result) = 0;
626 
627 protected:
628     /**
629      * Empty constructor.
630      * @stable ICU 2.0
631      */
632     CharacterIterator();
633 
634     /**
635      * Constructor, just setting the length field in this base class.
636      * @stable ICU 2.0
637      */
638     CharacterIterator(int32_t length);
639 
640     /**
641      * Constructor, just setting the length and position fields in this base class.
642      * @stable ICU 2.0
643      */
644     CharacterIterator(int32_t length, int32_t position);
645 
646     /**
647      * Constructor, just setting the length, start, end, and position fields in this base class.
648      * @stable ICU 2.0
649      */
650     CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
651 
652     /**
653      * Copy constructor.
654      *
655      * @param that The CharacterIterator to be copied
656      * @stable ICU 2.0
657      */
658     CharacterIterator(const CharacterIterator &that);
659 
660     /**
661      * Assignment operator.  Sets this CharacterIterator to have the same behavior,
662      * as the one passed in.
663      * @param that The CharacterIterator passed in.
664      * @return the newly set CharacterIterator.
665      * @stable ICU 2.0
666      */
667     CharacterIterator &operator=(const CharacterIterator &that);
668 
669     /**
670      * Base class text length field.
671      * Necessary this for correct getText() and hashCode().
672      * @stable ICU 2.0
673      */
674     int32_t textLength;
675 
676     /**
677      * Base class field for the current position.
678      * @stable ICU 2.0
679      */
680     int32_t  pos;
681 
682     /**
683      * Base class field for the start of the iteration range.
684      * @stable ICU 2.0
685      */
686     int32_t  begin;
687 
688     /**
689      * Base class field for the end of the iteration range.
690      * @stable ICU 2.0
691      */
692     int32_t  end;
693 };
694 
695 inline UBool
696 ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
697     return !operator==(that);
698 }
699 
700 inline int32_t
setToStart()701 CharacterIterator::setToStart() {
702     return move(0, kStart);
703 }
704 
705 inline int32_t
setToEnd()706 CharacterIterator::setToEnd() {
707     return move(0, kEnd);
708 }
709 
710 inline int32_t
startIndex(void)711 CharacterIterator::startIndex(void) const {
712     return begin;
713 }
714 
715 inline int32_t
endIndex(void)716 CharacterIterator::endIndex(void) const {
717     return end;
718 }
719 
720 inline int32_t
getIndex(void)721 CharacterIterator::getIndex(void) const {
722     return pos;
723 }
724 
725 inline int32_t
getLength(void)726 CharacterIterator::getLength(void) const {
727     return textLength;
728 }
729 
730 U_NAMESPACE_END
731 
732 #endif /* U_SHOW_CPLUSPLUS_API */
733 
734 #endif
735