• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ********************************************************************
3 *
4 *   Copyright (C) 1997-2005, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ********************************************************************
8 */
9 
10 #ifndef CHARITER_H
11 #define CHARITER_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/unistr.h"
16 /**
17  * \file
18  * \brief C++ API: Character Iterator
19  */
20 
21 U_NAMESPACE_BEGIN
22 /**
23  * Abstract class that defines an API for forward-only iteration
24  * on text objects.
25  * This is a minimal interface for iteration without random access
26  * or backwards iteration. It is especially useful for wrapping
27  * streams with converters into an object for collation or
28  * normalization.
29  *
30  * <p>Characters can be accessed in two ways: as code units or as
31  * code points.
32  * Unicode code points are 21-bit integers and are the scalar values
33  * of Unicode characters. ICU uses the type UChar32 for them.
34  * Unicode code units are the storage units of a given
35  * Unicode/UCS Transformation Format (a character encoding scheme).
36  * With UTF-16, all code points can be represented with either one
37  * or two code units ("surrogates").
38  * String storage is typically based on code units, while properties
39  * of characters are typically determined using code point values.
40  * Some processes may be designed to work with sequences of code units,
41  * or it may be known that all characters that are important to an
42  * algorithm can be represented with single code units.
43  * Other processes will need to use the code point access functions.</p>
44  *
45  * <p>ForwardCharacterIterator provides nextPostInc() to access
46  * a code unit and advance an internal position into the text object,
47  * similar to a <code>return text[position++]</code>.<br>
48  * It provides next32PostInc() to access a code point and advance an internal
49  * position.</p>
50  *
51  * <p>next32PostInc() assumes that the current position is that of
52  * the beginning of a code point, i.e., of its first code unit.
53  * After next32PostInc(), this will be true again.
54  * In general, access to code units and code points in the same
55  * iteration loop should not be mixed. In UTF-16, if the current position
56  * is on a second code unit (Low Surrogate), then only that code unit
57  * is returned even by next32PostInc().</p>
58  *
59  * <p>For iteration with either function, there are two ways to
60  * check for the end of the iteration. When there are no more
61  * characters in the text object:
62  * <ul>
63  * <li>The hasNext() function returns FALSE.</li>
64  * <li>nextPostInc() and next32PostInc() return DONE
65  *     when one attempts to read beyond the end of the text object.</li>
66  * </ul>
67  *
68  * Example:
69  * \code
70  * void function1(ForwardCharacterIterator &it) {
71  *     UChar32 c;
72  *     while(it.hasNext()) {
73  *         c=it.next32PostInc();
74  *         // use c
75  *     }
76  * }
77  *
78  * void function2(ForwardCharacterIterator &it) {
79  *     UChar c;
80  *     while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
81  *         // use c
82  *      }
83  *  }
84  * \endcode
85  * </p>
86  *
87  * @stable ICU 2.0
88  */
89 class U_COMMON_API ForwardCharacterIterator : public UObject {
90 public:
91     /**
92      * Sentinel value (0xffff) returned by most of ForwardCharacterIterator's
93      * functions when the iterator has reached the limits of its iteration.
94      * @stable ICU 2.0
95      */
96     enum { DONE = 0xffff };
97 
98     /**
99      * Virtual destructor.
100      * @stable ICU 2.0
101      */
102     virtual ~ForwardCharacterIterator();
103 
104     /**
105      * Returns true when both iterators refer to the same
106      * character in the same character-storage object.
107      * @param that The ForwardCharacterIterator to be compared for equality
108      * @return true when both iterators refer to the same
109      * character in the same character-storage object
110      * @stable ICU 2.0
111      */
112     virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
113 
114     /**
115      * Returns true when the iterators refer to different text-storage
116      * objects, or to different characters in the same text-storage object.
117      * Implemented inline as the negation of operator==().
118      * @param that The ForwardCharacterIterator to be compared for inequality
119      * @return true when the iterators refer to different
120      * text-storage objects, or to different characters in the
121      * same text-storage object
122      * @stable ICU 2.0
123      */
124     inline UBool operator!=(const ForwardCharacterIterator& that) const;
125 
126     /**
127      * Generates a hash code for this iterator.
128      * @return the hash code.
129      * @stable ICU 2.0
130      */
131     virtual int32_t hashCode(void) const = 0;
132 
133     /**
134      * Returns a UClassID for this ForwardCharacterIterator ("poor man's
135      * RTTI").<P> Despite the fact that this function is public,
136      * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
137      * @return a UClassID for this ForwardCharacterIterator
138      * @stable ICU 2.0
139      */
140     virtual UClassID getDynamicClassID(void) const = 0;
141 
142     /**
143      * Returns the current code unit and then advances to the next code unit
144      * in the iteration range
145      * (toward endIndex()).  If there are
146      * no more code units to return, returns DONE.
147      * @return the current code unit.
148      * @stable ICU 2.0
149      */
150     virtual UChar         nextPostInc(void) = 0;
151 
152     /**
153      * Returns the current code point and then advances to the next code point
154      * in the iteration range
155      * (toward endIndex()).  If there are
156      * no more code points to return, returns DONE.
157      * @return the current code point.
158      * @stable ICU 2.0
159      */
160     virtual UChar32       next32PostInc(void) = 0;
161 
162     /**
163      * Returns FALSE if there are no more code units or code points
164      * at or after the current position in the iteration range.
165      * This is used with nextPostInc() or next32PostInc() in forward
166      * iteration.
167      * @return FALSE if there are no more code units or code points
168      * at or after the current position in the iteration range.
169      * @stable ICU 2.0
170      */
171     virtual UBool        hasNext() = 0;
172 
173 protected:
174     /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
175     ForwardCharacterIterator();
176 
177     /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
178     ForwardCharacterIterator(const ForwardCharacterIterator &other);
179 
180     /**
181      * Assignment operator to be overridden in the implementing class; this base version copies nothing and returns *this.
182      * @stable ICU 2.0
183      */
184     ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
185 };
186 
187 /**
188  * Abstract class that defines an API for iteration
189  * on text objects.
190  * This is an interface for forward and backward iteration
191  * and random access into a text object.
192  *
193  * <p>The API provides backward compatibility to the Java and older ICU
194  * CharacterIterator classes but extends them significantly:
195  * <ol>
196  * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
197  * <li>While the old API functions provided forward iteration with
198  *     "pre-increment" semantics, the new one also provides functions
199  *     with "post-increment" semantics. They are more efficient and should
200  *     be the preferred iterator functions for new implementations.
201  *     The backward iteration always had "pre-decrement" semantics, which
202  *     are efficient.</li>
203  * <li>Just like ForwardCharacterIterator, it provides access to
204  *     both code units and code points. Code point access versions are available
205  *     for the old and the new iteration semantics.</li>
206  * <li>There are new functions for setting and moving the current position
207  *     without returning a character, for efficiency.</li>
208  * </ol>
209  *
210  * See ForwardCharacterIterator for examples for using the new forward iteration
211  * functions. For backward iteration, there is also a hasPrevious() function
212  * that can be used analogously to hasNext().
213  * The old functions work as before and are shown below.</p>
214  *
215  * <p>Examples for some of the new functions:</p>
216  *
217  * Forward iteration with hasNext():
218  * \code
219  * void forward1(CharacterIterator &it) {
220  *     UChar32 c;
221  *     for(it.setToStart(); it.hasNext();) {
222  *         c=it.next32PostInc();
223  *         // use c
224  *     }
225  *  }
226  * \endcode
227  * Forward iteration more similar to loops with the old forward iteration,
228  * showing a way to convert simple for() loops:
229  * \code
230  * void forward2(CharacterIterator &it) {
231  *     UChar c;
232  *     for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
233  *          // use c
234  *      }
235  * }
236  * \endcode
237  * Backward iteration with setToEnd() and hasPrevious():
238  * \code
239  *  void backward1(CharacterIterator &it) {
240  *      UChar32 c;
241  *      for(it.setToEnd(); it.hasPrevious();) {
242  *         c=it.previous32();
243  *          // use c
244  *      }
245  *  }
246  * \endcode
247  * Backward iteration with a more traditional for() loop:
248  * \code
249  * void backward2(CharacterIterator &it) {
250  *     UChar c;
251  *     for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
252  *         // use c
253  *      }
254  *  }
255  * \endcode
256  *
257  * Example for random access:
258  * \code
259  *  void random(CharacterIterator &it) {
260  *      // set to the third code point from the beginning
261  *      it.move32(3, CharacterIterator::kStart);
262  *      // get a code point from here without moving the position
263  *      UChar32 c=it.current32();
264  *      // get the position
265  *      int32_t pos=it.getIndex();
266  *      // get the previous code unit
267  *      UChar u=it.previous();
268  *      // move back one more code unit
269  *      it.move(-1, CharacterIterator::kCurrent);
270  *      // set the position back to where it was
271  *      // and read the same code point c and move beyond it
272  *      it.setIndex(pos);
273  *      if(c!=it.next32PostInc()) {
274  *          exit(1); // CharacterIterator inconsistent
275  *      }
276  *  }
277  * \endcode
278  *
279  * <p>Examples, especially for the old API:</p>
280  *
281  * Function processing characters, in this example simple output
282  * <pre>
283  * \code
284  *  void processChar( UChar c )
285  *  {
286  *      cout << " " << c;
287  *  }
288  * \endcode
289  * </pre>
290  * Traverse the text from start to finish
291  * <pre>
292  * \code
293  *  void traverseForward(CharacterIterator& iter)
294  *  {
295  *      for(UChar c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
296  *          processChar(c);
297  *      }
298  *  }
299  * \endcode
300  * </pre>
301  * Traverse the text backwards, from end to start
302  * <pre>
303  * \code
304  *  void traverseBackward(CharacterIterator& iter)
305  *  {
306  *      for(UChar c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
307  *          processChar(c);
308  *      }
309  *  }
310  * \endcode
311  * </pre>
312  * Traverse both forward and backward from a given position in the text.
313  * The Unicode::isLetter() and Unicode::isDigit() checks in this example stand in for additional stopping criteria.
314  * <pre>
315  * \code
316  * void traverseOut(CharacterIterator& iter, int32_t pos)
317  * {
318  *      UChar c;
319  *      for (c = iter.setIndex(pos);
320  *          c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
321  *          c = iter.next()) {}
322  *      int32_t end = iter.getIndex();
323  *      for (c = iter.setIndex(pos);
324  *          c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
325  *          c = iter.previous()) {}
326  *      int32_t start = iter.getIndex() + 1;
327  *
328  *      cout << "start: " << start << " end: " << end << endl;
329  *      for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
330  *          processChar(c);
331  *     }
332  *  }
333  * \endcode
334  * </pre>
335  * Creating a StringCharacterIterator and calling the test functions
336  * <pre>
337  * \code
338  *  void CharacterIterator_Example( void )
339  *   {
340  *       cout << endl << "===== CharacterIterator_Example: =====" << endl;
341  *       UnicodeString text("Ein kleiner Satz.");
342  *       StringCharacterIterator iterator(text);
343  *       cout << "----- traverseForward: -----------" << endl;
344  *       traverseForward( iterator );
345  *       cout << endl << endl << "----- traverseBackward: ----------" << endl;
346  *       traverseBackward( iterator );
347  *       cout << endl << endl << "----- traverseOut: ---------------" << endl;
348  *       traverseOut( iterator, 7 );
349  *       cout << endl << endl << "-----" << endl;
350  *   }
351  * \endcode
352  * </pre>
353  *
354  * @stable ICU 2.0
355  */
356 class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
357 public:
358     /**
359      * Origin enumeration for the move() and move32() functions: the reference point that a delta is applied to.
360      * @stable ICU 2.0
361      */
362     enum EOrigin { kStart, kCurrent, kEnd };
363 
364     /**
365      * Returns a pointer to a new CharacterIterator of the same
366      * concrete class as this one, and referring to the same
367      * character in the same text-storage object as this one.  The
368      * caller is responsible for deleting the new clone.
369      * @return a pointer to a new CharacterIterator
370      * @stable ICU 2.0
371      */
372     virtual CharacterIterator* clone(void) const = 0;
373 
374     /**
375      * Sets the iterator to refer to the first code unit in its
376      * iteration range, and returns that code unit.
377      * This can be used to begin an iteration with next().
378      * @return the first code unit in its iteration range.
379      * @stable ICU 2.0
380      */
381     virtual UChar         first(void) = 0;
382 
383     /**
384      * Sets the iterator to refer to the first code unit in its
385      * iteration range, returns that code unit, and moves the position
386      * to the second code unit. This is an alternative to setToStart()
387      * for forward iteration with nextPostInc().
388      * @return the first code unit in its iteration range.
389      * @stable ICU 2.0
390      */
391     virtual UChar         firstPostInc(void);
392 
393     /**
394      * Sets the iterator to refer to the first code point in its
395      * iteration range, and returns that code point.
396      * This can be used to begin an iteration with next32().
397      * Note that an iteration with next32PostInc(), beginning with,
398      * e.g., setToStart() or firstPostInc(), is more efficient.
399      * @return the first code point in its iteration range.
400      * @stable ICU 2.0
401      */
402     virtual UChar32       first32(void) = 0;
403 
404     /**
405      * Sets the iterator to refer to the first code point in its
406      * iteration range, returns that code point, and moves the position
407      * to the second code point. This is an alternative to setToStart()
408      * for forward iteration with next32PostInc().
409      * @return the first code point in its iteration range.
410      * @stable ICU 2.0
411      */
412     virtual UChar32       first32PostInc(void);
413 
414     /**
415      * Sets the iterator to refer to the first code unit or code point in its
416      * iteration range. This can be used to begin a forward
417      * iteration with nextPostInc() or next32PostInc(). Implemented as move(0, kStart).
418      * @return the start position of the iteration range
419      * @stable ICU 2.0
420      */
421     inline int32_t    setToStart();
422 
423     /**
424      * Sets the iterator to refer to the last code unit in its
425      * iteration range, and returns that code unit.
426      * This can be used to begin an iteration with previous().
427      * @return the last code unit.
428      * @stable ICU 2.0
429      */
430     virtual UChar         last(void) = 0;
431 
432     /**
433      * Sets the iterator to refer to the last code point in its
434      * iteration range, and returns that code point.
435      * This can be used to begin an iteration with previous32().
436      * @return the last code point.
437      * @stable ICU 2.0
438      */
439     virtual UChar32       last32(void) = 0;
440 
441     /**
442      * Sets the iterator to the end of its iteration range, just behind
443      * the last code unit or code point. This can be used to begin a backward
444      * iteration with previous() or previous32(). Implemented as move(0, kEnd).
445      * @return the end position of the iteration range
446      * @stable ICU 2.0
447      */
448     inline int32_t    setToEnd();
449 
450     /**
451      * Sets the iterator to refer to the "position"-th code unit
452      * in the text-storage object the iterator refers to, and
453      * returns that code unit.
454      * @param position the "position"-th code unit in the text-storage object
455      * @return the "position"-th code unit.
456      * @stable ICU 2.0
457      */
458     virtual UChar         setIndex(int32_t position) = 0;
459 
460     /**
461      * Sets the iterator to refer to the beginning of the code point
462      * that contains the "position"-th code unit
463      * in the text-storage object the iterator refers to, and
464      * returns that code point.
465      * The current position is adjusted to the beginning of the code point
466      * (its first code unit).
467      * @param position the "position"-th code unit in the text-storage object
468      * @return the code point that contains the "position"-th code unit.
469      * @stable ICU 2.0
470      */
471     virtual UChar32       setIndex32(int32_t position) = 0;
472 
473     /**
474      * Returns the code unit the iterator currently refers to.
475      * @return the current code unit.
476      * @stable ICU 2.0
477      */
478     virtual UChar         current(void) const = 0;
479 
480     /**
481      * Returns the code point the iterator currently refers to.
482      * @return the current code point.
483      * @stable ICU 2.0
484      */
485     virtual UChar32       current32(void) const = 0;
486 
487     /**
488      * Advances to the next code unit in the iteration range
489      * (toward endIndex()), and returns that code unit.  If there are
490      * no more code units to return, returns DONE.
491      * @return the next code unit.
492      * @stable ICU 2.0
493      */
494     virtual UChar         next(void) = 0;
495 
496     /**
497      * Advances to the next code point in the iteration range
498      * (toward endIndex()), and returns that code point.  If there are
499      * no more code points to return, returns DONE.
500      * Note that iteration with "pre-increment" semantics is less
501      * efficient than iteration with "post-increment" semantics
502      * that is provided by next32PostInc().
503      * @return the next code point.
504      * @stable ICU 2.0
505      */
506     virtual UChar32       next32(void) = 0;
507 
508     /**
509      * Moves back to the previous code unit in the iteration range
510      * (toward startIndex()), and returns that code unit.  If there are
511      * no more code units to return, returns DONE.
512      * @return the previous code unit.
513      * @stable ICU 2.0
514      */
515     virtual UChar         previous(void) = 0;
516 
517     /**
518      * Moves back to the previous code point in the iteration range
519      * (toward startIndex()), and returns that code point.  If there are
520      * no more code points to return, returns DONE.
521      * @return the previous code point.
522      * @stable ICU 2.0
523      */
524     virtual UChar32       previous32(void) = 0;
525 
526     /**
527      * Returns FALSE if there are no more code units or code points
528      * before the current position in the iteration range.
529      * This is used with previous() or previous32() in backward
530      * iteration.
531      * @return FALSE if there are no more code units or code points
532      * before the current position in the iteration range, TRUE otherwise.
533      * @stable ICU 2.0
534      */
535     virtual UBool        hasPrevious() = 0;
536 
537     /**
538      * Returns the numeric index in the underlying text-storage
539      * object of the character returned by first().  Since it's
540      * possible to create an iterator that iterates across only
541      * part of a text-storage object, this number isn't
542      * necessarily 0.
543      * @return the numeric index in the underlying text-storage
544      * object of the character returned by first().
545      * @stable ICU 2.0
546      */
547     inline int32_t       startIndex(void) const;
548 
549     /**
550      * Returns the numeric index in the underlying text-storage
551      * object of the position immediately BEYOND the character
552      * returned by last().
553      * @return the numeric index in the underlying text-storage
554      * object of the position immediately BEYOND the character
555      * returned by last().
556      * @stable ICU 2.0
557      */
558     inline int32_t       endIndex(void) const;
559 
560     /**
561      * Returns the numeric index in the underlying text-storage
562      * object of the character the iterator currently refers to
563      * (i.e., the character returned by current()).
564      * @return the numeric index in the text-storage object of
565      * the character the iterator currently refers to
566      * @stable ICU 2.0
567      */
568     inline int32_t       getIndex(void) const;
569 
570     /**
571      * Returns the length of the entire text in the underlying
572      * text-storage object.
573      * @return the length of the entire text in the text-storage object
574      * @stable ICU 2.0
575      */
576     inline int32_t           getLength() const;
577 
578     /**
579      * Moves the current position relative to the start or end of the
580      * iteration range, or relative to the current position itself.
581      * The movement is expressed in numbers of code units forward
582      * or backward by specifying a positive or negative delta.
583      * @param delta the position relative to origin. A positive delta means forward;
584      * a negative delta means backward.
585      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
586      * @return the new position
587      * @stable ICU 2.0
588      */
589     virtual int32_t      move(int32_t delta, EOrigin origin) = 0;
590 
591     /**
592      * Moves the current position relative to the start or end of the
593      * iteration range, or relative to the current position itself.
594      * The movement is expressed in numbers of code points forward
595      * or backward by specifying a positive or negative delta.
596      * @param delta the position relative to origin. A positive delta means forward;
597      * a negative delta means backward.
598      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
599      * @return the new position
600      * @stable ICU 2.0
601      */
602     virtual int32_t      move32(int32_t delta, EOrigin origin) = 0;
603 
604     /**
605      * Copies the text under iteration into the UnicodeString
606      * referred to by "result".
607      * @param result Receives a copy of the text under iteration.
608      * @stable ICU 2.0
609      */
610     virtual void            getText(UnicodeString&  result) = 0;
611 
612 protected:
613     /**
614      * Empty constructor.
615      * @stable ICU 2.0
616      */
617     CharacterIterator();
618 
619     /**
620      * Constructor, just setting the length field in this base class.
621      * @stable ICU 2.0
622      */
623     CharacterIterator(int32_t length);
624 
625     /**
626      * Constructor, just setting the length and position fields in this base class.
627      * @stable ICU 2.0
628      */
629     CharacterIterator(int32_t length, int32_t position);
630 
631     /**
632      * Constructor, just setting the length, start, end, and position fields in this base class.
633      * @stable ICU 2.0
634      */
635     CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
636 
637     /**
638      * Copy constructor.
639      *
640      * @param that The CharacterIterator to be copied
641      * @stable ICU 2.0
642      */
643     CharacterIterator(const CharacterIterator &that);
644 
645     /**
646      * Assignment operator.  Sets this CharacterIterator to have the same behavior
647      * as the one passed in.
648      * @param that The CharacterIterator passed in.
649      * @return the newly set CharacterIterator.
650      * @stable ICU 2.0
651      */
652     CharacterIterator &operator=(const CharacterIterator &that);
653 
654     /**
655      * Base class text length field; the value returned by getLength().
656      * Necessary for correct getText() and hashCode().
657      * @stable ICU 2.0
658      */
659     int32_t textLength;
660 
661     /**
662      * Base class field for the current position; the value returned by getIndex().
663      * @stable ICU 2.0
664      */
665     int32_t  pos;
666 
667     /**
668      * Base class field for the start of the iteration range; the value returned by startIndex().
669      * @stable ICU 2.0
670      */
671     int32_t  begin;
672 
673     /**
674      * Base class field for the end of the iteration range; the value returned by endIndex().
675      * @stable ICU 2.0
676      */
677     int32_t  end;
678 };
679 
680 inline UBool ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
681     // Inequality is exactly the negation of the virtual operator==, so subclasses only implement equality.
682     return !this->operator==(that);
683 }
684 
685 // Moves to the start of the iteration range (zero offset from kStart) and returns the position reported by move().
686 inline int32_t CharacterIterator::setToStart() {
687     return move(0, kStart);
688 }
689 
690 // Moves to the end of the iteration range (zero offset from kEnd) and returns the position reported by move().
691 inline int32_t CharacterIterator::setToEnd() {
692     return move(0, kEnd);
693 }
694 
695 // Returns the base-class `begin` field, i.e. the start of the iteration range.
696 inline int32_t CharacterIterator::startIndex(void) const {
697     return begin;
698 }
699 
700 // Returns the base-class `end` field, i.e. the end of the iteration range.
701 inline int32_t CharacterIterator::endIndex(void) const {
702     return end;
703 }
704 
705 // Returns the base-class `pos` field, i.e. the current position of the iterator.
706 inline int32_t CharacterIterator::getIndex(void) const {
707     return pos;
708 }
709 
710 // Returns the base-class `textLength` field, i.e. the length of the entire underlying text.
711 inline int32_t CharacterIterator::getLength(void) const {
712     return textLength;
713 }
714 
715 U_NAMESPACE_END
716 #endif
717