• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ***************************************************************************
3 *   Copyright (C) 1999-2012 International Business Machines Corporation
4 *   and others. All rights reserved.
5 ***************************************************************************
6 */
7 //
8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
9 //                   runtime engine and the API implementation for
10 //                   class RuleBasedBreakIterator
11 //
12 
13 #include "utypeinfo.h"  // for 'typeid' to work
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_BREAK_ITERATION
18 
19 #include "unicode/rbbi.h"
20 #include "unicode/schriter.h"
21 #include "unicode/uchriter.h"
22 #include "unicode/udata.h"
23 #include "unicode/uclean.h"
24 #include "rbbidata.h"
25 #include "rbbirb.h"
26 #include "cmemory.h"
27 #include "cstring.h"
28 #include "umutex.h"
29 #include "ucln_cmn.h"
30 #include "brkeng.h"
31 
32 #include "uassert.h"
33 #include "uvector.h"
34 
35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
36 #if U_LOCAL_SERVICE_HOOK
37 #include "localsvc.h"
38 #endif
39 
40 #ifdef RBBI_DEBUG
41 static UBool fTrace = FALSE;
42 #endif
43 
44 U_NAMESPACE_BEGIN
45 
46 // The state number of the starting state
47 #define START_STATE 1
48 
49 // The state-transition value indicating "stop"
50 #define STOP_STATE  0
51 
52 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
54 
55 
56 //=======================================================================
57 // constructors
58 //=======================================================================
59 
60 /**
61  * Constructs a RuleBasedBreakIterator that uses the already-created
62  * tables object that is passed in as a parameter.
63  */
64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
65 {
66     init();
67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
68     if (U_FAILURE(status)) {return;}
69     if(fData == 0) {
70         status = U_MEMORY_ALLOCATION_ERROR;
71         return;
72     }
73 }
74 
75 /**
76  * Same as above but does not adopt memory
77  */
RuleBasedBreakIterator(const RBBIDataHeader * data,enum EDontAdopt,UErrorCode & status)78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
79 {
80     init();
81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
82     if (U_FAILURE(status)) {return;}
83     if(fData == 0) {
84         status = U_MEMORY_ALLOCATION_ERROR;
85         return;
86     }
87 }
88 
89 
90 //
91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
93 //
RuleBasedBreakIterator(const uint8_t * compiledRules,uint32_t ruleLength,UErrorCode & status)94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
95                        uint32_t       ruleLength,
96                        UErrorCode     &status) {
97     init();
98     if (U_FAILURE(status)) {
99         return;
100     }
101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
102         status = U_ILLEGAL_ARGUMENT_ERROR;
103         return;
104     }
105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
106     if (data->fLength > ruleLength) {
107         status = U_ILLEGAL_ARGUMENT_ERROR;
108         return;
109     }
110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
111     if (U_FAILURE(status)) {return;}
112     if(fData == 0) {
113         status = U_MEMORY_ALLOCATION_ERROR;
114         return;
115     }
116 }
117 
118 
119 //-------------------------------------------------------------------------------
120 //
121 //   Constructor   from a UDataMemory handle to precompiled break rules
122 //                 stored in an ICU data file.
123 //
124 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(UDataMemory * udm,UErrorCode & status)125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
126 {
127     init();
128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
129     if (U_FAILURE(status)) {return;}
130     if(fData == 0) {
131         status = U_MEMORY_ALLOCATION_ERROR;
132         return;
133     }
134 }
135 
136 
137 
138 //-------------------------------------------------------------------------------
139 //
140 //   Constructor       from a set of rules supplied as a string.
141 //
142 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const UnicodeString & rules,UParseError & parseError,UErrorCode & status)143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
144                                                 UParseError          &parseError,
145                                                 UErrorCode           &status)
146 {
147     init();
148     if (U_FAILURE(status)) {return;}
149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
152     //        creates and returns a complete RBBI.  From here, in a constructor, we
153     //        can't just return the object created by the builder factory, hence
154     //        the assignment of the factory created object to "this".
155     if (U_SUCCESS(status)) {
156         *this = *bi;
157         delete bi;
158     }
159 }
160 
161 
162 //-------------------------------------------------------------------------------
163 //
164 // Default Constructor.      Create an empty shell that can be set up later.
165 //                           Used when creating a RuleBasedBreakIterator from a set
166 //                           of rules.
167 //-------------------------------------------------------------------------------
RuleBasedBreakIterator()168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
169     init();
170 }
171 
172 
173 //-------------------------------------------------------------------------------
174 //
175 //   Copy constructor.  Will produce a break iterator with the same behavior,
176 //                      and which iterates over the same text, as the one passed in.
177 //
178 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const RuleBasedBreakIterator & other)179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
180 : BreakIterator(other)
181 {
182     this->init();
183     *this = other;
184 }
185 
186 
187 /**
188  * Destructor
189  */
~RuleBasedBreakIterator()190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
192         // fCharIter was adopted from the outside.
193         delete fCharIter;
194     }
195     fCharIter = NULL;
196     delete fSCharIter;
197     fCharIter = NULL;
198     delete fDCharIter;
199     fDCharIter = NULL;
200 
201     utext_close(fText);
202 
203     if (fData != NULL) {
204         fData->removeReference();
205         fData = NULL;
206     }
207     if (fCachedBreakPositions) {
208         uprv_free(fCachedBreakPositions);
209         fCachedBreakPositions = NULL;
210     }
211     if (fLanguageBreakEngines) {
212         delete fLanguageBreakEngines;
213         fLanguageBreakEngines = NULL;
214     }
215     if (fUnhandledBreakEngine) {
216         delete fUnhandledBreakEngine;
217         fUnhandledBreakEngine = NULL;
218     }
219 }
220 
221 /**
222  * Assignment operator.  Sets this iterator to have the same behavior,
223  * and iterate over the same text, as the one passed in.
224  */
225 RuleBasedBreakIterator&
operator =(const RuleBasedBreakIterator & that)226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
227     if (this == &that) {
228         return *this;
229     }
230     reset();    // Delete break cache information
231     fBreakType = that.fBreakType;
232     if (fLanguageBreakEngines != NULL) {
233         delete fLanguageBreakEngines;
234         fLanguageBreakEngines = NULL;   // Just rebuild for now
235     }
236     // TODO: clone fLanguageBreakEngines from "that"
237     UErrorCode status = U_ZERO_ERROR;
238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
239 
240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
241         delete fCharIter;
242     }
243     fCharIter = NULL;
244 
245     if (that.fCharIter != NULL ) {
246         // This is a little bit tricky - it will intially appear that
247         //  this->fCharIter is adopted, even if that->fCharIter was
248         //  not adopted.  That's ok.
249         fCharIter = that.fCharIter->clone();
250     }
251 
252     if (fData != NULL) {
253         fData->removeReference();
254         fData = NULL;
255     }
256     if (that.fData != NULL) {
257         fData = that.fData->addReference();
258     }
259 
260     return *this;
261 }
262 
263 
264 
265 //-----------------------------------------------------------------------------
266 //
267 //    init()      Shared initialization routine.   Used by all the constructors.
268 //                Initializes all fields, leaving the object in a consistent state.
269 //
270 //-----------------------------------------------------------------------------
init()271 void RuleBasedBreakIterator::init() {
272     UErrorCode  status    = U_ZERO_ERROR;
273     fBufferClone          = FALSE;
274     fText                 = utext_openUChars(NULL, NULL, 0, &status);
275     fCharIter             = NULL;
276     fSCharIter            = NULL;
277     fDCharIter            = NULL;
278     fData                 = NULL;
279     fLastRuleStatusIndex  = 0;
280     fLastStatusIndexValid = TRUE;
281     fDictionaryCharCount  = 0;
282     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
283                                         //   dictionary behavior for Break Iterators that are
284                                         //   built from rules.  Even better would be the ability to
285                                         //   declare the type in the rules.
286 
287     fCachedBreakPositions    = NULL;
288     fLanguageBreakEngines    = NULL;
289     fUnhandledBreakEngine    = NULL;
290     fNumCachedBreakPositions = 0;
291     fPositionInCache         = 0;
292 
293 #ifdef RBBI_DEBUG
294     static UBool debugInitDone = FALSE;
295     if (debugInitDone == FALSE) {
296         char *debugEnv = getenv("U_RBBIDEBUG");
297         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
298             fTrace = TRUE;
299         }
300         debugInitDone = TRUE;
301     }
302 #endif
303 }
304 
305 
306 
307 //-----------------------------------------------------------------------------
308 //
309 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
310 //            behavior, and iterating over the same text, as this one.
311 //            Virtual function: does the right thing with subclasses.
312 //
313 //-----------------------------------------------------------------------------
314 BreakIterator*
clone(void) const315 RuleBasedBreakIterator::clone(void) const {
316     return new RuleBasedBreakIterator(*this);
317 }
318 
319 /**
320  * Equality operator.  Returns TRUE if both BreakIterators are of the
321  * same class, have the same behavior, and iterate over the same text.
322  */
323 UBool
operator ==(const BreakIterator & that) const324 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
325     if (typeid(*this) != typeid(that)) {
326         return FALSE;
327     }
328 
329     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
330 
331     if (!utext_equals(fText, that2.fText)) {
332         // The two break iterators are operating on different text,
333         //   or have a different interation position.
334         return FALSE;
335     };
336 
337     // TODO:  need a check for when in a dictionary region at different offsets.
338 
339     if (that2.fData == fData ||
340         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
341             // The two break iterators are using the same rules.
342             return TRUE;
343         }
344     return FALSE;
345 }
346 
347 /**
348  * Compute a hash code for this BreakIterator
349  * @return A hash code
350  */
351 int32_t
hashCode(void) const352 RuleBasedBreakIterator::hashCode(void) const {
353     int32_t   hash = 0;
354     if (fData != NULL) {
355         hash = fData->hashCode();
356     }
357     return hash;
358 }
359 
360 
setText(UText * ut,UErrorCode & status)361 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
362     if (U_FAILURE(status)) {
363         return;
364     }
365     reset();
366     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
367 
368     // Set up a dummy CharacterIterator to be returned if anyone
369     //   calls getText().  With input from UText, there is no reasonable
370     //   way to return a characterIterator over the actual input text.
371     //   Return one over an empty string instead - this is the closest
372     //   we can come to signaling a failure.
373     //   (GetText() is obsolete, this failure is sort of OK)
374     if (fDCharIter == NULL) {
375         static const UChar c = 0;
376         fDCharIter = new UCharCharacterIterator(&c, 0);
377         if (fDCharIter == NULL) {
378             status = U_MEMORY_ALLOCATION_ERROR;
379             return;
380         }
381     }
382 
383     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
384         // existing fCharIter was adopted from the outside.  Delete it now.
385         delete fCharIter;
386     }
387     fCharIter = fDCharIter;
388 
389     this->first();
390 }
391 
392 
getUText(UText * fillIn,UErrorCode & status) const393 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
394     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
395     return result;
396 }
397 
398 
399 
400 /**
401  * Returns the description used to create this iterator
402  */
403 const UnicodeString&
getRules() const404 RuleBasedBreakIterator::getRules() const {
405     if (fData != NULL) {
406         return fData->getRuleSourceString();
407     } else {
408         static const UnicodeString *s;
409         if (s == NULL) {
410             // TODO:  something more elegant here.
411             //        perhaps API should return the string by value.
412             //        Note:  thread unsafe init & leak are semi-ok, better than
413             //               what was before.  Sould be cleaned up, though.
414             s = new UnicodeString;
415         }
416         return *s;
417     }
418 }
419 
420 //=======================================================================
421 // BreakIterator overrides
422 //=======================================================================
423 
424 /**
425  * Return a CharacterIterator over the text being analyzed.
426  */
427 CharacterIterator&
getText() const428 RuleBasedBreakIterator::getText() const {
429     return *fCharIter;
430 }
431 
432 /**
433  * Set the iterator to analyze a new piece of text.  This function resets
434  * the current iteration position to the beginning of the text.
435  * @param newText An iterator over the text to analyze.
436  */
437 void
adoptText(CharacterIterator * newText)438 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
439     // If we are holding a CharacterIterator adopted from a
440     //   previous call to this function, delete it now.
441     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
442         delete fCharIter;
443     }
444 
445     fCharIter = newText;
446     UErrorCode status = U_ZERO_ERROR;
447     reset();
448     if (newText==NULL || newText->startIndex() != 0) {
449         // startIndex !=0 wants to be an error, but there's no way to report it.
450         // Make the iterator text be an empty string.
451         fText = utext_openUChars(fText, NULL, 0, &status);
452     } else {
453         fText = utext_openCharacterIterator(fText, newText, &status);
454     }
455     this->first();
456 }
457 
458 /**
459  * Set the iterator to analyze a new piece of text.  This function resets
460  * the current iteration position to the beginning of the text.
461  * @param newText An iterator over the text to analyze.
462  */
463 void
setText(const UnicodeString & newText)464 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
465     UErrorCode status = U_ZERO_ERROR;
466     reset();
467     fText = utext_openConstUnicodeString(fText, &newText, &status);
468 
469     // Set up a character iterator on the string.
470     //   Needed in case someone calls getText().
471     //  Can not, unfortunately, do this lazily on the (probably never)
472     //  call to getText(), because getText is const.
473     if (fSCharIter == NULL) {
474         fSCharIter = new StringCharacterIterator(newText);
475     } else {
476         fSCharIter->setText(newText);
477     }
478 
479     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
480         // old fCharIter was adopted from the outside.  Delete it.
481         delete fCharIter;
482     }
483     fCharIter = fSCharIter;
484 
485     this->first();
486 }
487 
488 
489 /**
490  *  Provide a new UText for the input text.  Must reference text with contents identical
491  *  to the original.
492  *  Intended for use with text data originating in Java (garbage collected) environments
493  *  where the data may be moved in memory at arbitrary times.
494  */
refreshInputText(UText * input,UErrorCode & status)495 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
496     if (U_FAILURE(status)) {
497         return *this;
498     }
499     if (input == NULL) {
500         status = U_ILLEGAL_ARGUMENT_ERROR;
501         return *this;
502     }
503     int64_t pos = utext_getNativeIndex(fText);
504     //  Shallow read-only clone of the new UText into the existing input UText
505     fText = utext_clone(fText, input, FALSE, TRUE, &status);
506     if (U_FAILURE(status)) {
507         return *this;
508     }
509     utext_setNativeIndex(fText, pos);
510     if (utext_getNativeIndex(fText) != pos) {
511         // Sanity check.  The new input utext is supposed to have the exact same
512         // contents as the old.  If we can't set to the same position, it doesn't.
513         // The contents underlying the old utext might be invalid at this point,
514         // so it's not safe to check directly.
515         status = U_ILLEGAL_ARGUMENT_ERROR;
516     }
517     return *this;
518 }
519 
520 
521 /**
522  * Sets the current iteration position to the beginning of the text.
523  * @return The offset of the beginning of the text.
524  */
first(void)525 int32_t RuleBasedBreakIterator::first(void) {
526     reset();
527     fLastRuleStatusIndex  = 0;
528     fLastStatusIndexValid = TRUE;
529     //if (fText == NULL)
530     //    return BreakIterator::DONE;
531 
532     utext_setNativeIndex(fText, 0);
533     return 0;
534 }
535 
536 /**
537  * Sets the current iteration position to the end of the text.
538  * @return The text's past-the-end offset.
539  */
last(void)540 int32_t RuleBasedBreakIterator::last(void) {
541     reset();
542     if (fText == NULL) {
543         fLastRuleStatusIndex  = 0;
544         fLastStatusIndexValid = TRUE;
545         return BreakIterator::DONE;
546     }
547 
548     fLastStatusIndexValid = FALSE;
549     int32_t pos = (int32_t)utext_nativeLength(fText);
550     utext_setNativeIndex(fText, pos);
551     return pos;
552 }
553 
554 /**
555  * Advances the iterator either forward or backward the specified number of steps.
556  * Negative values move backward, and positive values move forward.  This is
557  * equivalent to repeatedly calling next() or previous().
558  * @param n The number of steps to move.  The sign indicates the direction
559  * (negative is backwards, and positive is forwards).
560  * @return The character offset of the boundary position n boundaries away from
561  * the current one.
562  */
next(int32_t n)563 int32_t RuleBasedBreakIterator::next(int32_t n) {
564     int32_t result = current();
565     while (n > 0) {
566         result = next();
567         --n;
568     }
569     while (n < 0) {
570         result = previous();
571         ++n;
572     }
573     return result;
574 }
575 
576 /**
577  * Advances the iterator to the next boundary position.
578  * @return The position of the first boundary after this one.
579  */
next(void)580 int32_t RuleBasedBreakIterator::next(void) {
581     // if we have cached break positions and we're still in the range
582     // covered by them, just move one step forward in the cache
583     if (fCachedBreakPositions != NULL) {
584         if (fPositionInCache < fNumCachedBreakPositions - 1) {
585             ++fPositionInCache;
586             int32_t pos = fCachedBreakPositions[fPositionInCache];
587             utext_setNativeIndex(fText, pos);
588             return pos;
589         }
590         else {
591             reset();
592         }
593     }
594 
595     int32_t startPos = current();
596     int32_t result = handleNext(fData->fForwardTable);
597     if (fDictionaryCharCount > 0) {
598         result = checkDictionary(startPos, result, FALSE);
599     }
600     return result;
601 }
602 
603 /**
604  * Advances the iterator backwards, to the last boundary preceding this one.
605  * @return The position of the last boundary position preceding this one.
606  */
previous(void)607 int32_t RuleBasedBreakIterator::previous(void) {
608     int32_t result;
609     int32_t startPos;
610 
611     // if we have cached break positions and we're still in the range
612     // covered by them, just move one step backward in the cache
613     if (fCachedBreakPositions != NULL) {
614         if (fPositionInCache > 0) {
615             --fPositionInCache;
616             // If we're at the beginning of the cache, need to reevaluate the
617             // rule status
618             if (fPositionInCache <= 0) {
619                 fLastStatusIndexValid = FALSE;
620             }
621             int32_t pos = fCachedBreakPositions[fPositionInCache];
622             utext_setNativeIndex(fText, pos);
623             return pos;
624         }
625         else {
626             reset();
627         }
628     }
629 
630     // if we're already sitting at the beginning of the text, return DONE
631     if (fText == NULL || (startPos = current()) == 0) {
632         fLastRuleStatusIndex  = 0;
633         fLastStatusIndexValid = TRUE;
634         return BreakIterator::DONE;
635     }
636 
637     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
638         result = handlePrevious(fData->fReverseTable);
639         if (fDictionaryCharCount > 0) {
640             result = checkDictionary(result, startPos, TRUE);
641         }
642         return result;
643     }
644 
645     // old rule syntax
646     // set things up.  handlePrevious() will back us up to some valid
647     // break position before the current position (we back our internal
648     // iterator up one step to prevent handlePrevious() from returning
649     // the current position), but not necessarily the last one before
650 
651     // where we started
652 
653     int32_t start = current();
654 
655     (void)UTEXT_PREVIOUS32(fText);
656     int32_t lastResult    = handlePrevious(fData->fReverseTable);
657     if (lastResult == UBRK_DONE) {
658         lastResult = 0;
659         utext_setNativeIndex(fText, 0);
660     }
661     result = lastResult;
662     int32_t lastTag       = 0;
663     UBool   breakTagValid = FALSE;
664 
665     // iterate forward from the known break position until we pass our
666     // starting point.  The last break position before the starting
667     // point is our return value
668 
669     for (;;) {
670         result         = next();
671         if (result == BreakIterator::DONE || result >= start) {
672             break;
673         }
674         lastResult     = result;
675         lastTag        = fLastRuleStatusIndex;
676         breakTagValid  = TRUE;
677     }
678 
679     // fLastBreakTag wants to have the value for section of text preceding
680     // the result position that we are to return (in lastResult.)  If
681     // the backwards rules overshot and the above loop had to do two or more
682     // next()s to move up to the desired return position, we will have a valid
683     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
684     // we wont have a tag value for that position, which is only set by handleNext().
685 
686     // set the current iteration position to be the last break position
687     // before where we started, and then return that value
688     utext_setNativeIndex(fText, lastResult);
689     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
690     fLastStatusIndexValid = breakTagValid;
691 
692     // No need to check the dictionary; it will have been handled by
693     // next()
694 
695     return lastResult;
696 }
697 
698 /**
699  * Sets the iterator to refer to the first boundary position following
700  * the specified position.
701  * @offset The position from which to begin searching for a break position.
702  * @return The position of the first break after the current position.
703  */
following(int32_t offset)704 int32_t RuleBasedBreakIterator::following(int32_t offset) {
705     // if we have cached break positions and offset is in the range
706     // covered by them, use them
707     // TODO: could use binary search
708     // TODO: what if offset is outside range, but break is not?
709     if (fCachedBreakPositions != NULL) {
710         if (offset >= fCachedBreakPositions[0]
711                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
712             fPositionInCache = 0;
713             // We are guaranteed not to leave the array due to range test above
714             while (offset >= fCachedBreakPositions[fPositionInCache]) {
715                 ++fPositionInCache;
716             }
717             int32_t pos = fCachedBreakPositions[fPositionInCache];
718             utext_setNativeIndex(fText, pos);
719             return pos;
720         }
721         else {
722             reset();
723         }
724     }
725 
726     // if the offset passed in is already past the end of the text,
727     // just return DONE; if it's before the beginning, return the
728     // text's starting offset
729     fLastRuleStatusIndex  = 0;
730     fLastStatusIndexValid = TRUE;
731     if (fText == NULL || offset >= utext_nativeLength(fText)) {
732         last();
733         return next();
734     }
735     else if (offset < 0) {
736         return first();
737     }
738 
739     // otherwise, set our internal iteration position (temporarily)
740     // to the position passed in.  If this is the _beginning_ position,
741     // then we can just use next() to get our return value
742 
743     int32_t result = 0;
744 
745     if (fData->fSafeRevTable != NULL) {
746         // new rule syntax
747         utext_setNativeIndex(fText, offset);
748         // move forward one codepoint to prepare for moving back to a
749         // safe point.
750         // this handles offset being between a supplementary character
751         (void)UTEXT_NEXT32(fText);
752         // handlePrevious will move most of the time to < 1 boundary away
753         handlePrevious(fData->fSafeRevTable);
754         int32_t result = next();
755         while (result <= offset) {
756             result = next();
757         }
758         return result;
759     }
760     if (fData->fSafeFwdTable != NULL) {
761         // backup plan if forward safe table is not available
762         utext_setNativeIndex(fText, offset);
763         (void)UTEXT_PREVIOUS32(fText);
764         // handle next will give result >= offset
765         handleNext(fData->fSafeFwdTable);
766         // previous will give result 0 or 1 boundary away from offset,
767         // most of the time
768         // we have to
769         int32_t oldresult = previous();
770         while (oldresult > offset) {
771             int32_t result = previous();
772             if (result <= offset) {
773                 return oldresult;
774             }
775             oldresult = result;
776         }
777         int32_t result = next();
778         if (result <= offset) {
779             return next();
780         }
781         return result;
782     }
783     // otherwise, we have to sync up first.  Use handlePrevious() to back
784     // up to a known break position before the specified position (if
785     // we can determine that the specified position is a break position,
786     // we don't back up at all).  This may or may not be the last break
787     // position at or before our starting position.  Advance forward
788     // from here until we've passed the starting position.  The position
789     // we stop on will be the first break position after the specified one.
790     // old rule syntax
791 
792     utext_setNativeIndex(fText, offset);
793     if (offset==0 ||
794         (offset==1  && utext_getNativeIndex(fText)==0)) {
795         return next();
796     }
797     result = previous();
798 
799     while (result != BreakIterator::DONE && result <= offset) {
800         result = next();
801     }
802 
803     return result;
804 }
805 
806 /**
807  * Sets the iterator to refer to the last boundary position before the
808  * specified position.
809  * @offset The position to begin searching for a break from.
810  * @return The position of the last boundary before the starting position.
811  */
preceding(int32_t offset)812 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
813     // if we have cached break positions and offset is in the range
814     // covered by them, use them
815     if (fCachedBreakPositions != NULL) {
816         // TODO: binary search?
817         // TODO: What if offset is outside range, but break is not?
818         if (offset > fCachedBreakPositions[0]
819                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
820             fPositionInCache = 0;
821             while (fPositionInCache < fNumCachedBreakPositions
822                    && offset > fCachedBreakPositions[fPositionInCache])
823                 ++fPositionInCache;
824             --fPositionInCache;
825             // If we're at the beginning of the cache, need to reevaluate the
826             // rule status
827             if (fPositionInCache <= 0) {
828                 fLastStatusIndexValid = FALSE;
829             }
830             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
831             return fCachedBreakPositions[fPositionInCache];
832         }
833         else {
834             reset();
835         }
836     }
837 
838     // if the offset passed in is already past the end of the text,
839     // just return DONE; if it's before the beginning, return the
840     // text's starting offset
841     if (fText == NULL || offset > utext_nativeLength(fText)) {
842         // return BreakIterator::DONE;
843         return last();
844     }
845     else if (offset < 0) {
846         return first();
847     }
848 
849     // if we start by updating the current iteration position to the
850     // position specified by the caller, we can just use previous()
851     // to carry out this operation
852 
853     if (fData->fSafeFwdTable != NULL) {
854         // new rule syntax
855         utext_setNativeIndex(fText, offset);
856         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
857         if (newOffset != offset) {
858             // Will come here if specified offset was not a code point boundary AND
859             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
860             //   indices to the containing code point.
861             // For breakitereator::preceding only, these non-code-point indices need to be moved
862             //   up to refer to the following codepoint.
863             (void)UTEXT_NEXT32(fText);
864             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
865         }
866 
867         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
868         //        rather than adjusting the position unconditionally?
869         //        (Change would interact with safe rules.)
870         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
871         //        affects only preceding(), seems cleaner, but is slightly different.
872         (void)UTEXT_PREVIOUS32(fText);
873         handleNext(fData->fSafeFwdTable);
874         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
875         while (result >= offset) {
876             result = previous();
877         }
878         return result;
879     }
880     if (fData->fSafeRevTable != NULL) {
881         // backup plan if forward safe table is not available
882         //  TODO:  check whether this path can be discarded
883         //         It's probably OK to say that rules must supply both safe tables
884         //            if they use safe tables at all.  We have certainly never described
885         //            to anyone how to work with just one safe table.
886         utext_setNativeIndex(fText, offset);
887         (void)UTEXT_NEXT32(fText);
888 
889         // handle previous will give result <= offset
890         handlePrevious(fData->fSafeRevTable);
891 
892         // next will give result 0 or 1 boundary away from offset,
893         // most of the time
894         // we have to
895         int32_t oldresult = next();
896         while (oldresult < offset) {
897             int32_t result = next();
898             if (result >= offset) {
899                 return oldresult;
900             }
901             oldresult = result;
902         }
903         int32_t result = previous();
904         if (result >= offset) {
905             return previous();
906         }
907         return result;
908     }
909 
910     // old rule syntax
911     utext_setNativeIndex(fText, offset);
912     return previous();
913 }
914 
915 /**
916  * Returns true if the specfied position is a boundary position.  As a side
917  * effect, leaves the iterator pointing to the first boundary position at
918  * or after "offset".
919  * @param offset the offset to check.
920  * @return True if "offset" is a boundary position.
921  */
isBoundary(int32_t offset)922 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
923     // the beginning index of the iterator is always a boundary position by definition
924     if (offset == 0) {
925         first();       // For side effects on current position, tag values.
926         return TRUE;
927     }
928 
929     if (offset == (int32_t)utext_nativeLength(fText)) {
930         last();       // For side effects on current position, tag values.
931         return TRUE;
932     }
933 
934     // out-of-range indexes are never boundary positions
935     if (offset < 0) {
936         first();       // For side effects on current position, tag values.
937         return FALSE;
938     }
939 
940     if (offset > utext_nativeLength(fText)) {
941         last();        // For side effects on current position, tag values.
942         return FALSE;
943     }
944 
945     // otherwise, we can use following() on the position before the specified
946     // one and return true if the position we get back is the one the user
947     // specified
948     utext_previous32From(fText, offset);
949     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
950     UBool    result  = following(backOne) == offset;
951     return result;
952 }
953 
954 /**
955  * Returns the current iteration position.
956  * @return The current iteration position.
957  */
current(void) const958 int32_t RuleBasedBreakIterator::current(void) const {
959     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
960     return pos;
961 }
962 
963 //=======================================================================
964 // implementation
965 //=======================================================================
966 
967 //
968 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
969 //                 of user text.  A variable with this enum type keeps track of where we
970 //                 are.  The state machine only fetches user input while in the RUN mode.
971 //
972 enum RBBIRunMode {
973     RBBI_START,     // state machine processing is before first char of input
974     RBBI_RUN,       // state machine processing is in the user text
975     RBBI_END        // state machine processing is after end of user text.
976 };
977 
978 
979 //-----------------------------------------------------------------------------------
980 //
981 //  handleNext(stateTable)
982 //     This method is the actual implementation of the rbbi next() method.
983 //     This method initializes the state machine to state 1
984 //     and advances through the text character by character until we reach the end
985 //     of the text or the state machine transitions to state 0.  We update our return
986 //     value every time the state machine passes through an accepting state.
987 //
988 //-----------------------------------------------------------------------------------
handleNext(const RBBIStateTable * statetable)989 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
990     int32_t             state;
991     uint16_t            category        = 0;
992     RBBIRunMode         mode;
993 
994     RBBIStateTableRow  *row;
995     UChar32             c;
996     int32_t             lookaheadStatus = 0;
997     int32_t             lookaheadTagIdx = 0;
998     int32_t             result          = 0;
999     int32_t             initialPosition = 0;
1000     int32_t             lookaheadResult = 0;
1001     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1002     const char         *tableData       = statetable->fTableData;
1003     uint32_t            tableRowLen     = statetable->fRowLen;
1004 
1005     #ifdef RBBI_DEBUG
1006         if (fTrace) {
1007             RBBIDebugPuts("Handle Next   pos   char  state category");
1008         }
1009     #endif
1010 
1011     // No matter what, handleNext alway correctly sets the break tag value.
1012     fLastStatusIndexValid = TRUE;
1013     fLastRuleStatusIndex = 0;
1014 
1015     // if we're already at the end of the text, return DONE.
1016     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1017     result          = initialPosition;
1018     c               = UTEXT_NEXT32(fText);
1019     if (fData == NULL || c==U_SENTINEL) {
1020         return BreakIterator::DONE;
1021     }
1022 
1023     //  Set the initial state for the state machine
1024     state = START_STATE;
1025     row = (RBBIStateTableRow *)
1026             //(statetable->fTableData + (statetable->fRowLen * state));
1027             (tableData + tableRowLen * state);
1028 
1029 
1030     mode     = RBBI_RUN;
1031     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1032         category = 2;
1033         mode     = RBBI_START;
1034     }
1035 
1036 
1037     // loop until we reach the end of the text or transition to state 0
1038     //
1039     for (;;) {
1040         if (c == U_SENTINEL) {
1041             // Reached end of input string.
1042             if (mode == RBBI_END) {
1043                 // We have already run the loop one last time with the
1044                 //   character set to the psueudo {eof} value.  Now it is time
1045                 //   to unconditionally bail out.
1046                 if (lookaheadResult > result) {
1047                     // We ran off the end of the string with a pending look-ahead match.
1048                     // Treat this as if the look-ahead condition had been met, and return
1049                     //  the match at the / position from the look-ahead rule.
1050                     result               = lookaheadResult;
1051                     fLastRuleStatusIndex = lookaheadTagIdx;
1052                     lookaheadStatus = 0;
1053                 }
1054                 break;
1055             }
1056             // Run the loop one last time with the fake end-of-input character category.
1057             mode = RBBI_END;
1058             category = 1;
1059         }
1060 
1061         //
1062         // Get the char category.  An incoming category of 1 or 2 means that
1063         //      we are preset for doing the beginning or end of input, and
1064         //      that we shouldn't get a category from an actual text input character.
1065         //
1066         if (mode == RBBI_RUN) {
1067             // look up the current character's character category, which tells us
1068             // which column in the state table to look at.
1069             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1070             //        not the size of the character going in, which is a UChar32.
1071             //
1072             UTRIE_GET16(&fData->fTrie, c, category);
1073 
1074             // Check the dictionary bit in the character's category.
1075             //    Counter is only used by dictionary based iterators (subclasses).
1076             //    Chars that need to be handled by a dictionary have a flag bit set
1077             //    in their category values.
1078             //
1079             if ((category & 0x4000) != 0)  {
1080                 fDictionaryCharCount++;
1081                 //  And off the dictionary flag bit.
1082                 category &= ~0x4000;
1083             }
1084         }
1085 
1086        #ifdef RBBI_DEBUG
1087             if (fTrace) {
1088                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
1089                 if (0x20<=c && c<0x7f) {
1090                     RBBIDebugPrintf("\"%c\"  ", c);
1091                 } else {
1092                     RBBIDebugPrintf("%5x  ", c);
1093                 }
1094                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1095             }
1096         #endif
1097 
1098         // State Transition - move machine to its next state
1099         //
1100 
1101         // Note: fNextState is defined as uint16_t[2], but we are casting
1102         // a generated RBBI table to RBBIStateTableRow and some tables
1103         // actually have more than 2 categories.
1104         U_ASSERT(category<fData->fHeader->fCatCount);
1105         state = row->fNextState[category];  /*Not accessing beyond memory*/
1106         row = (RBBIStateTableRow *)
1107             // (statetable->fTableData + (statetable->fRowLen * state));
1108             (tableData + tableRowLen * state);
1109 
1110 
1111         if (row->fAccepting == -1) {
1112             // Match found, common case.
1113             if (mode != RBBI_START) {
1114                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1115             }
1116             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1117         }
1118 
1119         if (row->fLookAhead != 0) {
1120             if (lookaheadStatus != 0
1121                 && row->fAccepting == lookaheadStatus) {
1122                 // Lookahead match is completed.
1123                 result               = lookaheadResult;
1124                 fLastRuleStatusIndex = lookaheadTagIdx;
1125                 lookaheadStatus      = 0;
1126                 // TODO:  make a standalone hard break in a rule work.
1127                 if (lookAheadHardBreak) {
1128                     UTEXT_SETNATIVEINDEX(fText, result);
1129                     return result;
1130                 }
1131                 // Look-ahead completed, but other rules may match further.  Continue on
1132                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1133                 goto continueOn;
1134             }
1135 
1136             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1137             lookaheadResult = r;
1138             lookaheadStatus = row->fLookAhead;
1139             lookaheadTagIdx = row->fTagIdx;
1140             goto continueOn;
1141         }
1142 
1143 
1144         if (row->fAccepting != 0) {
1145             // Because this is an accepting state, any in-progress look-ahead match
1146             //   is no longer relavant.  Clear out the pending lookahead status.
1147             lookaheadStatus = 0;           // clear out any pending look-ahead match.
1148         }
1149 
1150 continueOn:
1151         if (state == STOP_STATE) {
1152             // This is the normal exit from the lookup state machine.
1153             // We have advanced through the string until it is certain that no
1154             //   longer match is possible, no matter what characters follow.
1155             break;
1156         }
1157 
1158         // Advance to the next character.
1159         // If this is a beginning-of-input loop iteration, don't advance
1160         //    the input position.  The next iteration will be processing the
1161         //    first real input character.
1162         if (mode == RBBI_RUN) {
1163             c = UTEXT_NEXT32(fText);
1164         } else {
1165             if (mode == RBBI_START) {
1166                 mode = RBBI_RUN;
1167             }
1168         }
1169 
1170 
1171     }
1172 
1173     // The state machine is done.  Check whether it found a match...
1174 
1175     // If the iterator failed to advance in the match engine, force it ahead by one.
1176     //   (This really indicates a defect in the break rules.  They should always match
1177     //    at least one character.)
1178     if (result == initialPosition) {
1179         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1180         UTEXT_NEXT32(fText);
1181         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1182     }
1183 
1184     // Leave the iterator at our result position.
1185     UTEXT_SETNATIVEINDEX(fText, result);
1186     #ifdef RBBI_DEBUG
1187         if (fTrace) {
1188             RBBIDebugPrintf("result = %d\n\n", result);
1189         }
1190     #endif
1191     return result;
1192 }
1193 
1194 
1195 
1196 //-----------------------------------------------------------------------------------
1197 //
1198 //  handlePrevious()
1199 //
1200 //      Iterate backwards, according to the logic of the reverse rules.
1201 //      This version handles the exact style backwards rules.
1202 //
1203 //      The logic of this function is very similar to handleNext(), above.
1204 //
1205 //-----------------------------------------------------------------------------------
handlePrevious(const RBBIStateTable * statetable)1206 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1207     int32_t             state;
1208     uint16_t            category        = 0;
1209     RBBIRunMode         mode;
1210     RBBIStateTableRow  *row;
1211     UChar32             c;
1212     int32_t             lookaheadStatus = 0;
1213     int32_t             result          = 0;
1214     int32_t             initialPosition = 0;
1215     int32_t             lookaheadResult = 0;
1216     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1217 
1218     #ifdef RBBI_DEBUG
1219         if (fTrace) {
1220             RBBIDebugPuts("Handle Previous   pos   char  state category");
1221         }
1222     #endif
1223 
1224     // handlePrevious() never gets the rule status.
1225     // Flag the status as invalid; if the user ever asks for status, we will need
1226     // to back up, then re-find the break position using handleNext(), which does
1227     // get the status value.
1228     fLastStatusIndexValid = FALSE;
1229     fLastRuleStatusIndex = 0;
1230 
1231     // if we're already at the start of the text, return DONE.
1232     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1233         return BreakIterator::DONE;
1234     }
1235 
1236     //  Set up the starting char.
1237     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1238     result          = initialPosition;
1239     c               = UTEXT_PREVIOUS32(fText);
1240 
1241     //  Set the initial state for the state machine
1242     state = START_STATE;
1243     row = (RBBIStateTableRow *)
1244             (statetable->fTableData + (statetable->fRowLen * state));
1245     category = 3;
1246     mode     = RBBI_RUN;
1247     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1248         category = 2;
1249         mode     = RBBI_START;
1250     }
1251 
1252 
1253     // loop until we reach the start of the text or transition to state 0
1254     //
1255     for (;;) {
1256         if (c == U_SENTINEL) {
1257             // Reached end of input string.
1258             if (mode == RBBI_END) {
1259                 // We have already run the loop one last time with the
1260                 //   character set to the psueudo {eof} value.  Now it is time
1261                 //   to unconditionally bail out.
1262                 if (lookaheadResult < result) {
1263                     // We ran off the end of the string with a pending look-ahead match.
1264                     // Treat this as if the look-ahead condition had been met, and return
1265                     //  the match at the / position from the look-ahead rule.
1266                     result               = lookaheadResult;
1267                     lookaheadStatus = 0;
1268                 } else if (result == initialPosition) {
1269                     // Ran off start, no match found.
1270                     // move one index one (towards the start, since we are doing a previous())
1271                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1272                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1273                 }
1274                 break;
1275             }
1276             // Run the loop one last time with the fake end-of-input character category.
1277             mode = RBBI_END;
1278             category = 1;
1279         }
1280 
1281         //
1282         // Get the char category.  An incoming category of 1 or 2 means that
1283         //      we are preset for doing the beginning or end of input, and
1284         //      that we shouldn't get a category from an actual text input character.
1285         //
1286         if (mode == RBBI_RUN) {
1287             // look up the current character's character category, which tells us
1288             // which column in the state table to look at.
1289             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1290             //        not the size of the character going in, which is a UChar32.
1291             //
1292             UTRIE_GET16(&fData->fTrie, c, category);
1293 
1294             // Check the dictionary bit in the character's category.
1295             //    Counter is only used by dictionary based iterators (subclasses).
1296             //    Chars that need to be handled by a dictionary have a flag bit set
1297             //    in their category values.
1298             //
1299             if ((category & 0x4000) != 0)  {
1300                 fDictionaryCharCount++;
1301                 //  And off the dictionary flag bit.
1302                 category &= ~0x4000;
1303             }
1304         }
1305 
1306         #ifdef RBBI_DEBUG
1307             if (fTrace) {
1308                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1309                 if (0x20<=c && c<0x7f) {
1310                     RBBIDebugPrintf("\"%c\"  ", c);
1311                 } else {
1312                     RBBIDebugPrintf("%5x  ", c);
1313                 }
1314                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1315             }
1316         #endif
1317 
1318         // State Transition - move machine to its next state
1319         //
1320 
1321         // Note: fNextState is defined as uint16_t[2], but we are casting
1322         // a generated RBBI table to RBBIStateTableRow and some tables
1323         // actually have more than 2 categories.
1324         U_ASSERT(category<fData->fHeader->fCatCount);
1325         state = row->fNextState[category];  /*Not accessing beyond memory*/
1326         row = (RBBIStateTableRow *)
1327             (statetable->fTableData + (statetable->fRowLen * state));
1328 
1329         if (row->fAccepting == -1) {
1330             // Match found, common case.
1331             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1332         }
1333 
1334         if (row->fLookAhead != 0) {
1335             if (lookaheadStatus != 0
1336                 && row->fAccepting == lookaheadStatus) {
1337                 // Lookahead match is completed.
1338                 result               = lookaheadResult;
1339                 lookaheadStatus      = 0;
1340                 // TODO:  make a standalone hard break in a rule work.
1341                 if (lookAheadHardBreak) {
1342                     UTEXT_SETNATIVEINDEX(fText, result);
1343                     return result;
1344                 }
1345                 // Look-ahead completed, but other rules may match further.  Continue on
1346                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1347                 goto continueOn;
1348             }
1349 
1350             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1351             lookaheadResult = r;
1352             lookaheadStatus = row->fLookAhead;
1353             goto continueOn;
1354         }
1355 
1356 
1357         if (row->fAccepting != 0) {
1358             // Because this is an accepting state, any in-progress look-ahead match
1359             //   is no longer relavant.  Clear out the pending lookahead status.
1360             lookaheadStatus = 0;
1361         }
1362 
1363 continueOn:
1364         if (state == STOP_STATE) {
1365             // This is the normal exit from the lookup state machine.
1366             // We have advanced through the string until it is certain that no
1367             //   longer match is possible, no matter what characters follow.
1368             break;
1369         }
1370 
1371         // Move (backwards) to the next character to process.
1372         // If this is a beginning-of-input loop iteration, don't advance
1373         //    the input position.  The next iteration will be processing the
1374         //    first real input character.
1375         if (mode == RBBI_RUN) {
1376             c = UTEXT_PREVIOUS32(fText);
1377         } else {
1378             if (mode == RBBI_START) {
1379                 mode = RBBI_RUN;
1380             }
1381         }
1382     }
1383 
1384     // The state machine is done.  Check whether it found a match...
1385 
1386     // If the iterator failed to advance in the match engine, force it ahead by one.
1387     //   (This really indicates a defect in the break rules.  They should always match
1388     //    at least one character.)
1389     if (result == initialPosition) {
1390         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1391         UTEXT_PREVIOUS32(fText);
1392         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1393     }
1394 
1395     // Leave the iterator at our result position.
1396     UTEXT_SETNATIVEINDEX(fText, result);
1397     #ifdef RBBI_DEBUG
1398         if (fTrace) {
1399             RBBIDebugPrintf("result = %d\n\n", result);
1400         }
1401     #endif
1402     return result;
1403 }
1404 
1405 
1406 void
reset()1407 RuleBasedBreakIterator::reset()
1408 {
1409     if (fCachedBreakPositions) {
1410         uprv_free(fCachedBreakPositions);
1411     }
1412     fCachedBreakPositions = NULL;
1413     fNumCachedBreakPositions = 0;
1414     fDictionaryCharCount = 0;
1415     fPositionInCache = 0;
1416 }
1417 
1418 
1419 
1420 //-------------------------------------------------------------------------------
1421 //
1422 //   getRuleStatus()   Return the break rule tag associated with the current
1423 //                     iterator position.  If the iterator arrived at its current
1424 //                     position by iterating forwards, the value will have been
1425 //                     cached by the handleNext() function.
1426 //
1427 //                     If no cached status value is available, the status is
1428 //                     found by doing a previous() followed by a next(), which
1429 //                     leaves the iterator where it started, and computes the
1430 //                     status while doing the next().
1431 //
1432 //-------------------------------------------------------------------------------
makeRuleStatusValid()1433 void RuleBasedBreakIterator::makeRuleStatusValid() {
1434     if (fLastStatusIndexValid == FALSE) {
1435         //  No cached status is available.
1436         if (fText == NULL || current() == 0) {
1437             //  At start of text, or there is no text.  Status is always zero.
1438             fLastRuleStatusIndex = 0;
1439             fLastStatusIndexValid = TRUE;
1440         } else {
1441             //  Not at start of text.  Find status the tedious way.
1442             int32_t pa = current();
1443             previous();
1444             if (fNumCachedBreakPositions > 0) {
1445                 reset();                // Blow off the dictionary cache
1446             }
1447             int32_t pb = next();
1448             if (pa != pb) {
1449                 // note: the if (pa != pb) test is here only to eliminate warnings for
1450                 //       unused local variables on gcc.  Logically, it isn't needed.
1451                 U_ASSERT(pa == pb);
1452             }
1453         }
1454     }
1455     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1456 }
1457 
1458 
getRuleStatus() const1459 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1460     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1461     nonConstThis->makeRuleStatusValid();
1462 
1463     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1464     //                                                 (the number of status values.)
1465     //   This function returns the last (largest) of the array of status values.
1466     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1467     int32_t  tagVal = fData->fRuleStatusTable[idx];
1468 
1469     return tagVal;
1470 }
1471 
1472 
1473 
1474 
getRuleStatusVec(int32_t * fillInVec,int32_t capacity,UErrorCode & status)1475 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1476              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1477 {
1478     if (U_FAILURE(status)) {
1479         return 0;
1480     }
1481 
1482     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1483     nonConstThis->makeRuleStatusValid();
1484     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1485     int32_t  numValsToCopy = numVals;
1486     if (numVals > capacity) {
1487         status = U_BUFFER_OVERFLOW_ERROR;
1488         numValsToCopy = capacity;
1489     }
1490     int i;
1491     for (i=0; i<numValsToCopy; i++) {
1492         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1493     }
1494     return numVals;
1495 }
1496 
1497 
1498 
1499 //-------------------------------------------------------------------------------
1500 //
1501 //   getBinaryRules        Access to the compiled form of the rules,
1502 //                         for use by build system tools that save the data
1503 //                         for standard iterator types.
1504 //
1505 //-------------------------------------------------------------------------------
getBinaryRules(uint32_t & length)1506 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1507     const uint8_t  *retPtr = NULL;
1508     length = 0;
1509 
1510     if (fData != NULL) {
1511         retPtr = (const uint8_t *)fData->fHeader;
1512         length = fData->fHeader->fLength;
1513     }
1514     return retPtr;
1515 }
1516 
1517 
1518 
1519 
1520 //-------------------------------------------------------------------------------
1521 //
1522 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
1523 //                    Saving one heap allocation isn't worth the trouble.
1524 //                    Cloning shouldn't be done in tight loops, and
1525 //                    making the clone copy involves other heap operations anyway.
1526 //                    And the application code for correctly dealing with buffer
1527 //                    size problems and the eventual object destruction is ugly.
1528 //
1529 //-------------------------------------------------------------------------------
createBufferClone(void * stackBuffer,int32_t & bufferSize,UErrorCode & status)1530 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1531                                    int32_t &bufferSize,
1532                                    UErrorCode &status)
1533 {
1534     if (U_FAILURE(status)){
1535         return NULL;
1536     }
1537 
1538     //
1539     //  If user buffer size is zero this is a preflight operation to
1540     //    obtain the needed buffer size, allowing for worst case misalignment.
1541     //
1542     if (bufferSize == 0) {
1543         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1544         return NULL;
1545     }
1546 
1547 
1548     //
1549     //  Check the alignment and size of the user supplied buffer.
1550     //  Allocate heap memory if the user supplied memory is insufficient.
1551     //
1552     char    *buf   = (char *)stackBuffer;
1553     uint32_t s      = bufferSize;
1554 
1555     if (stackBuffer == NULL) {
1556         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
1557     }
1558     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1559         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1560         s   -= offsetUp;
1561         buf += offsetUp;
1562     }
1563     if (s < sizeof(RuleBasedBreakIterator)) {
1564         // Not enough room in the caller-supplied buffer.
1565         // Do a plain-vanilla heap based clone and return that, along with
1566         //   a warning that the clone was allocated.
1567         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1568         if (clonedBI == 0) {
1569             status = U_MEMORY_ALLOCATION_ERROR;
1570         } else {
1571             status = U_SAFECLONE_ALLOCATED_WARNING;
1572         }
1573         return clonedBI;
1574     }
1575 
1576     //
1577     //  Clone the source BI into the caller-supplied buffer.
1578     //
1579     RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this);
1580     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
1581 
1582     return clone;
1583 }
1584 
1585 
1586 //-------------------------------------------------------------------------------
1587 //
1588 //  isDictionaryChar      Return true if the category lookup for this char
1589 //                        indicates that it is in the set of dictionary lookup
1590 //                        chars.
1591 //
1592 //                        This function is intended for use by dictionary based
1593 //                        break iterators.
1594 //
1595 //-------------------------------------------------------------------------------
1596 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1597     if (fData == NULL) {
1598         return FALSE;
1599     }
1600     uint16_t category;
1601     UTRIE_GET16(&fData->fTrie, c, category);
1602     return (category & 0x4000) != 0;
1603 }*/
1604 
1605 
1606 //-------------------------------------------------------------------------------
1607 //
1608 //  checkDictionary       This function handles all processing of characters in
1609 //                        the "dictionary" set. It will determine the appropriate
1610 //                        course of action, and possibly set up a cache in the
1611 //                        process.
1612 //
1613 //-------------------------------------------------------------------------------
checkDictionary(int32_t startPos,int32_t endPos,UBool reverse)1614 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1615                             int32_t endPos,
1616                             UBool reverse) {
1617     // Reset the old break cache first.
1618     reset();
1619 
1620     // note: code segment below assumes that dictionary chars are in the
1621     // startPos-endPos range
1622     // value returned should be next character in sequence
1623     if ((endPos - startPos) <= 1) {
1624         return (reverse ? startPos : endPos);
1625     }
1626 
1627     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
1628     //      because native indexes are different from UTF-16 indexes.
1629     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
1630     //      It wont give the right breaks, but it's better than a crash.
1631     //
1632     //      Check the type of the UText by checking its pFuncs field, which
1633     //      is UText's function dispatch table.  It will be the same for all
1634     //      UTF-8 UTexts and different for any other UText type.
1635     //
1636     //      We have no other type of UText available with non-UTF-16 native indexing.
1637     //      This whole check will go away once the dictionary code is fixed.
1638     static const void *utext_utf8Funcs;
1639     if (utext_utf8Funcs == NULL) {
1640         // Cache the UTF-8 UText function pointer value.
1641         UErrorCode status = U_ZERO_ERROR;
1642         UText tempUText = UTEXT_INITIALIZER;
1643         utext_openUTF8(&tempUText, NULL, 0, &status);
1644         utext_utf8Funcs = tempUText.pFuncs;
1645         utext_close(&tempUText);
1646     }
1647     if (fText->pFuncs == utext_utf8Funcs) {
1648         return (reverse ? startPos : endPos);
1649     }
1650 
1651     // Starting from the starting point, scan towards the proposed result,
1652     // looking for the first dictionary character (which may be the one
1653     // we're on, if we're starting in the middle of a range).
1654     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1655     if (reverse) {
1656         UTEXT_PREVIOUS32(fText);
1657     }
1658 
1659     int32_t rangeStart = startPos;
1660     int32_t rangeEnd = endPos;
1661 
1662     uint16_t    category;
1663     int32_t     current;
1664     UErrorCode  status = U_ZERO_ERROR;
1665     UStack      breaks(status);
1666     int32_t     foundBreakCount = 0;
1667     UChar32     c = utext_current32(fText);
1668 
1669     UTRIE_GET16(&fData->fTrie, c, category);
1670 
1671     // Is the character we're starting on a dictionary character? If so, we
1672     // need to back up to include the entire run; otherwise the results of
1673     // the break algorithm will differ depending on where we start. Since
1674     // the result is cached and there is typically a non-dictionary break
1675     // within a small number of words, there should be little performance impact.
1676     if (category & 0x4000) {
1677         if (reverse) {
1678             do {
1679                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1680                 c = utext_current32(fText);
1681                 UTRIE_GET16(&fData->fTrie, c, category);
1682             } while (c != U_SENTINEL && (category & 0x4000));
1683             // Back up to the last dictionary character
1684             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1685             if (c == U_SENTINEL) {
1686                 // c = fText->last32();
1687                 //   TODO:  why was this if needed?
1688                 c = UTEXT_PREVIOUS32(fText);
1689             }
1690             else {
1691                 c = UTEXT_PREVIOUS32(fText);
1692             }
1693         }
1694         else {
1695             do {
1696                 c = UTEXT_PREVIOUS32(fText);
1697                 UTRIE_GET16(&fData->fTrie, c, category);
1698             }
1699             while (c != U_SENTINEL && (category & 0x4000));
1700             // Back up to the last dictionary character
1701             if (c == U_SENTINEL) {
1702                 // c = fText->first32();
1703                 c = utext_current32(fText);
1704             }
1705             else {
1706                 utext_next32(fText);
1707                 c = utext_current32(fText);
1708             }
1709             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1710         }
1711         UTRIE_GET16(&fData->fTrie, c, category);
1712     }
1713 
1714     // Loop through the text, looking for ranges of dictionary characters.
1715     // For each span, find the appropriate break engine, and ask it to find
1716     // any breaks within the span.
1717     // Note: we always do this in the forward direction, so that the break
1718     // cache is built in the right order.
1719     if (reverse) {
1720         utext_setNativeIndex(fText, rangeStart);
1721         c = utext_current32(fText);
1722         UTRIE_GET16(&fData->fTrie, c, category);
1723     }
1724     while(U_SUCCESS(status)) {
1725         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1726             utext_next32(fText);           // TODO:  tweak for post-increment operation
1727             c = utext_current32(fText);
1728             UTRIE_GET16(&fData->fTrie, c, category);
1729         }
1730         if (current >= rangeEnd) {
1731             break;
1732         }
1733 
1734         // We now have a dictionary character. Get the appropriate language object
1735         // to deal with it.
1736         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1737 
1738         // Ask the language object if there are any breaks. It will leave the text
1739         // pointer on the other side of its range, ready to search for the next one.
1740         if (lbe != NULL) {
1741             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1742         }
1743 
1744         // Reload the loop variables for the next go-round
1745         c = utext_current32(fText);
1746         UTRIE_GET16(&fData->fTrie, c, category);
1747     }
1748 
1749     // If we found breaks, build a new break cache. The first and last entries must
1750     // be the original starting and ending position.
1751     if (foundBreakCount > 0) {
1752         int32_t totalBreaks = foundBreakCount;
1753         if (startPos < breaks.elementAti(0)) {
1754             totalBreaks += 1;
1755         }
1756         if (endPos > breaks.peeki()) {
1757             totalBreaks += 1;
1758         }
1759         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1760         if (fCachedBreakPositions != NULL) {
1761             int32_t out = 0;
1762             fNumCachedBreakPositions = totalBreaks;
1763             if (startPos < breaks.elementAti(0)) {
1764                 fCachedBreakPositions[out++] = startPos;
1765             }
1766             for (int32_t i = 0; i < foundBreakCount; ++i) {
1767                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1768             }
1769             if (endPos > fCachedBreakPositions[out-1]) {
1770                 fCachedBreakPositions[out] = endPos;
1771             }
1772             // If there are breaks, then by definition, we are replacing the original
1773             // proposed break by one of the breaks we found. Use following() and
1774             // preceding() to do the work. They should never recurse in this case.
1775             if (reverse) {
1776                 return preceding(endPos);
1777             }
1778             else {
1779                 return following(startPos);
1780             }
1781         }
1782         // If the allocation failed, just fall through to the "no breaks found" case.
1783     }
1784 
1785     // If we get here, there were no language-based breaks. Set the text pointer
1786     // to the original proposed break.
1787     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1788     return (reverse ? startPos : endPos);
1789 }
1790 
1791 U_NAMESPACE_END
1792 
1793 // defined in ucln_cmn.h
1794 
1795 static icu::UStack *gLanguageBreakFactories = NULL;
1796 
1797 /**
1798  * Release all static memory held by breakiterator.
1799  */
1800 U_CDECL_BEGIN
breakiterator_cleanup_dict(void)1801 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1802     if (gLanguageBreakFactories) {
1803         delete gLanguageBreakFactories;
1804         gLanguageBreakFactories = NULL;
1805     }
1806     return TRUE;
1807 }
1808 U_CDECL_END
1809 
1810 U_CDECL_BEGIN
_deleteFactory(void * obj)1811 static void U_CALLCONV _deleteFactory(void *obj) {
1812     delete (icu::LanguageBreakFactory *) obj;
1813 }
1814 U_CDECL_END
1815 U_NAMESPACE_BEGIN
1816 
1817 static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c,int32_t breakType)1818 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1819 {
1820     UBool       needsInit;
1821     UErrorCode  status = U_ZERO_ERROR;
1822     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1823 
1824     if (needsInit) {
1825         UStack  *factories = new UStack(_deleteFactory, NULL, status);
1826         if (factories != NULL && U_SUCCESS(status)) {
1827             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1828             factories->push(builtIn, status);
1829 #ifdef U_LOCAL_SERVICE_HOOK
1830             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1831             if (extra != NULL) {
1832                 factories->push(extra, status);
1833             }
1834 #endif
1835         }
1836         umtx_lock(NULL);
1837         if (gLanguageBreakFactories == NULL) {
1838             gLanguageBreakFactories = factories;
1839             factories = NULL;
1840             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1841         }
1842         umtx_unlock(NULL);
1843         delete factories;
1844     }
1845 
1846     if (gLanguageBreakFactories == NULL) {
1847         return NULL;
1848     }
1849 
1850     int32_t i = gLanguageBreakFactories->size();
1851     const LanguageBreakEngine *lbe = NULL;
1852     while (--i >= 0) {
1853         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1854         lbe = factory->getEngineFor(c, breakType);
1855         if (lbe != NULL) {
1856             break;
1857         }
1858     }
1859     return lbe;
1860 }
1861 
1862 
1863 //-------------------------------------------------------------------------------
1864 //
1865 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1866 //                          the character c.
1867 //
1868 //-------------------------------------------------------------------------------
1869 const LanguageBreakEngine *
getLanguageBreakEngine(UChar32 c)1870 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1871     const LanguageBreakEngine *lbe = NULL;
1872     UErrorCode status = U_ZERO_ERROR;
1873 
1874     if (fLanguageBreakEngines == NULL) {
1875         fLanguageBreakEngines = new UStack(status);
1876         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1877             delete fLanguageBreakEngines;
1878             fLanguageBreakEngines = 0;
1879             return NULL;
1880         }
1881     }
1882 
1883     int32_t i = fLanguageBreakEngines->size();
1884     while (--i >= 0) {
1885         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1886         if (lbe->handles(c, fBreakType)) {
1887             return lbe;
1888         }
1889     }
1890 
1891     // No existing dictionary took the character. See if a factory wants to
1892     // give us a new LanguageBreakEngine for this character.
1893     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1894 
1895     // If we got one, use it and push it on our stack.
1896     if (lbe != NULL) {
1897         fLanguageBreakEngines->push((void *)lbe, status);
1898         // Even if we can't remember it, we can keep looking it up, so
1899         // return it even if the push fails.
1900         return lbe;
1901     }
1902 
1903     // No engine is forthcoming for this character. Add it to the
1904     // reject set. Create the reject break engine if needed.
1905     if (fUnhandledBreakEngine == NULL) {
1906         fUnhandledBreakEngine = new UnhandledEngine(status);
1907         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1908             status = U_MEMORY_ALLOCATION_ERROR;
1909         }
1910         // Put it last so that scripts for which we have an engine get tried
1911         // first.
1912         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1913         // If we can't insert it, or creation failed, get rid of it
1914         if (U_FAILURE(status)) {
1915             delete fUnhandledBreakEngine;
1916             fUnhandledBreakEngine = 0;
1917             return NULL;
1918         }
1919     }
1920 
1921     // Tell the reject engine about the character; at its discretion, it may
1922     // add more than just the one character.
1923     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1924 
1925     return fUnhandledBreakEngine;
1926 }
1927 
1928 
1929 
1930 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1931     return fBreakType;
1932 }*/
1933 
setBreakType(int32_t type)1934 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1935     fBreakType = type;
1936     reset();
1937 }
1938 
1939 U_NAMESPACE_END
1940 
1941 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1942