• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ***************************************************************************
3 *   Copyright (C) 1999-2011 International Business Machines Corporation
4 *   and others. All rights reserved.
5 ***************************************************************************
6 */
7 //
8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
9 //                   runtime engine and the API implementation for
10 //                   class RuleBasedBreakIterator
11 //
12 
13 #include <typeinfo>  // for 'typeid' to work
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_BREAK_ITERATION
18 
19 #include "unicode/rbbi.h"
20 #include "unicode/schriter.h"
21 #include "unicode/uchriter.h"
22 #include "unicode/udata.h"
23 #include "unicode/uclean.h"
24 #include "rbbidata.h"
25 #include "rbbirb.h"
26 #include "cmemory.h"
27 #include "cstring.h"
28 #include "umutex.h"
29 #include "ucln_cmn.h"
30 #include "brkeng.h"
31 
32 #include "uassert.h"
33 #include "uvector.h"
34 
35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
36 #if U_LOCAL_SERVICE_HOOK
37 #include "localsvc.h"
38 #endif
39 
40 #ifdef RBBI_DEBUG
41 static UBool fTrace = FALSE;
42 #endif
43 
44 U_NAMESPACE_BEGIN
45 
46 // The state number of the starting state
47 #define START_STATE 1
48 
49 // The state-transition value indicating "stop"
50 #define STOP_STATE  0
51 
52 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
54 
55 
56 //=======================================================================
57 // constructors
58 //=======================================================================
59 
60 /**
61  * Constructs a RuleBasedBreakIterator that uses the already-created
62  * tables object that is passed in as a parameter.
63  */
64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
65 {
66     init();
67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
68     if (U_FAILURE(status)) {return;}
69     if(fData == 0) {
70         status = U_MEMORY_ALLOCATION_ERROR;
71         return;
72     }
73 }
74 
75 /**
76  * Same as above but does not adopt memory
77  */
RuleBasedBreakIterator(const RBBIDataHeader * data,enum EDontAdopt,UErrorCode & status)78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
79 {
80     init();
81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
82     if (U_FAILURE(status)) {return;}
83     if(fData == 0) {
84         status = U_MEMORY_ALLOCATION_ERROR;
85         return;
86     }
87 }
88 
89 
90 //
91 //  Construct from precompiled binary rules (tables).  This constructor is public API,
92 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
93 //
RuleBasedBreakIterator(const uint8_t * compiledRules,uint32_t ruleLength,UErrorCode & status)94 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
95                        uint32_t       ruleLength,
96                        UErrorCode     &status) {
97     init();
98     if (U_FAILURE(status)) {
99         return;
100     }
101     if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
102         status = U_ILLEGAL_ARGUMENT_ERROR;
103         return;
104     }
105     const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
106     if (data->fLength > ruleLength) {
107         status = U_ILLEGAL_ARGUMENT_ERROR;
108         return;
109     }
110     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
111     if (U_FAILURE(status)) {return;}
112     if(fData == 0) {
113         status = U_MEMORY_ALLOCATION_ERROR;
114         return;
115     }
116 }
117 
118 
119 //-------------------------------------------------------------------------------
120 //
121 //   Constructor   from a UDataMemory handle to precompiled break rules
122 //                 stored in an ICU data file.
123 //
124 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(UDataMemory * udm,UErrorCode & status)125 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
126 {
127     init();
128     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
129     if (U_FAILURE(status)) {return;}
130     if(fData == 0) {
131         status = U_MEMORY_ALLOCATION_ERROR;
132         return;
133     }
134 }
135 
136 
137 
138 //-------------------------------------------------------------------------------
139 //
140 //   Constructor       from a set of rules supplied as a string.
141 //
142 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const UnicodeString & rules,UParseError & parseError,UErrorCode & status)143 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
144                                                 UParseError          &parseError,
145                                                 UErrorCode           &status)
146 {
147     init();
148     if (U_FAILURE(status)) {return;}
149     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
150         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
151     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
152     //        creates and returns a complete RBBI.  From here, in a constructor, we
153     //        can't just return the object created by the builder factory, hence
154     //        the assignment of the factory created object to "this".
155     if (U_SUCCESS(status)) {
156         *this = *bi;
157         delete bi;
158     }
159 }
160 
161 
162 //-------------------------------------------------------------------------------
163 //
164 // Default Constructor.      Create an empty shell that can be set up later.
165 //                           Used when creating a RuleBasedBreakIterator from a set
166 //                           of rules.
167 //-------------------------------------------------------------------------------
RuleBasedBreakIterator()168 RuleBasedBreakIterator::RuleBasedBreakIterator() {
169     init();
170 }
171 
172 
173 //-------------------------------------------------------------------------------
174 //
175 //   Copy constructor.  Will produce a break iterator with the same behavior,
176 //                      and which iterates over the same text, as the one passed in.
177 //
178 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const RuleBasedBreakIterator & other)179 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
180 : BreakIterator(other)
181 {
182     this->init();
183     *this = other;
184 }
185 
186 
187 /**
188  * Destructor
189  */
~RuleBasedBreakIterator()190 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
191     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
192         // fCharIter was adopted from the outside.
193         delete fCharIter;
194     }
195     fCharIter = NULL;
196     delete fSCharIter;
197     fCharIter = NULL;
198     delete fDCharIter;
199     fDCharIter = NULL;
200 
201     utext_close(fText);
202 
203     if (fData != NULL) {
204         fData->removeReference();
205         fData = NULL;
206     }
207     if (fCachedBreakPositions) {
208         uprv_free(fCachedBreakPositions);
209         fCachedBreakPositions = NULL;
210     }
211     if (fLanguageBreakEngines) {
212         delete fLanguageBreakEngines;
213         fLanguageBreakEngines = NULL;
214     }
215     if (fUnhandledBreakEngine) {
216         delete fUnhandledBreakEngine;
217         fUnhandledBreakEngine = NULL;
218     }
219 }
220 
221 /**
222  * Assignment operator.  Sets this iterator to have the same behavior,
223  * and iterate over the same text, as the one passed in.
224  */
225 RuleBasedBreakIterator&
operator =(const RuleBasedBreakIterator & that)226 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
227     if (this == &that) {
228         return *this;
229     }
230     reset();    // Delete break cache information
231     fBreakType = that.fBreakType;
232     if (fLanguageBreakEngines != NULL) {
233         delete fLanguageBreakEngines;
234         fLanguageBreakEngines = NULL;   // Just rebuild for now
235     }
236     // TODO: clone fLanguageBreakEngines from "that"
237     UErrorCode status = U_ZERO_ERROR;
238     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
239 
240     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
241         delete fCharIter;
242     }
243     fCharIter = NULL;
244 
245     if (that.fCharIter != NULL ) {
246         // This is a little bit tricky - it will intially appear that
247         //  this->fCharIter is adopted, even if that->fCharIter was
248         //  not adopted.  That's ok.
249         fCharIter = that.fCharIter->clone();
250     }
251 
252     if (fData != NULL) {
253         fData->removeReference();
254         fData = NULL;
255     }
256     if (that.fData != NULL) {
257         fData = that.fData->addReference();
258     }
259 
260     return *this;
261 }
262 
263 
264 
265 //-----------------------------------------------------------------------------
266 //
267 //    init()      Shared initialization routine.   Used by all the constructors.
268 //                Initializes all fields, leaving the object in a consistent state.
269 //
270 //-----------------------------------------------------------------------------
init()271 void RuleBasedBreakIterator::init() {
272     UErrorCode  status    = U_ZERO_ERROR;
273     fBufferClone          = FALSE;
274     fText                 = utext_openUChars(NULL, NULL, 0, &status);
275     fCharIter             = NULL;
276     fSCharIter            = NULL;
277     fDCharIter            = NULL;
278     fData                 = NULL;
279     fLastRuleStatusIndex  = 0;
280     fLastStatusIndexValid = TRUE;
281     fDictionaryCharCount  = 0;
282     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
283                                         //   dictionary behavior for Break Iterators that are
284                                         //   built from rules.  Even better would be the ability to
285                                         //   declare the type in the rules.
286 
287     fCachedBreakPositions    = NULL;
288     fLanguageBreakEngines    = NULL;
289     fUnhandledBreakEngine    = NULL;
290     fNumCachedBreakPositions = 0;
291     fPositionInCache         = 0;
292 
293 #ifdef RBBI_DEBUG
294     static UBool debugInitDone = FALSE;
295     if (debugInitDone == FALSE) {
296         char *debugEnv = getenv("U_RBBIDEBUG");
297         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
298             fTrace = TRUE;
299         }
300         debugInitDone = TRUE;
301     }
302 #endif
303 }
304 
305 
306 
307 //-----------------------------------------------------------------------------
308 //
309 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
310 //            behavior, and iterating over the same text, as this one.
311 //            Virtual function: does the right thing with subclasses.
312 //
313 //-----------------------------------------------------------------------------
314 BreakIterator*
clone(void) const315 RuleBasedBreakIterator::clone(void) const {
316     return new RuleBasedBreakIterator(*this);
317 }
318 
319 /**
320  * Equality operator.  Returns TRUE if both BreakIterators are of the
321  * same class, have the same behavior, and iterate over the same text.
322  */
323 UBool
operator ==(const BreakIterator & that) const324 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
325     if (typeid(*this) != typeid(that)) {
326         return FALSE;
327     }
328 
329     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
330 
331     if (!utext_equals(fText, that2.fText)) {
332         // The two break iterators are operating on different text,
333         //   or have a different interation position.
334         return FALSE;
335     };
336 
337     // TODO:  need a check for when in a dictionary region at different offsets.
338 
339     if (that2.fData == fData ||
340         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
341             // The two break iterators are using the same rules.
342             return TRUE;
343         }
344     return FALSE;
345 }
346 
347 /**
348  * Compute a hash code for this BreakIterator
349  * @return A hash code
350  */
351 int32_t
hashCode(void) const352 RuleBasedBreakIterator::hashCode(void) const {
353     int32_t   hash = 0;
354     if (fData != NULL) {
355         hash = fData->hashCode();
356     }
357     return hash;
358 }
359 
360 
setText(UText * ut,UErrorCode & status)361 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
362     if (U_FAILURE(status)) {
363         return;
364     }
365     reset();
366     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
367 
368     // Set up a dummy CharacterIterator to be returned if anyone
369     //   calls getText().  With input from UText, there is no reasonable
370     //   way to return a characterIterator over the actual input text.
371     //   Return one over an empty string instead - this is the closest
372     //   we can come to signaling a failure.
373     //   (GetText() is obsolete, this failure is sort of OK)
374     if (fDCharIter == NULL) {
375         static const UChar c = 0;
376         fDCharIter = new UCharCharacterIterator(&c, 0);
377         if (fDCharIter == NULL) {
378             status = U_MEMORY_ALLOCATION_ERROR;
379             return;
380         }
381     }
382 
383     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
384         // existing fCharIter was adopted from the outside.  Delete it now.
385         delete fCharIter;
386     }
387     fCharIter = fDCharIter;
388 
389     this->first();
390 }
391 
392 
getUText(UText * fillIn,UErrorCode & status) const393 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
394     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
395     return result;
396 }
397 
398 
399 
400 /**
401  * Returns the description used to create this iterator
402  */
403 const UnicodeString&
getRules() const404 RuleBasedBreakIterator::getRules() const {
405     if (fData != NULL) {
406         return fData->getRuleSourceString();
407     } else {
408         static const UnicodeString *s;
409         if (s == NULL) {
410             // TODO:  something more elegant here.
411             //        perhaps API should return the string by value.
412             //        Note:  thread unsafe init & leak are semi-ok, better than
413             //               what was before.  Sould be cleaned up, though.
414             s = new UnicodeString;
415         }
416         return *s;
417     }
418 }
419 
420 //=======================================================================
421 // BreakIterator overrides
422 //=======================================================================
423 
424 /**
425  * Return a CharacterIterator over the text being analyzed.
426  */
427 CharacterIterator&
getText() const428 RuleBasedBreakIterator::getText() const {
429     return *fCharIter;
430 }
431 
432 /**
433  * Set the iterator to analyze a new piece of text.  This function resets
434  * the current iteration position to the beginning of the text.
435  * @param newText An iterator over the text to analyze.
436  */
437 void
adoptText(CharacterIterator * newText)438 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
439     // If we are holding a CharacterIterator adopted from a
440     //   previous call to this function, delete it now.
441     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
442         delete fCharIter;
443     }
444 
445     fCharIter = newText;
446     UErrorCode status = U_ZERO_ERROR;
447     reset();
448     if (newText==NULL || newText->startIndex() != 0) {
449         // startIndex !=0 wants to be an error, but there's no way to report it.
450         // Make the iterator text be an empty string.
451         fText = utext_openUChars(fText, NULL, 0, &status);
452     } else {
453         fText = utext_openCharacterIterator(fText, newText, &status);
454     }
455     this->first();
456 }
457 
458 /**
459  * Set the iterator to analyze a new piece of text.  This function resets
460  * the current iteration position to the beginning of the text.
461  * @param newText An iterator over the text to analyze.
462  */
463 void
setText(const UnicodeString & newText)464 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
465     UErrorCode status = U_ZERO_ERROR;
466     reset();
467     fText = utext_openConstUnicodeString(fText, &newText, &status);
468 
469     // Set up a character iterator on the string.
470     //   Needed in case someone calls getText().
471     //  Can not, unfortunately, do this lazily on the (probably never)
472     //  call to getText(), because getText is const.
473     if (fSCharIter == NULL) {
474         fSCharIter = new StringCharacterIterator(newText);
475     } else {
476         fSCharIter->setText(newText);
477     }
478 
479     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
480         // old fCharIter was adopted from the outside.  Delete it.
481         delete fCharIter;
482     }
483     fCharIter = fSCharIter;
484 
485     this->first();
486 }
487 
488 
489 
490 /**
491  * Sets the current iteration position to the beginning of the text.
492  * @return The offset of the beginning of the text.
493  */
first(void)494 int32_t RuleBasedBreakIterator::first(void) {
495     reset();
496     fLastRuleStatusIndex  = 0;
497     fLastStatusIndexValid = TRUE;
498     //if (fText == NULL)
499     //    return BreakIterator::DONE;
500 
501     utext_setNativeIndex(fText, 0);
502     return 0;
503 }
504 
505 /**
506  * Sets the current iteration position to the end of the text.
507  * @return The text's past-the-end offset.
508  */
last(void)509 int32_t RuleBasedBreakIterator::last(void) {
510     reset();
511     if (fText == NULL) {
512         fLastRuleStatusIndex  = 0;
513         fLastStatusIndexValid = TRUE;
514         return BreakIterator::DONE;
515     }
516 
517     fLastStatusIndexValid = FALSE;
518     int32_t pos = (int32_t)utext_nativeLength(fText);
519     utext_setNativeIndex(fText, pos);
520     return pos;
521 }
522 
523 /**
524  * Advances the iterator either forward or backward the specified number of steps.
525  * Negative values move backward, and positive values move forward.  This is
526  * equivalent to repeatedly calling next() or previous().
527  * @param n The number of steps to move.  The sign indicates the direction
528  * (negative is backwards, and positive is forwards).
529  * @return The character offset of the boundary position n boundaries away from
530  * the current one.
531  */
next(int32_t n)532 int32_t RuleBasedBreakIterator::next(int32_t n) {
533     int32_t result = current();
534     while (n > 0) {
535         result = next();
536         --n;
537     }
538     while (n < 0) {
539         result = previous();
540         ++n;
541     }
542     return result;
543 }
544 
545 /**
546  * Advances the iterator to the next boundary position.
547  * @return The position of the first boundary after this one.
548  */
next(void)549 int32_t RuleBasedBreakIterator::next(void) {
550     // if we have cached break positions and we're still in the range
551     // covered by them, just move one step forward in the cache
552     if (fCachedBreakPositions != NULL) {
553         if (fPositionInCache < fNumCachedBreakPositions - 1) {
554             ++fPositionInCache;
555             int32_t pos = fCachedBreakPositions[fPositionInCache];
556             utext_setNativeIndex(fText, pos);
557             return pos;
558         }
559         else {
560             reset();
561         }
562     }
563 
564     int32_t startPos = current();
565     int32_t result = handleNext(fData->fForwardTable);
566     if (fDictionaryCharCount > 0) {
567         result = checkDictionary(startPos, result, FALSE);
568     }
569     return result;
570 }
571 
572 /**
573  * Advances the iterator backwards, to the last boundary preceding this one.
574  * @return The position of the last boundary position preceding this one.
575  */
previous(void)576 int32_t RuleBasedBreakIterator::previous(void) {
577     int32_t result;
578     int32_t startPos;
579 
580     // if we have cached break positions and we're still in the range
581     // covered by them, just move one step backward in the cache
582     if (fCachedBreakPositions != NULL) {
583         if (fPositionInCache > 0) {
584             --fPositionInCache;
585             // If we're at the beginning of the cache, need to reevaluate the
586             // rule status
587             if (fPositionInCache <= 0) {
588                 fLastStatusIndexValid = FALSE;
589             }
590             int32_t pos = fCachedBreakPositions[fPositionInCache];
591             utext_setNativeIndex(fText, pos);
592             return pos;
593         }
594         else {
595             reset();
596         }
597     }
598 
599     // if we're already sitting at the beginning of the text, return DONE
600     if (fText == NULL || (startPos = current()) == 0) {
601         fLastRuleStatusIndex  = 0;
602         fLastStatusIndexValid = TRUE;
603         return BreakIterator::DONE;
604     }
605 
606     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
607         result = handlePrevious(fData->fReverseTable);
608         if (fDictionaryCharCount > 0) {
609             result = checkDictionary(result, startPos, TRUE);
610         }
611         return result;
612     }
613 
614     // old rule syntax
615     // set things up.  handlePrevious() will back us up to some valid
616     // break position before the current position (we back our internal
617     // iterator up one step to prevent handlePrevious() from returning
618     // the current position), but not necessarily the last one before
619 
620     // where we started
621 
622     int32_t start = current();
623 
624     (void)UTEXT_PREVIOUS32(fText);
625     int32_t lastResult    = handlePrevious(fData->fReverseTable);
626     if (lastResult == UBRK_DONE) {
627         lastResult = 0;
628         utext_setNativeIndex(fText, 0);
629     }
630     result = lastResult;
631     int32_t lastTag       = 0;
632     UBool   breakTagValid = FALSE;
633 
634     // iterate forward from the known break position until we pass our
635     // starting point.  The last break position before the starting
636     // point is our return value
637 
638     for (;;) {
639         result         = next();
640         if (result == BreakIterator::DONE || result >= start) {
641             break;
642         }
643         lastResult     = result;
644         lastTag        = fLastRuleStatusIndex;
645         breakTagValid  = TRUE;
646     }
647 
648     // fLastBreakTag wants to have the value for section of text preceding
649     // the result position that we are to return (in lastResult.)  If
650     // the backwards rules overshot and the above loop had to do two or more
651     // next()s to move up to the desired return position, we will have a valid
652     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
653     // we wont have a tag value for that position, which is only set by handleNext().
654 
655     // set the current iteration position to be the last break position
656     // before where we started, and then return that value
657     utext_setNativeIndex(fText, lastResult);
658     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
659     fLastStatusIndexValid = breakTagValid;
660 
661     // No need to check the dictionary; it will have been handled by
662     // next()
663 
664     return lastResult;
665 }
666 
667 /**
668  * Sets the iterator to refer to the first boundary position following
669  * the specified position.
670  * @offset The position from which to begin searching for a break position.
671  * @return The position of the first break after the current position.
672  */
following(int32_t offset)673 int32_t RuleBasedBreakIterator::following(int32_t offset) {
674     // if we have cached break positions and offset is in the range
675     // covered by them, use them
676     // TODO: could use binary search
677     // TODO: what if offset is outside range, but break is not?
678     if (fCachedBreakPositions != NULL) {
679         if (offset >= fCachedBreakPositions[0]
680                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
681             fPositionInCache = 0;
682             // We are guaranteed not to leave the array due to range test above
683             while (offset >= fCachedBreakPositions[fPositionInCache]) {
684                 ++fPositionInCache;
685             }
686             int32_t pos = fCachedBreakPositions[fPositionInCache];
687             utext_setNativeIndex(fText, pos);
688             return pos;
689         }
690         else {
691             reset();
692         }
693     }
694 
695     // if the offset passed in is already past the end of the text,
696     // just return DONE; if it's before the beginning, return the
697     // text's starting offset
698     fLastRuleStatusIndex  = 0;
699     fLastStatusIndexValid = TRUE;
700     if (fText == NULL || offset >= utext_nativeLength(fText)) {
701         last();
702         return next();
703     }
704     else if (offset < 0) {
705         return first();
706     }
707 
708     // otherwise, set our internal iteration position (temporarily)
709     // to the position passed in.  If this is the _beginning_ position,
710     // then we can just use next() to get our return value
711 
712     int32_t result = 0;
713 
714     if (fData->fSafeRevTable != NULL) {
715         // new rule syntax
716         utext_setNativeIndex(fText, offset);
717         // move forward one codepoint to prepare for moving back to a
718         // safe point.
719         // this handles offset being between a supplementary character
720         (void)UTEXT_NEXT32(fText);
721         // handlePrevious will move most of the time to < 1 boundary away
722         handlePrevious(fData->fSafeRevTable);
723         int32_t result = next();
724         while (result <= offset) {
725             result = next();
726         }
727         return result;
728     }
729     if (fData->fSafeFwdTable != NULL) {
730         // backup plan if forward safe table is not available
731         utext_setNativeIndex(fText, offset);
732         (void)UTEXT_PREVIOUS32(fText);
733         // handle next will give result >= offset
734         handleNext(fData->fSafeFwdTable);
735         // previous will give result 0 or 1 boundary away from offset,
736         // most of the time
737         // we have to
738         int32_t oldresult = previous();
739         while (oldresult > offset) {
740             int32_t result = previous();
741             if (result <= offset) {
742                 return oldresult;
743             }
744             oldresult = result;
745         }
746         int32_t result = next();
747         if (result <= offset) {
748             return next();
749         }
750         return result;
751     }
752     // otherwise, we have to sync up first.  Use handlePrevious() to back
753     // up to a known break position before the specified position (if
754     // we can determine that the specified position is a break position,
755     // we don't back up at all).  This may or may not be the last break
756     // position at or before our starting position.  Advance forward
757     // from here until we've passed the starting position.  The position
758     // we stop on will be the first break position after the specified one.
759     // old rule syntax
760 
761     utext_setNativeIndex(fText, offset);
762     if (offset==0 ||
763         (offset==1  && utext_getNativeIndex(fText)==0)) {
764         return next();
765     }
766     result = previous();
767 
768     while (result != BreakIterator::DONE && result <= offset) {
769         result = next();
770     }
771 
772     return result;
773 }
774 
775 /**
776  * Sets the iterator to refer to the last boundary position before the
777  * specified position.
778  * @offset The position to begin searching for a break from.
779  * @return The position of the last boundary before the starting position.
780  */
preceding(int32_t offset)781 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
782     // if we have cached break positions and offset is in the range
783     // covered by them, use them
784     if (fCachedBreakPositions != NULL) {
785         // TODO: binary search?
786         // TODO: What if offset is outside range, but break is not?
787         if (offset > fCachedBreakPositions[0]
788                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
789             fPositionInCache = 0;
790             while (fPositionInCache < fNumCachedBreakPositions
791                    && offset > fCachedBreakPositions[fPositionInCache])
792                 ++fPositionInCache;
793             --fPositionInCache;
794             // If we're at the beginning of the cache, need to reevaluate the
795             // rule status
796             if (fPositionInCache <= 0) {
797                 fLastStatusIndexValid = FALSE;
798             }
799             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
800             return fCachedBreakPositions[fPositionInCache];
801         }
802         else {
803             reset();
804         }
805     }
806 
807     // if the offset passed in is already past the end of the text,
808     // just return DONE; if it's before the beginning, return the
809     // text's starting offset
810     if (fText == NULL || offset > utext_nativeLength(fText)) {
811         // return BreakIterator::DONE;
812         return last();
813     }
814     else if (offset < 0) {
815         return first();
816     }
817 
818     // if we start by updating the current iteration position to the
819     // position specified by the caller, we can just use previous()
820     // to carry out this operation
821 
822     if (fData->fSafeFwdTable != NULL) {
823         // new rule syntax
824         utext_setNativeIndex(fText, offset);
825         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
826         if (newOffset != offset) {
827             // Will come here if specified offset was not a code point boundary AND
828             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
829             //   indices to the containing code point.
830             // For breakitereator::preceding only, these non-code-point indices need to be moved
831             //   up to refer to the following codepoint.
832             (void)UTEXT_NEXT32(fText);
833             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
834         }
835 
836         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
837         //        rather than adjusting the position unconditionally?
838         //        (Change would interact with safe rules.)
839         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
840         //        affects only preceding(), seems cleaner, but is slightly different.
841         (void)UTEXT_PREVIOUS32(fText);
842         handleNext(fData->fSafeFwdTable);
843         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
844         while (result >= offset) {
845             result = previous();
846         }
847         return result;
848     }
849     if (fData->fSafeRevTable != NULL) {
850         // backup plan if forward safe table is not available
851         //  TODO:  check whether this path can be discarded
852         //         It's probably OK to say that rules must supply both safe tables
853         //            if they use safe tables at all.  We have certainly never described
854         //            to anyone how to work with just one safe table.
855         utext_setNativeIndex(fText, offset);
856         (void)UTEXT_NEXT32(fText);
857 
858         // handle previous will give result <= offset
859         handlePrevious(fData->fSafeRevTable);
860 
861         // next will give result 0 or 1 boundary away from offset,
862         // most of the time
863         // we have to
864         int32_t oldresult = next();
865         while (oldresult < offset) {
866             int32_t result = next();
867             if (result >= offset) {
868                 return oldresult;
869             }
870             oldresult = result;
871         }
872         int32_t result = previous();
873         if (result >= offset) {
874             return previous();
875         }
876         return result;
877     }
878 
879     // old rule syntax
880     utext_setNativeIndex(fText, offset);
881     return previous();
882 }
883 
884 /**
885  * Returns true if the specfied position is a boundary position.  As a side
886  * effect, leaves the iterator pointing to the first boundary position at
887  * or after "offset".
888  * @param offset the offset to check.
889  * @return True if "offset" is a boundary position.
890  */
isBoundary(int32_t offset)891 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
892     // the beginning index of the iterator is always a boundary position by definition
893     if (offset == 0) {
894         first();       // For side effects on current position, tag values.
895         return TRUE;
896     }
897 
898     if (offset == (int32_t)utext_nativeLength(fText)) {
899         last();       // For side effects on current position, tag values.
900         return TRUE;
901     }
902 
903     // out-of-range indexes are never boundary positions
904     if (offset < 0) {
905         first();       // For side effects on current position, tag values.
906         return FALSE;
907     }
908 
909     if (offset > utext_nativeLength(fText)) {
910         last();        // For side effects on current position, tag values.
911         return FALSE;
912     }
913 
914     // otherwise, we can use following() on the position before the specified
915     // one and return true if the position we get back is the one the user
916     // specified
917     utext_previous32From(fText, offset);
918     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
919     UBool    result  = following(backOne) == offset;
920     return result;
921 }
922 
923 /**
924  * Returns the current iteration position.
925  * @return The current iteration position.
926  */
current(void) const927 int32_t RuleBasedBreakIterator::current(void) const {
928     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
929     return pos;
930 }
931 
932 //=======================================================================
933 // implementation
934 //=======================================================================
935 
936 //
937 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
938 //                 of user text.  A variable with this enum type keeps track of where we
939 //                 are.  The state machine only fetches user input while in the RUN mode.
940 //
941 enum RBBIRunMode {
942     RBBI_START,     // state machine processing is before first char of input
943     RBBI_RUN,       // state machine processing is in the user text
944     RBBI_END        // state machine processing is after end of user text.
945 };
946 
947 
948 //-----------------------------------------------------------------------------------
949 //
950 //  handleNext(stateTable)
951 //     This method is the actual implementation of the rbbi next() method.
952 //     This method initializes the state machine to state 1
953 //     and advances through the text character by character until we reach the end
954 //     of the text or the state machine transitions to state 0.  We update our return
955 //     value every time the state machine passes through an accepting state.
956 //
957 //-----------------------------------------------------------------------------------
handleNext(const RBBIStateTable * statetable)958 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
959     int32_t             state;
960     int16_t             category        = 0;
961     RBBIRunMode         mode;
962 
963     RBBIStateTableRow  *row;
964     UChar32             c;
965     int32_t             lookaheadStatus = 0;
966     int32_t             lookaheadTagIdx = 0;
967     int32_t             result          = 0;
968     int32_t             initialPosition = 0;
969     int32_t             lookaheadResult = 0;
970     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
971     const char         *tableData       = statetable->fTableData;
972     uint32_t            tableRowLen     = statetable->fRowLen;
973 
974     #ifdef RBBI_DEBUG
975         if (fTrace) {
976             RBBIDebugPuts("Handle Next   pos   char  state category");
977         }
978     #endif
979 
980     // No matter what, handleNext alway correctly sets the break tag value.
981     fLastStatusIndexValid = TRUE;
982     fLastRuleStatusIndex = 0;
983 
984     // if we're already at the end of the text, return DONE.
985     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
986     result          = initialPosition;
987     c               = UTEXT_NEXT32(fText);
988     if (fData == NULL || c==U_SENTINEL) {
989         return BreakIterator::DONE;
990     }
991 
992     //  Set the initial state for the state machine
993     state = START_STATE;
994     row = (RBBIStateTableRow *)
995             //(statetable->fTableData + (statetable->fRowLen * state));
996             (tableData + tableRowLen * state);
997 
998 
999     mode     = RBBI_RUN;
1000     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1001         category = 2;
1002         mode     = RBBI_START;
1003     }
1004 
1005 
1006     // loop until we reach the end of the text or transition to state 0
1007     //
1008     for (;;) {
1009         if (c == U_SENTINEL) {
1010             // Reached end of input string.
1011             if (mode == RBBI_END) {
1012                 // We have already run the loop one last time with the
1013                 //   character set to the psueudo {eof} value.  Now it is time
1014                 //   to unconditionally bail out.
1015                 if (lookaheadResult > result) {
1016                     // We ran off the end of the string with a pending look-ahead match.
1017                     // Treat this as if the look-ahead condition had been met, and return
1018                     //  the match at the / position from the look-ahead rule.
1019                     result               = lookaheadResult;
1020                     fLastRuleStatusIndex = lookaheadTagIdx;
1021                     lookaheadStatus = 0;
1022                 }
1023                 break;
1024             }
1025             // Run the loop one last time with the fake end-of-input character category.
1026             mode = RBBI_END;
1027             category = 1;
1028         }
1029 
1030         //
1031         // Get the char category.  An incoming category of 1 or 2 means that
1032         //      we are preset for doing the beginning or end of input, and
1033         //      that we shouldn't get a category from an actual text input character.
1034         //
1035         if (mode == RBBI_RUN) {
1036             // look up the current character's character category, which tells us
1037             // which column in the state table to look at.
1038             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1039             //        not the size of the character going in, which is a UChar32.
1040             //
1041             UTRIE_GET16(&fData->fTrie, c, category);
1042 
1043             // Check the dictionary bit in the character's category.
1044             //    Counter is only used by dictionary based iterators (subclasses).
1045             //    Chars that need to be handled by a dictionary have a flag bit set
1046             //    in their category values.
1047             //
1048             if ((category & 0x4000) != 0)  {
1049                 fDictionaryCharCount++;
1050                 //  And off the dictionary flag bit.
1051                 category &= ~0x4000;
1052             }
1053         }
1054 
1055         #ifdef RBBI_DEBUG
1056             if (fTrace) {
1057                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
1058                 if (0x20<=c && c<0x7f) {
1059                     RBBIDebugPrintf("\"%c\"  ", c);
1060                 } else {
1061                     RBBIDebugPrintf("%5x  ", c);
1062                 }
1063                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1064             }
1065         #endif
1066 
1067         // State Transition - move machine to its next state
1068         //
1069         state = row->fNextState[category];
1070         row = (RBBIStateTableRow *)
1071             // (statetable->fTableData + (statetable->fRowLen * state));
1072             (tableData + tableRowLen * state);
1073 
1074 
1075         if (row->fAccepting == -1) {
1076             // Match found, common case.
1077             if (mode != RBBI_START) {
1078                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1079             }
1080             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1081         }
1082 
1083         if (row->fLookAhead != 0) {
1084             if (lookaheadStatus != 0
1085                 && row->fAccepting == lookaheadStatus) {
1086                 // Lookahead match is completed.
1087                 result               = lookaheadResult;
1088                 fLastRuleStatusIndex = lookaheadTagIdx;
1089                 lookaheadStatus      = 0;
1090                 // TODO:  make a standalone hard break in a rule work.
1091                 if (lookAheadHardBreak) {
1092                     UTEXT_SETNATIVEINDEX(fText, result);
1093                     return result;
1094                 }
1095                 // Look-ahead completed, but other rules may match further.  Continue on
1096                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1097                 goto continueOn;
1098             }
1099 
1100             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1101             lookaheadResult = r;
1102             lookaheadStatus = row->fLookAhead;
1103             lookaheadTagIdx = row->fTagIdx;
1104             goto continueOn;
1105         }
1106 
1107 
1108         if (row->fAccepting != 0) {
1109             // Because this is an accepting state, any in-progress look-ahead match
1110             //   is no longer relavant.  Clear out the pending lookahead status.
1111             lookaheadStatus = 0;           // clear out any pending look-ahead match.
1112         }
1113 
1114 continueOn:
1115         if (state == STOP_STATE) {
1116             // This is the normal exit from the lookup state machine.
1117             // We have advanced through the string until it is certain that no
1118             //   longer match is possible, no matter what characters follow.
1119             break;
1120         }
1121 
1122         // Advance to the next character.
1123         // If this is a beginning-of-input loop iteration, don't advance
1124         //    the input position.  The next iteration will be processing the
1125         //    first real input character.
1126         if (mode == RBBI_RUN) {
1127             c = UTEXT_NEXT32(fText);
1128         } else {
1129             if (mode == RBBI_START) {
1130                 mode = RBBI_RUN;
1131             }
1132         }
1133 
1134 
1135     }
1136 
1137     // The state machine is done.  Check whether it found a match...
1138 
1139     // If the iterator failed to advance in the match engine, force it ahead by one.
1140     //   (This really indicates a defect in the break rules.  They should always match
1141     //    at least one character.)
1142     if (result == initialPosition) {
1143         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1144         UTEXT_NEXT32(fText);
1145         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1146     }
1147 
1148     // Leave the iterator at our result position.
1149     UTEXT_SETNATIVEINDEX(fText, result);
1150     #ifdef RBBI_DEBUG
1151         if (fTrace) {
1152             RBBIDebugPrintf("result = %d\n\n", result);
1153         }
1154     #endif
1155     return result;
1156 }
1157 
1158 
1159 
1160 //-----------------------------------------------------------------------------------
1161 //
1162 //  handlePrevious()
1163 //
1164 //      Iterate backwards, according to the logic of the reverse rules.
1165 //      This version handles the exact style backwards rules.
1166 //
1167 //      The logic of this function is very similar to handleNext(), above.
1168 //
1169 //-----------------------------------------------------------------------------------
handlePrevious(const RBBIStateTable * statetable)1170 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1171     int32_t             state;
1172     int16_t             category        = 0;
1173     RBBIRunMode         mode;
1174     RBBIStateTableRow  *row;
1175     UChar32             c;
1176     int32_t             lookaheadStatus = 0;
1177     int32_t             result          = 0;
1178     int32_t             initialPosition = 0;
1179     int32_t             lookaheadResult = 0;
1180     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1181 
1182     #ifdef RBBI_DEBUG
1183         if (fTrace) {
1184             RBBIDebugPuts("Handle Previous   pos   char  state category");
1185         }
1186     #endif
1187 
1188     // handlePrevious() never gets the rule status.
1189     // Flag the status as invalid; if the user ever asks for status, we will need
1190     // to back up, then re-find the break position using handleNext(), which does
1191     // get the status value.
1192     fLastStatusIndexValid = FALSE;
1193     fLastRuleStatusIndex = 0;
1194 
1195     // if we're already at the start of the text, return DONE.
1196     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1197         return BreakIterator::DONE;
1198     }
1199 
1200     //  Set up the starting char.
1201     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1202     result          = initialPosition;
1203     c               = UTEXT_PREVIOUS32(fText);
1204 
1205     //  Set the initial state for the state machine
1206     state = START_STATE;
1207     row = (RBBIStateTableRow *)
1208             (statetable->fTableData + (statetable->fRowLen * state));
1209     category = 3;
1210     mode     = RBBI_RUN;
1211     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1212         category = 2;
1213         mode     = RBBI_START;
1214     }
1215 
1216 
1217     // loop until we reach the start of the text or transition to state 0
1218     //
1219     for (;;) {
1220         if (c == U_SENTINEL) {
1221             // Reached end of input string.
1222             if (mode == RBBI_END) {
1223                 // We have already run the loop one last time with the
1224                 //   character set to the psueudo {eof} value.  Now it is time
1225                 //   to unconditionally bail out.
1226                 if (lookaheadResult < result) {
1227                     // We ran off the end of the string with a pending look-ahead match.
1228                     // Treat this as if the look-ahead condition had been met, and return
1229                     //  the match at the / position from the look-ahead rule.
1230                     result               = lookaheadResult;
1231                     lookaheadStatus = 0;
1232                 } else if (result == initialPosition) {
1233                     // Ran off start, no match found.
1234                     // move one index one (towards the start, since we are doing a previous())
1235                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1236                     (void)UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1237                 }
1238                 break;
1239             }
1240             // Run the loop one last time with the fake end-of-input character category.
1241             mode = RBBI_END;
1242             category = 1;
1243         }
1244 
1245         //
1246         // Get the char category.  An incoming category of 1 or 2 means that
1247         //      we are preset for doing the beginning or end of input, and
1248         //      that we shouldn't get a category from an actual text input character.
1249         //
1250         if (mode == RBBI_RUN) {
1251             // look up the current character's character category, which tells us
1252             // which column in the state table to look at.
1253             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1254             //        not the size of the character going in, which is a UChar32.
1255             //
1256             UTRIE_GET16(&fData->fTrie, c, category);
1257 
1258             // Check the dictionary bit in the character's category.
1259             //    Counter is only used by dictionary based iterators (subclasses).
1260             //    Chars that need to be handled by a dictionary have a flag bit set
1261             //    in their category values.
1262             //
1263             if ((category & 0x4000) != 0)  {
1264                 fDictionaryCharCount++;
1265                 //  And off the dictionary flag bit.
1266                 category &= ~0x4000;
1267             }
1268         }
1269 
1270         #ifdef RBBI_DEBUG
1271             if (fTrace) {
1272                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1273                 if (0x20<=c && c<0x7f) {
1274                     RBBIDebugPrintf("\"%c\"  ", c);
1275                 } else {
1276                     RBBIDebugPrintf("%5x  ", c);
1277                 }
1278                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1279             }
1280         #endif
1281 
1282         // State Transition - move machine to its next state
1283         //
1284         state = row->fNextState[category];
1285         row = (RBBIStateTableRow *)
1286             (statetable->fTableData + (statetable->fRowLen * state));
1287 
1288         if (row->fAccepting == -1) {
1289             // Match found, common case.
1290             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1291         }
1292 
1293         if (row->fLookAhead != 0) {
1294             if (lookaheadStatus != 0
1295                 && row->fAccepting == lookaheadStatus) {
1296                 // Lookahead match is completed.
1297                 result               = lookaheadResult;
1298                 lookaheadStatus      = 0;
1299                 // TODO:  make a standalone hard break in a rule work.
1300                 if (lookAheadHardBreak) {
1301                     UTEXT_SETNATIVEINDEX(fText, result);
1302                     return result;
1303                 }
1304                 // Look-ahead completed, but other rules may match further.  Continue on
1305                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1306                 goto continueOn;
1307             }
1308 
1309             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1310             lookaheadResult = r;
1311             lookaheadStatus = row->fLookAhead;
1312             goto continueOn;
1313         }
1314 
1315 
1316         if (row->fAccepting != 0) {
1317             // Because this is an accepting state, any in-progress look-ahead match
1318             //   is no longer relavant.  Clear out the pending lookahead status.
1319             lookaheadStatus = 0;
1320         }
1321 
1322 continueOn:
1323         if (state == STOP_STATE) {
1324             // This is the normal exit from the lookup state machine.
1325             // We have advanced through the string until it is certain that no
1326             //   longer match is possible, no matter what characters follow.
1327             break;
1328         }
1329 
1330         // Move (backwards) to the next character to process.
1331         // If this is a beginning-of-input loop iteration, don't advance
1332         //    the input position.  The next iteration will be processing the
1333         //    first real input character.
1334         if (mode == RBBI_RUN) {
1335             c = UTEXT_PREVIOUS32(fText);
1336         } else {
1337             if (mode == RBBI_START) {
1338                 mode = RBBI_RUN;
1339             }
1340         }
1341     }
1342 
1343     // The state machine is done.  Check whether it found a match...
1344 
1345     // If the iterator failed to advance in the match engine, force it ahead by one.
1346     //   (This really indicates a defect in the break rules.  They should always match
1347     //    at least one character.)
1348     if (result == initialPosition) {
1349         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1350         UTEXT_PREVIOUS32(fText);
1351         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1352     }
1353 
1354     // Leave the iterator at our result position.
1355     UTEXT_SETNATIVEINDEX(fText, result);
1356     #ifdef RBBI_DEBUG
1357         if (fTrace) {
1358             RBBIDebugPrintf("result = %d\n\n", result);
1359         }
1360     #endif
1361     return result;
1362 }
1363 
1364 
1365 void
reset()1366 RuleBasedBreakIterator::reset()
1367 {
1368     if (fCachedBreakPositions) {
1369         uprv_free(fCachedBreakPositions);
1370     }
1371     fCachedBreakPositions = NULL;
1372     fNumCachedBreakPositions = 0;
1373     fDictionaryCharCount = 0;
1374     fPositionInCache = 0;
1375 }
1376 
1377 
1378 
1379 //-------------------------------------------------------------------------------
1380 //
1381 //   getRuleStatus()   Return the break rule tag associated with the current
1382 //                     iterator position.  If the iterator arrived at its current
1383 //                     position by iterating forwards, the value will have been
1384 //                     cached by the handleNext() function.
1385 //
1386 //                     If no cached status value is available, the status is
1387 //                     found by doing a previous() followed by a next(), which
1388 //                     leaves the iterator where it started, and computes the
1389 //                     status while doing the next().
1390 //
1391 //-------------------------------------------------------------------------------
makeRuleStatusValid()1392 void RuleBasedBreakIterator::makeRuleStatusValid() {
1393     if (fLastStatusIndexValid == FALSE) {
1394         //  No cached status is available.
1395         if (fText == NULL || current() == 0) {
1396             //  At start of text, or there is no text.  Status is always zero.
1397             fLastRuleStatusIndex = 0;
1398             fLastStatusIndexValid = TRUE;
1399         } else {
1400             //  Not at start of text.  Find status the tedious way.
1401             int32_t pa = current();
1402             previous();
1403             if (fNumCachedBreakPositions > 0) {
1404                 reset();                // Blow off the dictionary cache
1405             }
1406             int32_t pb = next();
1407             if (pa != pb) {
1408                 // note: the if (pa != pb) test is here only to eliminate warnings for
1409                 //       unused local variables on gcc.  Logically, it isn't needed.
1410                 U_ASSERT(pa == pb);
1411             }
1412         }
1413     }
1414     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1415 }
1416 
1417 
getRuleStatus() const1418 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1419     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1420     nonConstThis->makeRuleStatusValid();
1421 
1422     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1423     //                                                 (the number of status values.)
1424     //   This function returns the last (largest) of the array of status values.
1425     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1426     int32_t  tagVal = fData->fRuleStatusTable[idx];
1427 
1428     return tagVal;
1429 }
1430 
1431 
1432 
1433 
getRuleStatusVec(int32_t * fillInVec,int32_t capacity,UErrorCode & status)1434 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1435              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1436 {
1437     if (U_FAILURE(status)) {
1438         return 0;
1439     }
1440 
1441     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1442     nonConstThis->makeRuleStatusValid();
1443     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1444     int32_t  numValsToCopy = numVals;
1445     if (numVals > capacity) {
1446         status = U_BUFFER_OVERFLOW_ERROR;
1447         numValsToCopy = capacity;
1448     }
1449     int i;
1450     for (i=0; i<numValsToCopy; i++) {
1451         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1452     }
1453     return numVals;
1454 }
1455 
1456 
1457 
1458 //-------------------------------------------------------------------------------
1459 //
1460 //   getBinaryRules        Access to the compiled form of the rules,
1461 //                         for use by build system tools that save the data
1462 //                         for standard iterator types.
1463 //
1464 //-------------------------------------------------------------------------------
getBinaryRules(uint32_t & length)1465 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1466     const uint8_t  *retPtr = NULL;
1467     length = 0;
1468 
1469     if (fData != NULL) {
1470         retPtr = (const uint8_t *)fData->fHeader;
1471         length = fData->fHeader->fLength;
1472     }
1473     return retPtr;
1474 }
1475 
1476 
1477 
1478 
1479 //-------------------------------------------------------------------------------
1480 //
1481 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
1482 //                    Saving one heap allocation isn't worth the trouble.
1483 //                    Cloning shouldn't be done in tight loops, and
1484 //                    making the clone copy involves other heap operations anyway.
1485 //                    And the application code for correctly dealing with buffer
1486 //                    size problems and the eventual object destruction is ugly.
1487 //
1488 //-------------------------------------------------------------------------------
createBufferClone(void * stackBuffer,int32_t & bufferSize,UErrorCode & status)1489 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1490                                    int32_t &bufferSize,
1491                                    UErrorCode &status)
1492 {
1493     if (U_FAILURE(status)){
1494         return NULL;
1495     }
1496 
1497     //
1498     //  If user buffer size is zero this is a preflight operation to
1499     //    obtain the needed buffer size, allowing for worst case misalignment.
1500     //
1501     if (bufferSize == 0) {
1502         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1503         return NULL;
1504     }
1505 
1506 
1507     //
1508     //  Check the alignment and size of the user supplied buffer.
1509     //  Allocate heap memory if the user supplied memory is insufficient.
1510     //
1511     char    *buf   = (char *)stackBuffer;
1512     uint32_t s      = bufferSize;
1513 
1514     if (stackBuffer == NULL) {
1515         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
1516     }
1517     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1518         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1519         s   -= offsetUp;
1520         buf += offsetUp;
1521     }
1522     if (s < sizeof(RuleBasedBreakIterator)) {
1523         // Not enough room in the caller-supplied buffer.
1524         // Do a plain-vanilla heap based clone and return that, along with
1525         //   a warning that the clone was allocated.
1526         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1527         if (clonedBI == 0) {
1528             status = U_MEMORY_ALLOCATION_ERROR;
1529         } else {
1530             status = U_SAFECLONE_ALLOCATED_WARNING;
1531         }
1532         return clonedBI;
1533     }
1534 
1535     //
1536     //  Clone the source BI into the caller-supplied buffer.
1537     //
1538     RuleBasedBreakIterator *clone = new(buf) RuleBasedBreakIterator(*this);
1539     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
1540 
1541     return clone;
1542 }
1543 
1544 
1545 //-------------------------------------------------------------------------------
1546 //
1547 //  isDictionaryChar      Return true if the category lookup for this char
1548 //                        indicates that it is in the set of dictionary lookup
1549 //                        chars.
1550 //
1551 //                        This function is intended for use by dictionary based
1552 //                        break iterators.
1553 //
1554 //-------------------------------------------------------------------------------
1555 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1556     if (fData == NULL) {
1557         return FALSE;
1558     }
1559     uint16_t category;
1560     UTRIE_GET16(&fData->fTrie, c, category);
1561     return (category & 0x4000) != 0;
1562 }*/
1563 
1564 
1565 //-------------------------------------------------------------------------------
1566 //
1567 //  checkDictionary       This function handles all processing of characters in
1568 //                        the "dictionary" set. It will determine the appropriate
1569 //                        course of action, and possibly set up a cache in the
1570 //                        process.
1571 //
1572 //-------------------------------------------------------------------------------
checkDictionary(int32_t startPos,int32_t endPos,UBool reverse)1573 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1574                             int32_t endPos,
1575                             UBool reverse) {
1576     // Reset the old break cache first.
1577     uint32_t dictionaryCount = fDictionaryCharCount;
1578     reset();
1579 
1580     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1581         return (reverse ? startPos : endPos);
1582     }
1583 
1584     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
1585     //      because native indexes are different from UTF-16 indexes.
1586     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
1587     //      It wont give the right breaks, but it's better than a crash.
1588     //
1589     //      Check the type of the UText by checking its pFuncs field, which
1590     //      is UText's function dispatch table.  It will be the same for all
1591     //      UTF-8 UTexts and different for any other UText type.
1592     //
1593     //      We have no other type of UText available with non-UTF-16 native indexing.
1594     //      This whole check will go away once the dictionary code is fixed.
1595     static const void *utext_utf8Funcs;
1596     if (utext_utf8Funcs == NULL) {
1597         // Cache the UTF-8 UText function pointer value.
1598         UErrorCode status = U_ZERO_ERROR;
1599         UText tempUText = UTEXT_INITIALIZER;
1600         utext_openUTF8(&tempUText, NULL, 0, &status);
1601         utext_utf8Funcs = tempUText.pFuncs;
1602         utext_close(&tempUText);
1603     }
1604     if (fText->pFuncs == utext_utf8Funcs) {
1605         return (reverse ? startPos : endPos);
1606     }
1607 
1608     // Starting from the starting point, scan towards the proposed result,
1609     // looking for the first dictionary character (which may be the one
1610     // we're on, if we're starting in the middle of a range).
1611     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1612     if (reverse) {
1613         UTEXT_PREVIOUS32(fText);
1614     }
1615 
1616     int32_t rangeStart = startPos;
1617     int32_t rangeEnd = endPos;
1618 
1619     uint16_t    category;
1620     int32_t     current;
1621     UErrorCode  status = U_ZERO_ERROR;
1622     UStack      breaks(status);
1623     int32_t     foundBreakCount = 0;
1624     UChar32     c = utext_current32(fText);
1625 
1626     UTRIE_GET16(&fData->fTrie, c, category);
1627 
1628     // Is the character we're starting on a dictionary character? If so, we
1629     // need to back up to include the entire run; otherwise the results of
1630     // the break algorithm will differ depending on where we start. Since
1631     // the result is cached and there is typically a non-dictionary break
1632     // within a small number of words, there should be little performance impact.
1633     if (category & 0x4000) {
1634         if (reverse) {
1635             do {
1636                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1637                 c = utext_current32(fText);
1638                 UTRIE_GET16(&fData->fTrie, c, category);
1639             } while (c != U_SENTINEL && (category & 0x4000));
1640             // Back up to the last dictionary character
1641             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1642             if (c == U_SENTINEL) {
1643                 // c = fText->last32();
1644                 //   TODO:  why was this if needed?
1645                 c = UTEXT_PREVIOUS32(fText);
1646             }
1647             else {
1648                 c = UTEXT_PREVIOUS32(fText);
1649             }
1650         }
1651         else {
1652             do {
1653                 c = UTEXT_PREVIOUS32(fText);
1654                 UTRIE_GET16(&fData->fTrie, c, category);
1655             }
1656             while (c != U_SENTINEL && (category & 0x4000));
1657             // Back up to the last dictionary character
1658             if (c == U_SENTINEL) {
1659                 // c = fText->first32();
1660                 c = utext_current32(fText);
1661             }
1662             else {
1663                 utext_next32(fText);
1664                 c = utext_current32(fText);
1665             }
1666             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1667         }
1668         UTRIE_GET16(&fData->fTrie, c, category);
1669     }
1670 
1671     // Loop through the text, looking for ranges of dictionary characters.
1672     // For each span, find the appropriate break engine, and ask it to find
1673     // any breaks within the span.
1674     // Note: we always do this in the forward direction, so that the break
1675     // cache is built in the right order.
1676     if (reverse) {
1677         utext_setNativeIndex(fText, rangeStart);
1678         c = utext_current32(fText);
1679         UTRIE_GET16(&fData->fTrie, c, category);
1680     }
1681     while(U_SUCCESS(status)) {
1682         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1683             utext_next32(fText);           // TODO:  tweak for post-increment operation
1684             c = utext_current32(fText);
1685             UTRIE_GET16(&fData->fTrie, c, category);
1686         }
1687         if (current >= rangeEnd) {
1688             break;
1689         }
1690 
1691         // We now have a dictionary character. Get the appropriate language object
1692         // to deal with it.
1693         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1694 
1695         // Ask the language object if there are any breaks. It will leave the text
1696         // pointer on the other side of its range, ready to search for the next one.
1697         if (lbe != NULL) {
1698             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1699         }
1700 
1701         // Reload the loop variables for the next go-round
1702         c = utext_current32(fText);
1703         UTRIE_GET16(&fData->fTrie, c, category);
1704     }
1705 
1706     // If we found breaks, build a new break cache. The first and last entries must
1707     // be the original starting and ending position.
1708     if (foundBreakCount > 0) {
1709         int32_t totalBreaks = foundBreakCount;
1710         if (startPos < breaks.elementAti(0)) {
1711             totalBreaks += 1;
1712         }
1713         if (endPos > breaks.peeki()) {
1714             totalBreaks += 1;
1715         }
1716         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1717         if (fCachedBreakPositions != NULL) {
1718             int32_t out = 0;
1719             fNumCachedBreakPositions = totalBreaks;
1720             if (startPos < breaks.elementAti(0)) {
1721                 fCachedBreakPositions[out++] = startPos;
1722             }
1723             for (int32_t i = 0; i < foundBreakCount; ++i) {
1724                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1725             }
1726             if (endPos > fCachedBreakPositions[out-1]) {
1727                 fCachedBreakPositions[out] = endPos;
1728             }
1729             // If there are breaks, then by definition, we are replacing the original
1730             // proposed break by one of the breaks we found. Use following() and
1731             // preceding() to do the work. They should never recurse in this case.
1732             if (reverse) {
1733                 return preceding(endPos - 1);
1734             }
1735             else {
1736                 return following(startPos);
1737             }
1738         }
1739         // If the allocation failed, just fall through to the "no breaks found" case.
1740     }
1741 
1742     // If we get here, there were no language-based breaks. Set the text pointer
1743     // to the original proposed break.
1744     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1745     return (reverse ? startPos : endPos);
1746 }
1747 
1748 U_NAMESPACE_END
1749 
1750 // defined in ucln_cmn.h
1751 
1752 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1753 
1754 /**
1755  * Release all static memory held by breakiterator.
1756  */
1757 U_CDECL_BEGIN
breakiterator_cleanup_dict(void)1758 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1759     if (gLanguageBreakFactories) {
1760         delete gLanguageBreakFactories;
1761         gLanguageBreakFactories = NULL;
1762     }
1763     return TRUE;
1764 }
1765 U_CDECL_END
1766 
1767 U_CDECL_BEGIN
_deleteFactory(void * obj)1768 static void U_CALLCONV _deleteFactory(void *obj) {
1769     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
1770 }
1771 U_CDECL_END
1772 U_NAMESPACE_BEGIN
1773 
1774 static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c,int32_t breakType)1775 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1776 {
1777     UBool       needsInit;
1778     UErrorCode  status = U_ZERO_ERROR;
1779     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1780 
1781     if (needsInit) {
1782         UStack  *factories = new UStack(_deleteFactory, NULL, status);
1783         if (factories != NULL && U_SUCCESS(status)) {
1784             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1785             factories->push(builtIn, status);
1786 #ifdef U_LOCAL_SERVICE_HOOK
1787             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1788             if (extra != NULL) {
1789                 factories->push(extra, status);
1790             }
1791 #endif
1792         }
1793         umtx_lock(NULL);
1794         if (gLanguageBreakFactories == NULL) {
1795             gLanguageBreakFactories = factories;
1796             factories = NULL;
1797             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1798         }
1799         umtx_unlock(NULL);
1800         delete factories;
1801     }
1802 
1803     if (gLanguageBreakFactories == NULL) {
1804         return NULL;
1805     }
1806 
1807     int32_t i = gLanguageBreakFactories->size();
1808     const LanguageBreakEngine *lbe = NULL;
1809     while (--i >= 0) {
1810         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1811         lbe = factory->getEngineFor(c, breakType);
1812         if (lbe != NULL) {
1813             break;
1814         }
1815     }
1816     return lbe;
1817 }
1818 
1819 
1820 //-------------------------------------------------------------------------------
1821 //
1822 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1823 //                          the characer c.
1824 //
1825 //-------------------------------------------------------------------------------
1826 const LanguageBreakEngine *
getLanguageBreakEngine(UChar32 c)1827 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1828     const LanguageBreakEngine *lbe = NULL;
1829     UErrorCode status = U_ZERO_ERROR;
1830 
1831     if (fLanguageBreakEngines == NULL) {
1832         fLanguageBreakEngines = new UStack(status);
1833         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1834             delete fLanguageBreakEngines;
1835             fLanguageBreakEngines = 0;
1836             return NULL;
1837         }
1838     }
1839 
1840     int32_t i = fLanguageBreakEngines->size();
1841     while (--i >= 0) {
1842         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1843         if (lbe->handles(c, fBreakType)) {
1844             return lbe;
1845         }
1846     }
1847 
1848     // No existing dictionary took the character. See if a factory wants to
1849     // give us a new LanguageBreakEngine for this character.
1850     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1851 
1852     // If we got one, use it and push it on our stack.
1853     if (lbe != NULL) {
1854         fLanguageBreakEngines->push((void *)lbe, status);
1855         // Even if we can't remember it, we can keep looking it up, so
1856         // return it even if the push fails.
1857         return lbe;
1858     }
1859 
1860     // No engine is forthcoming for this character. Add it to the
1861     // reject set. Create the reject break engine if needed.
1862     if (fUnhandledBreakEngine == NULL) {
1863         fUnhandledBreakEngine = new UnhandledEngine(status);
1864         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1865             status = U_MEMORY_ALLOCATION_ERROR;
1866         }
1867         // Put it last so that scripts for which we have an engine get tried
1868         // first.
1869         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1870         // If we can't insert it, or creation failed, get rid of it
1871         if (U_FAILURE(status)) {
1872             delete fUnhandledBreakEngine;
1873             fUnhandledBreakEngine = 0;
1874             return NULL;
1875         }
1876     }
1877 
1878     // Tell the reject engine about the character; at its discretion, it may
1879     // add more than just the one character.
1880     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1881 
1882     return fUnhandledBreakEngine;
1883 }
1884 
1885 
1886 
1887 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1888     return fBreakType;
1889 }*/
1890 
setBreakType(int32_t type)1891 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1892     fBreakType = type;
1893     reset();
1894 }
1895 
1896 U_NAMESPACE_END
1897 
1898 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1899