• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ***************************************************************************
3 *   Copyright (C) 1999-2010 International Business Machines Corporation
4 *   and others. All rights reserved.
5 ***************************************************************************
6 */
7 //
8 //  file:  rbbi.c    Contains the implementation of the rule based break iterator
9 //                   runtime engine and the API implementation for
10 //                   class RuleBasedBreakIterator
11 //
12 
13 #include <typeinfo>  // for 'typeid' to work
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_BREAK_ITERATION
18 
19 #include "unicode/rbbi.h"
20 #include "unicode/schriter.h"
21 #include "unicode/uchriter.h"
22 #include "unicode/udata.h"
23 #include "unicode/uclean.h"
24 #include "rbbidata.h"
25 #include "rbbirb.h"
26 #include "cmemory.h"
27 #include "cstring.h"
28 #include "umutex.h"
29 #include "ucln_cmn.h"
30 #include "brkeng.h"
31 
32 #include "uassert.h"
33 #include "uvector.h"
34 
35 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
36 #if U_LOCAL_SERVICE_HOOK
37 #include "localsvc.h"
38 #endif
39 
40 #ifdef RBBI_DEBUG
41 static UBool fTrace = FALSE;
42 #endif
43 
44 U_NAMESPACE_BEGIN
45 
46 // The state number of the starting state
47 #define START_STATE 1
48 
49 // The state-transition value indicating "stop"
50 #define STOP_STATE  0
51 
52 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
54 
55 
56 //=======================================================================
57 // constructors
58 //=======================================================================
59 
60 /**
61  * Constructs a RuleBasedBreakIterator that uses the already-created
62  * tables object that is passed in as a parameter.
63  */
64 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
65 {
66     init();
67     fData = new RBBIDataWrapper(data, status); // status checked in constructor
68     if (U_FAILURE(status)) {return;}
69     if(fData == 0) {
70         status = U_MEMORY_ALLOCATION_ERROR;
71         return;
72     }
73 }
74 
75 /**
76  * Same as above but does not adopt memory
77  */
RuleBasedBreakIterator(const RBBIDataHeader * data,enum EDontAdopt,UErrorCode & status)78 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
79 {
80     init();
81     fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
82     if (U_FAILURE(status)) {return;}
83     if(fData == 0) {
84         status = U_MEMORY_ALLOCATION_ERROR;
85         return;
86     }
87 }
88 
89 //-------------------------------------------------------------------------------
90 //
91 //   Constructor   from a UDataMemory handle to precompiled break rules
92 //                 stored in an ICU data file.
93 //
94 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(UDataMemory * udm,UErrorCode & status)95 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
96 {
97     init();
98     fData = new RBBIDataWrapper(udm, status); // status checked in constructor
99     if (U_FAILURE(status)) {return;}
100     if(fData == 0) {
101         status = U_MEMORY_ALLOCATION_ERROR;
102         return;
103     }
104 }
105 
106 
107 
108 //-------------------------------------------------------------------------------
109 //
110 //   Constructor       from a set of rules supplied as a string.
111 //
112 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const UnicodeString & rules,UParseError & parseError,UErrorCode & status)113 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
114                                                 UParseError          &parseError,
115                                                 UErrorCode           &status)
116 {
117     init();
118     if (U_FAILURE(status)) {return;}
119     RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
120         RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
121     // Note:  This is a bit awkward.  The RBBI ruleBuilder has a factory method that
122     //        creates and returns a complete RBBI.  From here, in a constructor, we
123     //        can't just return the object created by the builder factory, hence
124     //        the assignment of the factory created object to "this".
125     if (U_SUCCESS(status)) {
126         *this = *bi;
127         delete bi;
128     }
129 }
130 
131 
132 //-------------------------------------------------------------------------------
133 //
134 // Default Constructor.      Create an empty shell that can be set up later.
135 //                           Used when creating a RuleBasedBreakIterator from a set
136 //                           of rules.
137 //-------------------------------------------------------------------------------
RuleBasedBreakIterator()138 RuleBasedBreakIterator::RuleBasedBreakIterator() {
139     init();
140 }
141 
142 
143 //-------------------------------------------------------------------------------
144 //
145 //   Copy constructor.  Will produce a break iterator with the same behavior,
146 //                      and which iterates over the same text, as the one passed in.
147 //
148 //-------------------------------------------------------------------------------
RuleBasedBreakIterator(const RuleBasedBreakIterator & other)149 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
150 : BreakIterator(other)
151 {
152     this->init();
153     *this = other;
154 }
155 
156 
157 /**
158  * Destructor
159  */
~RuleBasedBreakIterator()160 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
161     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
162         // fCharIter was adopted from the outside.
163         delete fCharIter;
164     }
165     fCharIter = NULL;
166     delete fSCharIter;
167     fCharIter = NULL;
168     delete fDCharIter;
169     fDCharIter = NULL;
170 
171     utext_close(fText);
172 
173     if (fData != NULL) {
174         fData->removeReference();
175         fData = NULL;
176     }
177     if (fCachedBreakPositions) {
178         uprv_free(fCachedBreakPositions);
179         fCachedBreakPositions = NULL;
180     }
181     if (fLanguageBreakEngines) {
182         delete fLanguageBreakEngines;
183         fLanguageBreakEngines = NULL;
184     }
185     if (fUnhandledBreakEngine) {
186         delete fUnhandledBreakEngine;
187         fUnhandledBreakEngine = NULL;
188     }
189 }
190 
191 /**
192  * Assignment operator.  Sets this iterator to have the same behavior,
193  * and iterate over the same text, as the one passed in.
194  */
195 RuleBasedBreakIterator&
operator =(const RuleBasedBreakIterator & that)196 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
197     if (this == &that) {
198         return *this;
199     }
200     reset();    // Delete break cache information
201     fBreakType = that.fBreakType;
202     if (fLanguageBreakEngines != NULL) {
203         delete fLanguageBreakEngines;
204         fLanguageBreakEngines = NULL;   // Just rebuild for now
205     }
206     // TODO: clone fLanguageBreakEngines from "that"
207     UErrorCode status = U_ZERO_ERROR;
208     fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
209 
210     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
211         delete fCharIter;
212     }
213     fCharIter = NULL;
214 
215     if (that.fCharIter != NULL ) {
216         // This is a little bit tricky - it will intially appear that
217         //  this->fCharIter is adopted, even if that->fCharIter was
218         //  not adopted.  That's ok.
219         fCharIter = that.fCharIter->clone();
220     }
221 
222     if (fData != NULL) {
223         fData->removeReference();
224         fData = NULL;
225     }
226     if (that.fData != NULL) {
227         fData = that.fData->addReference();
228     }
229 
230     return *this;
231 }
232 
233 
234 
235 //-----------------------------------------------------------------------------
236 //
237 //    init()      Shared initialization routine.   Used by all the constructors.
238 //                Initializes all fields, leaving the object in a consistent state.
239 //
240 //-----------------------------------------------------------------------------
init()241 void RuleBasedBreakIterator::init() {
242     UErrorCode  status    = U_ZERO_ERROR;
243     fBufferClone          = FALSE;
244     fText                 = utext_openUChars(NULL, NULL, 0, &status);
245     fCharIter             = NULL;
246     fSCharIter            = NULL;
247     fDCharIter            = NULL;
248     fData                 = NULL;
249     fLastRuleStatusIndex  = 0;
250     fLastStatusIndexValid = TRUE;
251     fDictionaryCharCount  = 0;
252     fBreakType            = UBRK_WORD;  // Defaulting BreakType to word gives reasonable
253                                         //   dictionary behavior for Break Iterators that are
254                                         //   built from rules.  Even better would be the ability to
255                                         //   declare the type in the rules.
256 
257     fCachedBreakPositions    = NULL;
258     fLanguageBreakEngines    = NULL;
259     fUnhandledBreakEngine    = NULL;
260     fNumCachedBreakPositions = 0;
261     fPositionInCache         = 0;
262 
263 #ifdef RBBI_DEBUG
264     static UBool debugInitDone = FALSE;
265     if (debugInitDone == FALSE) {
266         char *debugEnv = getenv("U_RBBIDEBUG");
267         if (debugEnv && uprv_strstr(debugEnv, "trace")) {
268             fTrace = TRUE;
269         }
270         debugInitDone = TRUE;
271     }
272 #endif
273 }
274 
275 
276 
277 //-----------------------------------------------------------------------------
278 //
279 //    clone - Returns a newly-constructed RuleBasedBreakIterator with the same
280 //            behavior, and iterating over the same text, as this one.
281 //            Virtual function: does the right thing with subclasses.
282 //
283 //-----------------------------------------------------------------------------
284 BreakIterator*
clone(void) const285 RuleBasedBreakIterator::clone(void) const {
286     return new RuleBasedBreakIterator(*this);
287 }
288 
289 /**
290  * Equality operator.  Returns TRUE if both BreakIterators are of the
291  * same class, have the same behavior, and iterate over the same text.
292  */
293 UBool
operator ==(const BreakIterator & that) const294 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
295     if (typeid(*this) != typeid(that)) {
296         return FALSE;
297     }
298 
299     const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
300 
301     if (!utext_equals(fText, that2.fText)) {
302         // The two break iterators are operating on different text,
303         //   or have a different interation position.
304         return FALSE;
305     };
306 
307     // TODO:  need a check for when in a dictionary region at different offsets.
308 
309     if (that2.fData == fData ||
310         (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
311             // The two break iterators are using the same rules.
312             return TRUE;
313         }
314     return FALSE;
315 }
316 
317 /**
318  * Compute a hash code for this BreakIterator
319  * @return A hash code
320  */
321 int32_t
hashCode(void) const322 RuleBasedBreakIterator::hashCode(void) const {
323     int32_t   hash = 0;
324     if (fData != NULL) {
325         hash = fData->hashCode();
326     }
327     return hash;
328 }
329 
330 
setText(UText * ut,UErrorCode & status)331 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
332     if (U_FAILURE(status)) {
333         return;
334     }
335     reset();
336     fText = utext_clone(fText, ut, FALSE, TRUE, &status);
337 
338     // Set up a dummy CharacterIterator to be returned if anyone
339     //   calls getText().  With input from UText, there is no reasonable
340     //   way to return a characterIterator over the actual input text.
341     //   Return one over an empty string instead - this is the closest
342     //   we can come to signaling a failure.
343     //   (GetText() is obsolete, this failure is sort of OK)
344     if (fDCharIter == NULL) {
345         static const UChar c = 0;
346         fDCharIter = new UCharCharacterIterator(&c, 0);
347         if (fDCharIter == NULL) {
348             status = U_MEMORY_ALLOCATION_ERROR;
349             return;
350         }
351     }
352 
353     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
354         // existing fCharIter was adopted from the outside.  Delete it now.
355         delete fCharIter;
356     }
357     fCharIter = fDCharIter;
358 
359     this->first();
360 }
361 
362 
getUText(UText * fillIn,UErrorCode & status) const363 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
364     UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
365     return result;
366 }
367 
368 
369 
370 /**
371  * Returns the description used to create this iterator
372  */
373 const UnicodeString&
getRules() const374 RuleBasedBreakIterator::getRules() const {
375     if (fData != NULL) {
376         return fData->getRuleSourceString();
377     } else {
378         static const UnicodeString *s;
379         if (s == NULL) {
380             // TODO:  something more elegant here.
381             //        perhaps API should return the string by value.
382             //        Note:  thread unsafe init & leak are semi-ok, better than
383             //               what was before.  Sould be cleaned up, though.
384             s = new UnicodeString;
385         }
386         return *s;
387     }
388 }
389 
390 //=======================================================================
391 // BreakIterator overrides
392 //=======================================================================
393 
394 /**
395  * Return a CharacterIterator over the text being analyzed.
396  */
397 CharacterIterator&
getText() const398 RuleBasedBreakIterator::getText() const {
399     return *fCharIter;
400 }
401 
402 /**
403  * Set the iterator to analyze a new piece of text.  This function resets
404  * the current iteration position to the beginning of the text.
405  * @param newText An iterator over the text to analyze.
406  */
407 void
adoptText(CharacterIterator * newText)408 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
409     // If we are holding a CharacterIterator adopted from a
410     //   previous call to this function, delete it now.
411     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
412         delete fCharIter;
413     }
414 
415     fCharIter = newText;
416     UErrorCode status = U_ZERO_ERROR;
417     reset();
418     if (newText==NULL || newText->startIndex() != 0) {
419         // startIndex !=0 wants to be an error, but there's no way to report it.
420         // Make the iterator text be an empty string.
421         fText = utext_openUChars(fText, NULL, 0, &status);
422     } else {
423         fText = utext_openCharacterIterator(fText, newText, &status);
424     }
425     this->first();
426 }
427 
428 /**
429  * Set the iterator to analyze a new piece of text.  This function resets
430  * the current iteration position to the beginning of the text.
431  * @param newText An iterator over the text to analyze.
432  */
433 void
setText(const UnicodeString & newText)434 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
435     UErrorCode status = U_ZERO_ERROR;
436     reset();
437     fText = utext_openConstUnicodeString(fText, &newText, &status);
438 
439     // Set up a character iterator on the string.
440     //   Needed in case someone calls getText().
441     //  Can not, unfortunately, do this lazily on the (probably never)
442     //  call to getText(), because getText is const.
443     if (fSCharIter == NULL) {
444         fSCharIter = new StringCharacterIterator(newText);
445     } else {
446         fSCharIter->setText(newText);
447     }
448 
449     if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
450         // old fCharIter was adopted from the outside.  Delete it.
451         delete fCharIter;
452     }
453     fCharIter = fSCharIter;
454 
455     this->first();
456 }
457 
458 
459 
460 /**
461  * Sets the current iteration position to the beginning of the text.
462  * @return The offset of the beginning of the text.
463  */
first(void)464 int32_t RuleBasedBreakIterator::first(void) {
465     reset();
466     fLastRuleStatusIndex  = 0;
467     fLastStatusIndexValid = TRUE;
468     //if (fText == NULL)
469     //    return BreakIterator::DONE;
470 
471     utext_setNativeIndex(fText, 0);
472     return 0;
473 }
474 
475 /**
476  * Sets the current iteration position to the end of the text.
477  * @return The text's past-the-end offset.
478  */
last(void)479 int32_t RuleBasedBreakIterator::last(void) {
480     reset();
481     if (fText == NULL) {
482         fLastRuleStatusIndex  = 0;
483         fLastStatusIndexValid = TRUE;
484         return BreakIterator::DONE;
485     }
486 
487     fLastStatusIndexValid = FALSE;
488     int32_t pos = (int32_t)utext_nativeLength(fText);
489     utext_setNativeIndex(fText, pos);
490     return pos;
491 }
492 
493 /**
494  * Advances the iterator either forward or backward the specified number of steps.
495  * Negative values move backward, and positive values move forward.  This is
496  * equivalent to repeatedly calling next() or previous().
497  * @param n The number of steps to move.  The sign indicates the direction
498  * (negative is backwards, and positive is forwards).
499  * @return The character offset of the boundary position n boundaries away from
500  * the current one.
501  */
next(int32_t n)502 int32_t RuleBasedBreakIterator::next(int32_t n) {
503     int32_t result = current();
504     while (n > 0) {
505         result = next();
506         --n;
507     }
508     while (n < 0) {
509         result = previous();
510         ++n;
511     }
512     return result;
513 }
514 
515 /**
516  * Advances the iterator to the next boundary position.
517  * @return The position of the first boundary after this one.
518  */
next(void)519 int32_t RuleBasedBreakIterator::next(void) {
520     // if we have cached break positions and we're still in the range
521     // covered by them, just move one step forward in the cache
522     if (fCachedBreakPositions != NULL) {
523         if (fPositionInCache < fNumCachedBreakPositions - 1) {
524             ++fPositionInCache;
525             int32_t pos = fCachedBreakPositions[fPositionInCache];
526             utext_setNativeIndex(fText, pos);
527             return pos;
528         }
529         else {
530             reset();
531         }
532     }
533 
534     int32_t startPos = current();
535     int32_t result = handleNext(fData->fForwardTable);
536     if (fDictionaryCharCount > 0) {
537         result = checkDictionary(startPos, result, FALSE);
538     }
539     return result;
540 }
541 
542 /**
543  * Advances the iterator backwards, to the last boundary preceding this one.
544  * @return The position of the last boundary position preceding this one.
545  */
previous(void)546 int32_t RuleBasedBreakIterator::previous(void) {
547     int32_t result;
548     int32_t startPos;
549 
550     // if we have cached break positions and we're still in the range
551     // covered by them, just move one step backward in the cache
552     if (fCachedBreakPositions != NULL) {
553         if (fPositionInCache > 0) {
554             --fPositionInCache;
555             // If we're at the beginning of the cache, need to reevaluate the
556             // rule status
557             if (fPositionInCache <= 0) {
558                 fLastStatusIndexValid = FALSE;
559             }
560             int32_t pos = fCachedBreakPositions[fPositionInCache];
561             utext_setNativeIndex(fText, pos);
562             return pos;
563         }
564         else {
565             reset();
566         }
567     }
568 
569     // if we're already sitting at the beginning of the text, return DONE
570     if (fText == NULL || (startPos = current()) == 0) {
571         fLastRuleStatusIndex  = 0;
572         fLastStatusIndexValid = TRUE;
573         return BreakIterator::DONE;
574     }
575 
576     if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
577         result = handlePrevious(fData->fReverseTable);
578         if (fDictionaryCharCount > 0) {
579             result = checkDictionary(result, startPos, TRUE);
580         }
581         return result;
582     }
583 
584     // old rule syntax
585     // set things up.  handlePrevious() will back us up to some valid
586     // break position before the current position (we back our internal
587     // iterator up one step to prevent handlePrevious() from returning
588     // the current position), but not necessarily the last one before
589 
590     // where we started
591 
592     int32_t start = current();
593 
594     UTEXT_PREVIOUS32(fText);
595     int32_t lastResult    = handlePrevious(fData->fReverseTable);
596     if (lastResult == UBRK_DONE) {
597         lastResult = 0;
598         utext_setNativeIndex(fText, 0);
599     }
600     result = lastResult;
601     int32_t lastTag       = 0;
602     UBool   breakTagValid = FALSE;
603 
604     // iterate forward from the known break position until we pass our
605     // starting point.  The last break position before the starting
606     // point is our return value
607 
608     for (;;) {
609         result         = next();
610         if (result == BreakIterator::DONE || result >= start) {
611             break;
612         }
613         lastResult     = result;
614         lastTag        = fLastRuleStatusIndex;
615         breakTagValid  = TRUE;
616     }
617 
618     // fLastBreakTag wants to have the value for section of text preceding
619     // the result position that we are to return (in lastResult.)  If
620     // the backwards rules overshot and the above loop had to do two or more
621     // next()s to move up to the desired return position, we will have a valid
622     // tag value. But, if handlePrevious() took us to exactly the correct result positon,
623     // we wont have a tag value for that position, which is only set by handleNext().
624 
625     // set the current iteration position to be the last break position
626     // before where we started, and then return that value
627     utext_setNativeIndex(fText, lastResult);
628     fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
629     fLastStatusIndexValid = breakTagValid;
630 
631     // No need to check the dictionary; it will have been handled by
632     // next()
633 
634     return lastResult;
635 }
636 
637 /**
638  * Sets the iterator to refer to the first boundary position following
639  * the specified position.
640  * @offset The position from which to begin searching for a break position.
641  * @return The position of the first break after the current position.
642  */
following(int32_t offset)643 int32_t RuleBasedBreakIterator::following(int32_t offset) {
644     // if we have cached break positions and offset is in the range
645     // covered by them, use them
646     // TODO: could use binary search
647     // TODO: what if offset is outside range, but break is not?
648     if (fCachedBreakPositions != NULL) {
649         if (offset >= fCachedBreakPositions[0]
650                 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
651             fPositionInCache = 0;
652             // We are guaranteed not to leave the array due to range test above
653             while (offset >= fCachedBreakPositions[fPositionInCache]) {
654                 ++fPositionInCache;
655             }
656             int32_t pos = fCachedBreakPositions[fPositionInCache];
657             utext_setNativeIndex(fText, pos);
658             return pos;
659         }
660         else {
661             reset();
662         }
663     }
664 
665     // if the offset passed in is already past the end of the text,
666     // just return DONE; if it's before the beginning, return the
667     // text's starting offset
668     fLastRuleStatusIndex  = 0;
669     fLastStatusIndexValid = TRUE;
670     if (fText == NULL || offset >= utext_nativeLength(fText)) {
671         last();
672         return next();
673     }
674     else if (offset < 0) {
675         return first();
676     }
677 
678     // otherwise, set our internal iteration position (temporarily)
679     // to the position passed in.  If this is the _beginning_ position,
680     // then we can just use next() to get our return value
681 
682     int32_t result = 0;
683 
684     if (fData->fSafeRevTable != NULL) {
685         // new rule syntax
686         utext_setNativeIndex(fText, offset);
687         // move forward one codepoint to prepare for moving back to a
688         // safe point.
689         // this handles offset being between a supplementary character
690         UTEXT_NEXT32(fText);
691         // handlePrevious will move most of the time to < 1 boundary away
692         handlePrevious(fData->fSafeRevTable);
693         int32_t result = next();
694         while (result <= offset) {
695             result = next();
696         }
697         return result;
698     }
699     if (fData->fSafeFwdTable != NULL) {
700         // backup plan if forward safe table is not available
701         utext_setNativeIndex(fText, offset);
702         UTEXT_PREVIOUS32(fText);
703         // handle next will give result >= offset
704         handleNext(fData->fSafeFwdTable);
705         // previous will give result 0 or 1 boundary away from offset,
706         // most of the time
707         // we have to
708         int32_t oldresult = previous();
709         while (oldresult > offset) {
710             int32_t result = previous();
711             if (result <= offset) {
712                 return oldresult;
713             }
714             oldresult = result;
715         }
716         int32_t result = next();
717         if (result <= offset) {
718             return next();
719         }
720         return result;
721     }
722     // otherwise, we have to sync up first.  Use handlePrevious() to back
723     // up to a known break position before the specified position (if
724     // we can determine that the specified position is a break position,
725     // we don't back up at all).  This may or may not be the last break
726     // position at or before our starting position.  Advance forward
727     // from here until we've passed the starting position.  The position
728     // we stop on will be the first break position after the specified one.
729     // old rule syntax
730 
731     utext_setNativeIndex(fText, offset);
732     if (offset==0 ||
733         (offset==1  && utext_getNativeIndex(fText)==0)) {
734         return next();
735     }
736     result = previous();
737 
738     while (result != BreakIterator::DONE && result <= offset) {
739         result = next();
740     }
741 
742     return result;
743 }
744 
745 /**
746  * Sets the iterator to refer to the last boundary position before the
747  * specified position.
748  * @offset The position to begin searching for a break from.
749  * @return The position of the last boundary before the starting position.
750  */
preceding(int32_t offset)751 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
752     // if we have cached break positions and offset is in the range
753     // covered by them, use them
754     if (fCachedBreakPositions != NULL) {
755         // TODO: binary search?
756         // TODO: What if offset is outside range, but break is not?
757         if (offset > fCachedBreakPositions[0]
758                 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
759             fPositionInCache = 0;
760             while (fPositionInCache < fNumCachedBreakPositions
761                    && offset > fCachedBreakPositions[fPositionInCache])
762                 ++fPositionInCache;
763             --fPositionInCache;
764             // If we're at the beginning of the cache, need to reevaluate the
765             // rule status
766             if (fPositionInCache <= 0) {
767                 fLastStatusIndexValid = FALSE;
768             }
769             utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
770             return fCachedBreakPositions[fPositionInCache];
771         }
772         else {
773             reset();
774         }
775     }
776 
777     // if the offset passed in is already past the end of the text,
778     // just return DONE; if it's before the beginning, return the
779     // text's starting offset
780     if (fText == NULL || offset > utext_nativeLength(fText)) {
781         // return BreakIterator::DONE;
782         return last();
783     }
784     else if (offset < 0) {
785         return first();
786     }
787 
788     // if we start by updating the current iteration position to the
789     // position specified by the caller, we can just use previous()
790     // to carry out this operation
791 
792     if (fData->fSafeFwdTable != NULL) {
793         // new rule syntax
794         utext_setNativeIndex(fText, offset);
795         int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
796         if (newOffset != offset) {
797             // Will come here if specified offset was not a code point boundary AND
798             //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
799             //   indices to the containing code point.
800             // For breakitereator::preceding only, these non-code-point indices need to be moved
801             //   up to refer to the following codepoint.
802             UTEXT_NEXT32(fText);
803             offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
804         }
805 
806         // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
807         //        rather than adjusting the position unconditionally?
808         //        (Change would interact with safe rules.)
809         // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
810         //        affects only preceding(), seems cleaner, but is slightly different.
811         UTEXT_PREVIOUS32(fText);
812         handleNext(fData->fSafeFwdTable);
813         int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
814         while (result >= offset) {
815             result = previous();
816         }
817         return result;
818     }
819     if (fData->fSafeRevTable != NULL) {
820         // backup plan if forward safe table is not available
821         //  TODO:  check whether this path can be discarded
822         //         It's probably OK to say that rules must supply both safe tables
823         //            if they use safe tables at all.  We have certainly never described
824         //            to anyone how to work with just one safe table.
825         utext_setNativeIndex(fText, offset);
826         UTEXT_NEXT32(fText);
827 
828         // handle previous will give result <= offset
829         handlePrevious(fData->fSafeRevTable);
830 
831         // next will give result 0 or 1 boundary away from offset,
832         // most of the time
833         // we have to
834         int32_t oldresult = next();
835         while (oldresult < offset) {
836             int32_t result = next();
837             if (result >= offset) {
838                 return oldresult;
839             }
840             oldresult = result;
841         }
842         int32_t result = previous();
843         if (result >= offset) {
844             return previous();
845         }
846         return result;
847     }
848 
849     // old rule syntax
850     utext_setNativeIndex(fText, offset);
851     return previous();
852 }
853 
854 /**
855  * Returns true if the specfied position is a boundary position.  As a side
856  * effect, leaves the iterator pointing to the first boundary position at
857  * or after "offset".
858  * @param offset the offset to check.
859  * @return True if "offset" is a boundary position.
860  */
isBoundary(int32_t offset)861 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
862     // the beginning index of the iterator is always a boundary position by definition
863     if (offset == 0) {
864         first();       // For side effects on current position, tag values.
865         return TRUE;
866     }
867 
868     if (offset == (int32_t)utext_nativeLength(fText)) {
869         last();       // For side effects on current position, tag values.
870         return TRUE;
871     }
872 
873     // out-of-range indexes are never boundary positions
874     if (offset < 0) {
875         first();       // For side effects on current position, tag values.
876         return FALSE;
877     }
878 
879     if (offset > utext_nativeLength(fText)) {
880         last();        // For side effects on current position, tag values.
881         return FALSE;
882     }
883 
884     // otherwise, we can use following() on the position before the specified
885     // one and return true if the position we get back is the one the user
886     // specified
887     utext_previous32From(fText, offset);
888     int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
889     UBool    result  = following(backOne) == offset;
890     return result;
891 }
892 
893 /**
894  * Returns the current iteration position.
895  * @return The current iteration position.
896  */
current(void) const897 int32_t RuleBasedBreakIterator::current(void) const {
898     int32_t  pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
899     return pos;
900 }
901 
902 //=======================================================================
903 // implementation
904 //=======================================================================
905 
906 //
907 // RBBIRunMode  -  the state machine runs an extra iteration at the beginning and end
908 //                 of user text.  A variable with this enum type keeps track of where we
909 //                 are.  The state machine only fetches user input while in the RUN mode.
910 //
911 enum RBBIRunMode {
912     RBBI_START,     // state machine processing is before first char of input
913     RBBI_RUN,       // state machine processing is in the user text
914     RBBI_END        // state machine processing is after end of user text.
915 };
916 
917 
918 //-----------------------------------------------------------------------------------
919 //
920 //  handleNext(stateTable)
921 //     This method is the actual implementation of the rbbi next() method.
922 //     This method initializes the state machine to state 1
923 //     and advances through the text character by character until we reach the end
924 //     of the text or the state machine transitions to state 0.  We update our return
925 //     value every time the state machine passes through an accepting state.
926 //
927 //-----------------------------------------------------------------------------------
handleNext(const RBBIStateTable * statetable)928 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
929     int32_t             state;
930     int16_t             category        = 0;
931     RBBIRunMode         mode;
932 
933     RBBIStateTableRow  *row;
934     UChar32             c;
935     int32_t             lookaheadStatus = 0;
936     int32_t             lookaheadTagIdx = 0;
937     int32_t             result          = 0;
938     int32_t             initialPosition = 0;
939     int32_t             lookaheadResult = 0;
940     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
941     const char         *tableData       = statetable->fTableData;
942     uint32_t            tableRowLen     = statetable->fRowLen;
943 
944     #ifdef RBBI_DEBUG
945         if (fTrace) {
946             RBBIDebugPuts("Handle Next   pos   char  state category");
947         }
948     #endif
949 
950     // No matter what, handleNext alway correctly sets the break tag value.
951     fLastStatusIndexValid = TRUE;
952     fLastRuleStatusIndex = 0;
953 
954     // if we're already at the end of the text, return DONE.
955     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
956     result          = initialPosition;
957     c               = UTEXT_NEXT32(fText);
958     if (fData == NULL || c==U_SENTINEL) {
959         return BreakIterator::DONE;
960     }
961 
962     //  Set the initial state for the state machine
963     state = START_STATE;
964     row = (RBBIStateTableRow *)
965             //(statetable->fTableData + (statetable->fRowLen * state));
966             (tableData + tableRowLen * state);
967 
968 
969     mode     = RBBI_RUN;
970     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
971         category = 2;
972         mode     = RBBI_START;
973     }
974 
975 
976     // loop until we reach the end of the text or transition to state 0
977     //
978     for (;;) {
979         if (c == U_SENTINEL) {
980             // Reached end of input string.
981             if (mode == RBBI_END) {
982                 // We have already run the loop one last time with the
983                 //   character set to the psueudo {eof} value.  Now it is time
984                 //   to unconditionally bail out.
985                 if (lookaheadResult > result) {
986                     // We ran off the end of the string with a pending look-ahead match.
987                     // Treat this as if the look-ahead condition had been met, and return
988                     //  the match at the / position from the look-ahead rule.
989                     result               = lookaheadResult;
990                     fLastRuleStatusIndex = lookaheadTagIdx;
991                     lookaheadStatus = 0;
992                 }
993                 break;
994             }
995             // Run the loop one last time with the fake end-of-input character category.
996             mode = RBBI_END;
997             category = 1;
998         }
999 
1000         //
1001         // Get the char category.  An incoming category of 1 or 2 means that
1002         //      we are preset for doing the beginning or end of input, and
1003         //      that we shouldn't get a category from an actual text input character.
1004         //
1005         if (mode == RBBI_RUN) {
1006             // look up the current character's character category, which tells us
1007             // which column in the state table to look at.
1008             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1009             //        not the size of the character going in, which is a UChar32.
1010             //
1011             UTRIE_GET16(&fData->fTrie, c, category);
1012 
1013             // Check the dictionary bit in the character's category.
1014             //    Counter is only used by dictionary based iterators (subclasses).
1015             //    Chars that need to be handled by a dictionary have a flag bit set
1016             //    in their category values.
1017             //
1018             if ((category & 0x4000) != 0)  {
1019                 fDictionaryCharCount++;
1020                 //  And off the dictionary flag bit.
1021                 category &= ~0x4000;
1022             }
1023         }
1024 
1025         #ifdef RBBI_DEBUG
1026             if (fTrace) {
1027                 RBBIDebugPrintf("             %4ld   ", utext_getNativeIndex(fText));
1028                 if (0x20<=c && c<0x7f) {
1029                     RBBIDebugPrintf("\"%c\"  ", c);
1030                 } else {
1031                     RBBIDebugPrintf("%5x  ", c);
1032                 }
1033                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1034             }
1035         #endif
1036 
1037         // State Transition - move machine to its next state
1038         //
1039         state = row->fNextState[category];
1040         row = (RBBIStateTableRow *)
1041             // (statetable->fTableData + (statetable->fRowLen * state));
1042             (tableData + tableRowLen * state);
1043 
1044 
1045         if (row->fAccepting == -1) {
1046             // Match found, common case.
1047             if (mode != RBBI_START) {
1048                 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1049             }
1050             fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
1051         }
1052 
1053         if (row->fLookAhead != 0) {
1054             if (lookaheadStatus != 0
1055                 && row->fAccepting == lookaheadStatus) {
1056                 // Lookahead match is completed.
1057                 result               = lookaheadResult;
1058                 fLastRuleStatusIndex = lookaheadTagIdx;
1059                 lookaheadStatus      = 0;
1060                 // TODO:  make a standalone hard break in a rule work.
1061                 if (lookAheadHardBreak) {
1062                     UTEXT_SETNATIVEINDEX(fText, result);
1063                     return result;
1064                 }
1065                 // Look-ahead completed, but other rules may match further.  Continue on
1066                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1067                 goto continueOn;
1068             }
1069 
1070             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1071             lookaheadResult = r;
1072             lookaheadStatus = row->fLookAhead;
1073             lookaheadTagIdx = row->fTagIdx;
1074             goto continueOn;
1075         }
1076 
1077 
1078         if (row->fAccepting != 0) {
1079             // Because this is an accepting state, any in-progress look-ahead match
1080             //   is no longer relavant.  Clear out the pending lookahead status.
1081             lookaheadStatus = 0;           // clear out any pending look-ahead match.
1082         }
1083 
1084 continueOn:
1085         if (state == STOP_STATE) {
1086             // This is the normal exit from the lookup state machine.
1087             // We have advanced through the string until it is certain that no
1088             //   longer match is possible, no matter what characters follow.
1089             break;
1090         }
1091 
1092         // Advance to the next character.
1093         // If this is a beginning-of-input loop iteration, don't advance
1094         //    the input position.  The next iteration will be processing the
1095         //    first real input character.
1096         if (mode == RBBI_RUN) {
1097             c = UTEXT_NEXT32(fText);
1098         } else {
1099             if (mode == RBBI_START) {
1100                 mode = RBBI_RUN;
1101             }
1102         }
1103 
1104 
1105     }
1106 
1107     // The state machine is done.  Check whether it found a match...
1108 
1109     // If the iterator failed to advance in the match engine, force it ahead by one.
1110     //   (This really indicates a defect in the break rules.  They should always match
1111     //    at least one character.)
1112     if (result == initialPosition) {
1113         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1114         UTEXT_NEXT32(fText);
1115         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1116     }
1117 
1118     // Leave the iterator at our result position.
1119     UTEXT_SETNATIVEINDEX(fText, result);
1120     #ifdef RBBI_DEBUG
1121         if (fTrace) {
1122             RBBIDebugPrintf("result = %d\n\n", result);
1123         }
1124     #endif
1125     return result;
1126 }
1127 
1128 
1129 
1130 //-----------------------------------------------------------------------------------
1131 //
1132 //  handlePrevious()
1133 //
1134 //      Iterate backwards, according to the logic of the reverse rules.
1135 //      This version handles the exact style backwards rules.
1136 //
1137 //      The logic of this function is very similar to handleNext(), above.
1138 //
1139 //-----------------------------------------------------------------------------------
handlePrevious(const RBBIStateTable * statetable)1140 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1141     int32_t             state;
1142     int16_t             category        = 0;
1143     RBBIRunMode         mode;
1144     RBBIStateTableRow  *row;
1145     UChar32             c;
1146     int32_t             lookaheadStatus = 0;
1147     int32_t             result          = 0;
1148     int32_t             initialPosition = 0;
1149     int32_t             lookaheadResult = 0;
1150     UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1151 
1152     #ifdef RBBI_DEBUG
1153         if (fTrace) {
1154             RBBIDebugPuts("Handle Previous   pos   char  state category");
1155         }
1156     #endif
1157 
1158     // handlePrevious() never gets the rule status.
1159     // Flag the status as invalid; if the user ever asks for status, we will need
1160     // to back up, then re-find the break position using handleNext(), which does
1161     // get the status value.
1162     fLastStatusIndexValid = FALSE;
1163     fLastRuleStatusIndex = 0;
1164 
1165     // if we're already at the start of the text, return DONE.
1166     if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1167         return BreakIterator::DONE;
1168     }
1169 
1170     //  Set up the starting char.
1171     initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1172     result          = initialPosition;
1173     c               = UTEXT_PREVIOUS32(fText);
1174 
1175     //  Set the initial state for the state machine
1176     state = START_STATE;
1177     row = (RBBIStateTableRow *)
1178             (statetable->fTableData + (statetable->fRowLen * state));
1179     category = 3;
1180     mode     = RBBI_RUN;
1181     if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1182         category = 2;
1183         mode     = RBBI_START;
1184     }
1185 
1186 
1187     // loop until we reach the start of the text or transition to state 0
1188     //
1189     for (;;) {
1190         if (c == U_SENTINEL) {
1191             // Reached end of input string.
1192             if (mode == RBBI_END) {
1193                 // We have already run the loop one last time with the
1194                 //   character set to the psueudo {eof} value.  Now it is time
1195                 //   to unconditionally bail out.
1196                 if (lookaheadResult < result) {
1197                     // We ran off the end of the string with a pending look-ahead match.
1198                     // Treat this as if the look-ahead condition had been met, and return
1199                     //  the match at the / position from the look-ahead rule.
1200                     result               = lookaheadResult;
1201                     lookaheadStatus = 0;
1202                 } else if (result == initialPosition) {
1203                     // Ran off start, no match found.
1204                     // move one index one (towards the start, since we are doing a previous())
1205                     UTEXT_SETNATIVEINDEX(fText, initialPosition);
1206                     UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
1207                 }
1208                 break;
1209             }
1210             // Run the loop one last time with the fake end-of-input character category.
1211             mode = RBBI_END;
1212             category = 1;
1213         }
1214 
1215         //
1216         // Get the char category.  An incoming category of 1 or 2 means that
1217         //      we are preset for doing the beginning or end of input, and
1218         //      that we shouldn't get a category from an actual text input character.
1219         //
1220         if (mode == RBBI_RUN) {
1221             // look up the current character's character category, which tells us
1222             // which column in the state table to look at.
1223             // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
1224             //        not the size of the character going in, which is a UChar32.
1225             //
1226             UTRIE_GET16(&fData->fTrie, c, category);
1227 
1228             // Check the dictionary bit in the character's category.
1229             //    Counter is only used by dictionary based iterators (subclasses).
1230             //    Chars that need to be handled by a dictionary have a flag bit set
1231             //    in their category values.
1232             //
1233             if ((category & 0x4000) != 0)  {
1234                 fDictionaryCharCount++;
1235                 //  And off the dictionary flag bit.
1236                 category &= ~0x4000;
1237             }
1238         }
1239 
1240         #ifdef RBBI_DEBUG
1241             if (fTrace) {
1242                 RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
1243                 if (0x20<=c && c<0x7f) {
1244                     RBBIDebugPrintf("\"%c\"  ", c);
1245                 } else {
1246                     RBBIDebugPrintf("%5x  ", c);
1247                 }
1248                 RBBIDebugPrintf("%3d  %3d\n", state, category);
1249             }
1250         #endif
1251 
1252         // State Transition - move machine to its next state
1253         //
1254         state = row->fNextState[category];
1255         row = (RBBIStateTableRow *)
1256             (statetable->fTableData + (statetable->fRowLen * state));
1257 
1258         if (row->fAccepting == -1) {
1259             // Match found, common case.
1260             result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1261         }
1262 
1263         if (row->fLookAhead != 0) {
1264             if (lookaheadStatus != 0
1265                 && row->fAccepting == lookaheadStatus) {
1266                 // Lookahead match is completed.
1267                 result               = lookaheadResult;
1268                 lookaheadStatus      = 0;
1269                 // TODO:  make a standalone hard break in a rule work.
1270                 if (lookAheadHardBreak) {
1271                     UTEXT_SETNATIVEINDEX(fText, result);
1272                     return result;
1273                 }
1274                 // Look-ahead completed, but other rules may match further.  Continue on
1275                 //  TODO:  junk this feature?  I don't think it's used anywhwere.
1276                 goto continueOn;
1277             }
1278 
1279             int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1280             lookaheadResult = r;
1281             lookaheadStatus = row->fLookAhead;
1282             goto continueOn;
1283         }
1284 
1285 
1286         if (row->fAccepting != 0) {
1287             // Because this is an accepting state, any in-progress look-ahead match
1288             //   is no longer relavant.  Clear out the pending lookahead status.
1289             lookaheadStatus = 0;
1290         }
1291 
1292 continueOn:
1293         if (state == STOP_STATE) {
1294             // This is the normal exit from the lookup state machine.
1295             // We have advanced through the string until it is certain that no
1296             //   longer match is possible, no matter what characters follow.
1297             break;
1298         }
1299 
1300         // Move (backwards) to the next character to process.
1301         // If this is a beginning-of-input loop iteration, don't advance
1302         //    the input position.  The next iteration will be processing the
1303         //    first real input character.
1304         if (mode == RBBI_RUN) {
1305             c = UTEXT_PREVIOUS32(fText);
1306         } else {
1307             if (mode == RBBI_START) {
1308                 mode = RBBI_RUN;
1309             }
1310         }
1311     }
1312 
1313     // The state machine is done.  Check whether it found a match...
1314 
1315     // If the iterator failed to advance in the match engine, force it ahead by one.
1316     //   (This really indicates a defect in the break rules.  They should always match
1317     //    at least one character.)
1318     if (result == initialPosition) {
1319         UTEXT_SETNATIVEINDEX(fText, initialPosition);
1320         UTEXT_PREVIOUS32(fText);
1321         result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1322     }
1323 
1324     // Leave the iterator at our result position.
1325     UTEXT_SETNATIVEINDEX(fText, result);
1326     #ifdef RBBI_DEBUG
1327         if (fTrace) {
1328             RBBIDebugPrintf("result = %d\n\n", result);
1329         }
1330     #endif
1331     return result;
1332 }
1333 
1334 
1335 void
reset()1336 RuleBasedBreakIterator::reset()
1337 {
1338     if (fCachedBreakPositions) {
1339         uprv_free(fCachedBreakPositions);
1340     }
1341     fCachedBreakPositions = NULL;
1342     fNumCachedBreakPositions = 0;
1343     fDictionaryCharCount = 0;
1344     fPositionInCache = 0;
1345 }
1346 
1347 
1348 
1349 //-------------------------------------------------------------------------------
1350 //
1351 //   getRuleStatus()   Return the break rule tag associated with the current
1352 //                     iterator position.  If the iterator arrived at its current
1353 //                     position by iterating forwards, the value will have been
1354 //                     cached by the handleNext() function.
1355 //
1356 //                     If no cached status value is available, the status is
1357 //                     found by doing a previous() followed by a next(), which
1358 //                     leaves the iterator where it started, and computes the
1359 //                     status while doing the next().
1360 //
1361 //-------------------------------------------------------------------------------
makeRuleStatusValid()1362 void RuleBasedBreakIterator::makeRuleStatusValid() {
1363     if (fLastStatusIndexValid == FALSE) {
1364         //  No cached status is available.
1365         if (fText == NULL || current() == 0) {
1366             //  At start of text, or there is no text.  Status is always zero.
1367             fLastRuleStatusIndex = 0;
1368             fLastStatusIndexValid = TRUE;
1369         } else {
1370             //  Not at start of text.  Find status the tedious way.
1371             int32_t pa = current();
1372             previous();
1373             if (fNumCachedBreakPositions > 0) {
1374                 reset();                // Blow off the dictionary cache
1375             }
1376             int32_t pb = next();
1377             if (pa != pb) {
1378                 // note: the if (pa != pb) test is here only to eliminate warnings for
1379                 //       unused local variables on gcc.  Logically, it isn't needed.
1380                 U_ASSERT(pa == pb);
1381             }
1382         }
1383     }
1384     U_ASSERT(fLastRuleStatusIndex >= 0  &&  fLastRuleStatusIndex < fData->fStatusMaxIdx);
1385 }
1386 
1387 
getRuleStatus() const1388 int32_t  RuleBasedBreakIterator::getRuleStatus() const {
1389     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1390     nonConstThis->makeRuleStatusValid();
1391 
1392     // fLastRuleStatusIndex indexes to the start of the appropriate status record
1393     //                                                 (the number of status values.)
1394     //   This function returns the last (largest) of the array of status values.
1395     int32_t  idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1396     int32_t  tagVal = fData->fRuleStatusTable[idx];
1397 
1398     return tagVal;
1399 }
1400 
1401 
1402 
1403 
getRuleStatusVec(int32_t * fillInVec,int32_t capacity,UErrorCode & status)1404 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1405              int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1406 {
1407     if (U_FAILURE(status)) {
1408         return 0;
1409     }
1410 
1411     RuleBasedBreakIterator *nonConstThis  = (RuleBasedBreakIterator *)this;
1412     nonConstThis->makeRuleStatusValid();
1413     int32_t  numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1414     int32_t  numValsToCopy = numVals;
1415     if (numVals > capacity) {
1416         status = U_BUFFER_OVERFLOW_ERROR;
1417         numValsToCopy = capacity;
1418     }
1419     int i;
1420     for (i=0; i<numValsToCopy; i++) {
1421         fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1422     }
1423     return numVals;
1424 }
1425 
1426 
1427 
1428 //-------------------------------------------------------------------------------
1429 //
1430 //   getBinaryRules        Access to the compiled form of the rules,
1431 //                         for use by build system tools that save the data
1432 //                         for standard iterator types.
1433 //
1434 //-------------------------------------------------------------------------------
getBinaryRules(uint32_t & length)1435 const uint8_t  *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1436     const uint8_t  *retPtr = NULL;
1437     length = 0;
1438 
1439     if (fData != NULL) {
1440         retPtr = (const uint8_t *)fData->fHeader;
1441         length = fData->fHeader->fLength;
1442     }
1443     return retPtr;
1444 }
1445 
1446 
1447 
1448 
1449 //-------------------------------------------------------------------------------
1450 //
1451 //  BufferClone       TODO:  In my (Andy) opinion, this function should be deprecated.
1452 //                    Saving one heap allocation isn't worth the trouble.
1453 //                    Cloning shouldn't be done in tight loops, and
1454 //                    making the clone copy involves other heap operations anyway.
1455 //                    And the application code for correctly dealing with buffer
1456 //                    size problems and the eventual object destruction is ugly.
1457 //
1458 //-------------------------------------------------------------------------------
createBufferClone(void * stackBuffer,int32_t & bufferSize,UErrorCode & status)1459 BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1460                                    int32_t &bufferSize,
1461                                    UErrorCode &status)
1462 {
1463     if (U_FAILURE(status)){
1464         return NULL;
1465     }
1466 
1467     //
1468     //  If user buffer size is zero this is a preflight operation to
1469     //    obtain the needed buffer size, allowing for worst case misalignment.
1470     //
1471     if (bufferSize == 0) {
1472         bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1473         return NULL;
1474     }
1475 
1476 
1477     //
1478     //  Check the alignment and size of the user supplied buffer.
1479     //  Allocate heap memory if the user supplied memory is insufficient.
1480     //
1481     char    *buf   = (char *)stackBuffer;
1482     uint32_t s      = bufferSize;
1483 
1484     if (stackBuffer == NULL) {
1485         s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
1486     }
1487     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1488         uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1489         s   -= offsetUp;
1490         buf += offsetUp;
1491     }
1492     if (s < sizeof(RuleBasedBreakIterator)) {
1493         // Not enough room in the caller-supplied buffer.
1494         // Do a plain-vanilla heap based clone and return that, along with
1495         //   a warning that the clone was allocated.
1496         RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1497         if (clonedBI == 0) {
1498             status = U_MEMORY_ALLOCATION_ERROR;
1499         } else {
1500             status = U_SAFECLONE_ALLOCATED_WARNING;
1501         }
1502         return clonedBI;
1503     }
1504 
1505     //
1506     //  Clone the source BI into the caller-supplied buffer.
1507     //    TODO:  using an overloaded operator new to directly initialize the
1508     //           copy in the user's buffer would be better, but it doesn't seem
1509     //           to get along with namespaces.  Investigate why.
1510     //
1511     //           The memcpy is only safe with an empty (default constructed)
1512     //           break iterator.  Use on others can screw up reference counts
1513     //           to data.  memcpy-ing objects is not really a good idea...
1514     //
1515     RuleBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
1516     RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1517     uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1518     clone->init();                // Init RuleBasedBreakIterator part, (user default constructor)
1519     *clone = *this;               // clone = the real BI we want.
1520     clone->fBufferClone = TRUE;   // Flag to prevent deleting storage on close (From C code)
1521 
1522     return clone;
1523 }
1524 
1525 
1526 //-------------------------------------------------------------------------------
1527 //
1528 //  isDictionaryChar      Return true if the category lookup for this char
1529 //                        indicates that it is in the set of dictionary lookup
1530 //                        chars.
1531 //
1532 //                        This function is intended for use by dictionary based
1533 //                        break iterators.
1534 //
1535 //-------------------------------------------------------------------------------
1536 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
1537     if (fData == NULL) {
1538         return FALSE;
1539     }
1540     uint16_t category;
1541     UTRIE_GET16(&fData->fTrie, c, category);
1542     return (category & 0x4000) != 0;
1543 }*/
1544 
1545 
1546 //-------------------------------------------------------------------------------
1547 //
1548 //  checkDictionary       This function handles all processing of characters in
1549 //                        the "dictionary" set. It will determine the appropriate
1550 //                        course of action, and possibly set up a cache in the
1551 //                        process.
1552 //
1553 //-------------------------------------------------------------------------------
checkDictionary(int32_t startPos,int32_t endPos,UBool reverse)1554 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1555                             int32_t endPos,
1556                             UBool reverse) {
1557     // Reset the old break cache first.
1558     uint32_t dictionaryCount = fDictionaryCharCount;
1559     reset();
1560 
1561     if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1562         return (reverse ? startPos : endPos);
1563     }
1564 
1565     // Bug 5532.  The dictionary code will crash if the input text is UTF-8
1566     //      because native indexes are different from UTF-16 indexes.
1567     //      Temporary hack: skip dictionary lookup for UTF-8 encoded text.
1568     //      It wont give the right breaks, but it's better than a crash.
1569     //
1570     //      Check the type of the UText by checking its pFuncs field, which
1571     //      is UText's function dispatch table.  It will be the same for all
1572     //      UTF-8 UTexts and different for any other UText type.
1573     //
1574     //      We have no other type of UText available with non-UTF-16 native indexing.
1575     //      This whole check will go away once the dictionary code is fixed.
1576     static const void *utext_utf8Funcs;
1577     if (utext_utf8Funcs == NULL) {
1578         // Cache the UTF-8 UText function pointer value.
1579         UErrorCode status = U_ZERO_ERROR;
1580         UText tempUText = UTEXT_INITIALIZER;
1581         utext_openUTF8(&tempUText, NULL, 0, &status);
1582         utext_utf8Funcs = tempUText.pFuncs;
1583         utext_close(&tempUText);
1584     }
1585     if (fText->pFuncs == utext_utf8Funcs) {
1586         return (reverse ? startPos : endPos);
1587     }
1588 
1589     // Starting from the starting point, scan towards the proposed result,
1590     // looking for the first dictionary character (which may be the one
1591     // we're on, if we're starting in the middle of a range).
1592     utext_setNativeIndex(fText, reverse ? endPos : startPos);
1593     if (reverse) {
1594         UTEXT_PREVIOUS32(fText);
1595     }
1596 
1597     int32_t rangeStart = startPos;
1598     int32_t rangeEnd = endPos;
1599 
1600     uint16_t    category;
1601     int32_t     current;
1602     UErrorCode  status = U_ZERO_ERROR;
1603     UStack      breaks(status);
1604     int32_t     foundBreakCount = 0;
1605     UChar32     c = utext_current32(fText);
1606 
1607     UTRIE_GET16(&fData->fTrie, c, category);
1608 
1609     // Is the character we're starting on a dictionary character? If so, we
1610     // need to back up to include the entire run; otherwise the results of
1611     // the break algorithm will differ depending on where we start. Since
1612     // the result is cached and there is typically a non-dictionary break
1613     // within a small number of words, there should be little performance impact.
1614     if (category & 0x4000) {
1615         if (reverse) {
1616             do {
1617                 utext_next32(fText);          // TODO:  recast to work directly with postincrement.
1618                 c = utext_current32(fText);
1619                 UTRIE_GET16(&fData->fTrie, c, category);
1620             } while (c != U_SENTINEL && (category & 0x4000));
1621             // Back up to the last dictionary character
1622             rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1623             if (c == U_SENTINEL) {
1624                 // c = fText->last32();
1625                 //   TODO:  why was this if needed?
1626                 c = UTEXT_PREVIOUS32(fText);
1627             }
1628             else {
1629                 c = UTEXT_PREVIOUS32(fText);
1630             }
1631         }
1632         else {
1633             do {
1634                 c = UTEXT_PREVIOUS32(fText);
1635                 UTRIE_GET16(&fData->fTrie, c, category);
1636             }
1637             while (c != U_SENTINEL && (category & 0x4000));
1638             // Back up to the last dictionary character
1639             if (c == U_SENTINEL) {
1640                 // c = fText->first32();
1641                 c = utext_current32(fText);
1642             }
1643             else {
1644                 utext_next32(fText);
1645                 c = utext_current32(fText);
1646             }
1647             rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1648         }
1649         UTRIE_GET16(&fData->fTrie, c, category);
1650     }
1651 
1652     // Loop through the text, looking for ranges of dictionary characters.
1653     // For each span, find the appropriate break engine, and ask it to find
1654     // any breaks within the span.
1655     // Note: we always do this in the forward direction, so that the break
1656     // cache is built in the right order.
1657     if (reverse) {
1658         utext_setNativeIndex(fText, rangeStart);
1659         c = utext_current32(fText);
1660         UTRIE_GET16(&fData->fTrie, c, category);
1661     }
1662     while(U_SUCCESS(status)) {
1663         while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1664             utext_next32(fText);           // TODO:  tweak for post-increment operation
1665             c = utext_current32(fText);
1666             UTRIE_GET16(&fData->fTrie, c, category);
1667         }
1668         if (current >= rangeEnd) {
1669             break;
1670         }
1671 
1672         // We now have a dictionary character. Get the appropriate language object
1673         // to deal with it.
1674         const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1675 
1676         // Ask the language object if there are any breaks. It will leave the text
1677         // pointer on the other side of its range, ready to search for the next one.
1678         if (lbe != NULL) {
1679             foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1680         }
1681 
1682         // Reload the loop variables for the next go-round
1683         c = utext_current32(fText);
1684         UTRIE_GET16(&fData->fTrie, c, category);
1685     }
1686 
1687     // If we found breaks, build a new break cache. The first and last entries must
1688     // be the original starting and ending position.
1689     if (foundBreakCount > 0) {
1690         int32_t totalBreaks = foundBreakCount;
1691         if (startPos < breaks.elementAti(0)) {
1692             totalBreaks += 1;
1693         }
1694         if (endPos > breaks.peeki()) {
1695             totalBreaks += 1;
1696         }
1697         fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1698         if (fCachedBreakPositions != NULL) {
1699             int32_t out = 0;
1700             fNumCachedBreakPositions = totalBreaks;
1701             if (startPos < breaks.elementAti(0)) {
1702                 fCachedBreakPositions[out++] = startPos;
1703             }
1704             for (int32_t i = 0; i < foundBreakCount; ++i) {
1705                 fCachedBreakPositions[out++] = breaks.elementAti(i);
1706             }
1707             if (endPos > fCachedBreakPositions[out-1]) {
1708                 fCachedBreakPositions[out] = endPos;
1709             }
1710             // If there are breaks, then by definition, we are replacing the original
1711             // proposed break by one of the breaks we found. Use following() and
1712             // preceding() to do the work. They should never recurse in this case.
1713             if (reverse) {
1714                 return preceding(endPos - 1);
1715             }
1716             else {
1717                 return following(startPos);
1718             }
1719         }
1720         // If the allocation failed, just fall through to the "no breaks found" case.
1721     }
1722 
1723     // If we get here, there were no language-based breaks. Set the text pointer
1724     // to the original proposed break.
1725     utext_setNativeIndex(fText, reverse ? startPos : endPos);
1726     return (reverse ? startPos : endPos);
1727 }
1728 
1729 U_NAMESPACE_END
1730 
1731 // defined in ucln_cmn.h
1732 
1733 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1734 
1735 /**
1736  * Release all static memory held by breakiterator.
1737  */
1738 U_CDECL_BEGIN
breakiterator_cleanup_dict(void)1739 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1740     if (gLanguageBreakFactories) {
1741         delete gLanguageBreakFactories;
1742         gLanguageBreakFactories = NULL;
1743     }
1744     return TRUE;
1745 }
1746 U_CDECL_END
1747 
1748 U_CDECL_BEGIN
_deleteFactory(void * obj)1749 static void U_CALLCONV _deleteFactory(void *obj) {
1750     delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
1751 }
1752 U_CDECL_END
1753 U_NAMESPACE_BEGIN
1754 
1755 static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c,int32_t breakType)1756 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1757 {
1758     UBool       needsInit;
1759     UErrorCode  status = U_ZERO_ERROR;
1760     UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1761 
1762     if (needsInit) {
1763         UStack  *factories = new UStack(_deleteFactory, NULL, status);
1764         if (factories != NULL && U_SUCCESS(status)) {
1765             ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1766             factories->push(builtIn, status);
1767 #ifdef U_LOCAL_SERVICE_HOOK
1768             LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1769             if (extra != NULL) {
1770                 factories->push(extra, status);
1771             }
1772 #endif
1773         }
1774         umtx_lock(NULL);
1775         if (gLanguageBreakFactories == NULL) {
1776             gLanguageBreakFactories = factories;
1777             factories = NULL;
1778             ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1779         }
1780         umtx_unlock(NULL);
1781         delete factories;
1782     }
1783 
1784     if (gLanguageBreakFactories == NULL) {
1785         return NULL;
1786     }
1787 
1788     int32_t i = gLanguageBreakFactories->size();
1789     const LanguageBreakEngine *lbe = NULL;
1790     while (--i >= 0) {
1791         LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1792         lbe = factory->getEngineFor(c, breakType);
1793         if (lbe != NULL) {
1794             break;
1795         }
1796     }
1797     return lbe;
1798 }
1799 
1800 
1801 //-------------------------------------------------------------------------------
1802 //
1803 //  getLanguageBreakEngine  Find an appropriate LanguageBreakEngine for the
1804 //                          the characer c.
1805 //
1806 //-------------------------------------------------------------------------------
1807 const LanguageBreakEngine *
getLanguageBreakEngine(UChar32 c)1808 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1809     const LanguageBreakEngine *lbe = NULL;
1810     UErrorCode status = U_ZERO_ERROR;
1811 
1812     if (fLanguageBreakEngines == NULL) {
1813         fLanguageBreakEngines = new UStack(status);
1814         if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1815             delete fLanguageBreakEngines;
1816             fLanguageBreakEngines = 0;
1817             return NULL;
1818         }
1819     }
1820 
1821     int32_t i = fLanguageBreakEngines->size();
1822     while (--i >= 0) {
1823         lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1824         if (lbe->handles(c, fBreakType)) {
1825             return lbe;
1826         }
1827     }
1828 
1829     // No existing dictionary took the character. See if a factory wants to
1830     // give us a new LanguageBreakEngine for this character.
1831     lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1832 
1833     // If we got one, use it and push it on our stack.
1834     if (lbe != NULL) {
1835         fLanguageBreakEngines->push((void *)lbe, status);
1836         // Even if we can't remember it, we can keep looking it up, so
1837         // return it even if the push fails.
1838         return lbe;
1839     }
1840 
1841     // No engine is forthcoming for this character. Add it to the
1842     // reject set. Create the reject break engine if needed.
1843     if (fUnhandledBreakEngine == NULL) {
1844         fUnhandledBreakEngine = new UnhandledEngine(status);
1845         if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1846             status = U_MEMORY_ALLOCATION_ERROR;
1847         }
1848         // Put it last so that scripts for which we have an engine get tried
1849         // first.
1850         fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1851         // If we can't insert it, or creation failed, get rid of it
1852         if (U_FAILURE(status)) {
1853             delete fUnhandledBreakEngine;
1854             fUnhandledBreakEngine = 0;
1855             return NULL;
1856         }
1857     }
1858 
1859     // Tell the reject engine about the character; at its discretion, it may
1860     // add more than just the one character.
1861     fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1862 
1863     return fUnhandledBreakEngine;
1864 }
1865 
1866 
1867 
1868 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1869     return fBreakType;
1870 }*/
1871 
setBreakType(int32_t type)1872 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1873     fBreakType = type;
1874     reset();
1875 }
1876 
1877 U_NAMESPACE_END
1878 
1879 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1880