1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2014-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9 
10 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
12 
13 #include "cmemory.h"
14 
15 #include "unicode/filteredbrk.h"
16 #include "unicode/ucharstriebuilder.h"
17 #include "unicode/ures.h"
18 
19 #include "uresimp.h" // ures_getByKeyWithFallback
20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
21 #include "uvector.h"
22 #include "cmemory.h"
23 #include "umutex.h"
24 
25 U_NAMESPACE_BEGIN
26 
27 #ifndef FB_DEBUG
28 #define FB_DEBUG 0
29 #endif
30 
31 #if FB_DEBUG
32 #include <stdio.h>
_fb_trace(const char * m,const UnicodeString * s,UBool b,int32_t d,const char * f,int l)33 static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
34   char buf[2048];
35   if(s) {
36     s->extract(0,s->length(),buf,2048);
37   } else {
38     strcpy(buf,"NULL");
39   }
40   fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
41           f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
42 }
43 
44 #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
45 #else
46 #define FB_TRACE(m,s,b,d)
47 #endif
48 
49 /**
50  * Used with sortedInsert()
51  */
compareUnicodeString(UElement t1,UElement t2)52 static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
53     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
54     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
55     return a.compare(b);
56 }
57 
58 /**
59  * A UVector which implements a set of strings.
60  */
61 class U_COMMON_API UStringSet : public UVector {
62  public:
UStringSet(UErrorCode & status)63   UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
64                                            uhash_compareUnicodeString,
65                                            1,
66                                            status) {}
67   virtual ~UStringSet();
68   /**
69    * Is this UnicodeSet contained?
70    */
contains(const UnicodeString & s)71   inline UBool contains(const UnicodeString& s) {
72     return contains((void*) &s);
73   }
74   using UVector::contains;
75   /**
76    * Return the ith UnicodeString alias
77    */
getStringAt(int32_t i) const78   inline const UnicodeString* getStringAt(int32_t i) const {
79     return (const UnicodeString*)elementAt(i);
80   }
81   /**
82    * Adopt the UnicodeString if not already contained.
83    * Caller no longer owns the pointer in any case.
84    * @return true if adopted successfully, false otherwise (error, or else duplicate)
85    */
adopt(UnicodeString * str,UErrorCode & status)86   inline UBool adopt(UnicodeString *str, UErrorCode &status) {
87     if(U_FAILURE(status) || contains(*str)) {
88       delete str;
89       return false;
90     } else {
91       sortedInsert(str, compareUnicodeString, status);
92       if(U_FAILURE(status)) {
93         return false;
94       }
95       return true;
96     }
97   }
98   /**
99    * Add by value.
100    * @return true if successfully adopted.
101    */
add(const UnicodeString & str,UErrorCode & status)102   inline UBool add(const UnicodeString& str, UErrorCode &status) {
103     if(U_FAILURE(status)) return false;
104     UnicodeString *t = new UnicodeString(str);
105     if(t==NULL) {
106       status = U_MEMORY_ALLOCATION_ERROR; return false;
107     }
108     return adopt(t, status);
109   }
110   /**
111    * Remove this string.
112    * @return true if successfully removed, false otherwise (error, or else it wasn't there)
113    */
remove(const UnicodeString & s,UErrorCode & status)114   inline UBool remove(const UnicodeString &s, UErrorCode &status) {
115     if(U_FAILURE(status)) return false;
116     return removeElement((void*) &s);
117   }
118 };
119 
120 /**
121  * Virtual, won't be inlined
122  */
~UStringSet()123 UStringSet::~UStringSet() {}
124 
125 /* ----------------------------------------------------------- */
126 
127 
128 /* Filtered Break constants */
129 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
130 static const int32_t kMATCH   = (1<<1); //< exact match - skip this one.
131 static const int32_t kSuppressInReverse = (1<<0);
132 static const int32_t kAddToForward = (1<<1);
133 static const UChar   kFULLSTOP = 0x002E; // '.'
134 
135 /**
136  * Shared data for SimpleFilteredSentenceBreakIterator
137  */
138 class SimpleFilteredSentenceBreakData : public UMemory {
139 public:
SimpleFilteredSentenceBreakData(UCharsTrie * forwards,UCharsTrie * backwards)140   SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
141       : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
incr()142     SimpleFilteredSentenceBreakData *incr() {
143         umtx_atomic_inc(&refcount);
144         return this;
145     }
decr()146     SimpleFilteredSentenceBreakData *decr() {
147         if(umtx_atomic_dec(&refcount) <= 0) {
148             delete this;
149         }
150         return 0;
151     }
152     virtual ~SimpleFilteredSentenceBreakData();
153 
hasForwardsPartialTrie() const154     bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
hasBackwardsTrie() const155     bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
156 
getForwardsPartialTrie() const157     const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
getBackwardsTrie() const158     const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
159 
160 private:
161     // These tries own their data arrays.
162     // They are shared and must therefore not be modified.
163     LocalPointer<UCharsTrie>    fForwardsPartialTrie; //  Has ".a" for "a.M."
164     LocalPointer<UCharsTrie>    fBackwardsTrie; //  i.e. ".srM" for Mrs.
165     u_atomic_int32_t            refcount;
166 };
167 
~SimpleFilteredSentenceBreakData()168 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
169 
170 /**
171  * Concrete implementation
172  */
173 class SimpleFilteredSentenceBreakIterator : public BreakIterator {
174 public:
175   SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
176   SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
177   virtual ~SimpleFilteredSentenceBreakIterator();
178 private:
179   SimpleFilteredSentenceBreakData *fData;
180   LocalPointer<BreakIterator> fDelegate;
181   LocalUTextPointer           fText;
182 
183   /* -- subclass interface -- */
184 public:
185   /* -- cloning and other subclass stuff -- */
createBufferClone(void *,int32_t &,UErrorCode & status)186   virtual BreakIterator *  createBufferClone(void * /*stackBuffer*/,
187                                              int32_t &/*BufferSize*/,
188                                              UErrorCode &status) override {
189     // for now - always deep clone
190     status = U_SAFECLONE_ALLOCATED_WARNING;
191     return clone();
192   }
clone() const193   virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); }
getDynamicClassID(void) const194   virtual UClassID getDynamicClassID(void) const override { return NULL; }
operator ==(const BreakIterator & o) const195   virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; }
196 
197   /* -- text modifying -- */
setText(UText * text,UErrorCode & status)198   virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); }
refreshInputText(UText * input,UErrorCode & status)199   virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; }
adoptText(CharacterIterator * it)200   virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); }
setText(const UnicodeString & text)201   virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); }
202 
203   /* -- other functions that are just delegated -- */
getUText(UText * fillIn,UErrorCode & status) const204   virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); }
getText(void) const205   virtual CharacterIterator& getText(void) const override { return fDelegate->getText(); }
206 
207   /* -- ITERATION -- */
208   virtual int32_t first(void) override;
209   virtual int32_t preceding(int32_t offset) override;
210   virtual int32_t previous(void) override;
211   virtual UBool isBoundary(int32_t offset) override;
current(void) const212   virtual int32_t current(void) const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
213 
214   virtual int32_t next(void) override;
215 
216   virtual int32_t next(int32_t n) override;
217   virtual int32_t following(int32_t offset) override;
218   virtual int32_t last(void) override;
219 
220 private:
221     /**
222      * Given that the fDelegate has already given its "initial" answer,
223      * find the NEXT actual (non-excepted) break.
224      * @param n initial position from delegate
225      * @return new break position or UBRK_DONE
226      */
227     int32_t internalNext(int32_t n);
228     /**
229      * Given that the fDelegate has already given its "initial" answer,
230      * find the PREV actual (non-excepted) break.
231      * @param n initial position from delegate
232      * @return new break position or UBRK_DONE
233      */
234     int32_t internalPrev(int32_t n);
235     /**
236      * set up the UText with the value of the fDelegate.
237      * Call this before calling breakExceptionAt.
238      * May be able to avoid excess calls
239      */
240     void resetState(UErrorCode &status);
241     /**
242      * Is there a match  (exception) at this spot?
243      */
244     enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
245     /**
246      * Determine if there is an exception at this spot
247      * @param n spot to check
248      * @return kNoExceptionHere or kExceptionHere
249      **/
250     enum EFBMatchResult breakExceptionAt(int32_t n);
251 };
252 
/**
 * Copy constructor: shares the reference-counted trie data with `other`
 * and clones its delegate. fText is left empty.
 *
 * other.fData can be null when the allocating constructor failed; in that
 * case the null is copied instead of being dereferenced.
 */
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
  : BreakIterator(other),
    fData(other.fData != nullptr ? other.fData->incr() : nullptr),
    fDelegate(other.fDelegate->clone())
{
}
257 
258 
SimpleFilteredSentenceBreakIterator(BreakIterator * adopt,UCharsTrie * forwards,UCharsTrie * backwards,UErrorCode & status)259 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
260   BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
261   fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
262   fDelegate(adopt)
263 {
264     if (fData == nullptr) {
265         delete forwards;
266         delete backwards;
267         if (U_SUCCESS(status)) {
268             status = U_MEMORY_ALLOCATION_ERROR;
269         }
270     }
271 }
272 
~SimpleFilteredSentenceBreakIterator()273 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
274     fData = fData->decr();
275 }
276 
resetState(UErrorCode & status)277 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
278   fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
279 }
280 
281 SimpleFilteredSentenceBreakIterator::EFBMatchResult
breakExceptionAt(int32_t n)282 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
283     int64_t bestPosn = -1;
284     int32_t bestValue = -1;
285     // loops while 'n' points to an exception.
286     utext_setNativeIndex(fText.getAlias(), n); // from n..
287 
288     //if(debug2) u_printf(" n@ %d\n", n);
289     // Assume a space is following the '.'  (so we handle the case:  "Mr. /Brown")
290     if(utext_previous32(fText.getAlias())==u' ') {  // TODO: skip a class of chars here??
291       // TODO only do this the 1st time?
292       //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
293     } else {
294       //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
295       utext_next32(fText.getAlias());
296       //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
297     }
298 
299     {
300         // Do not modify the shared trie!
301         UCharsTrie iter(fData->getBackwardsTrie());
302         UChar32 uch;
303         while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) {  // more to consume backwards
304             UStringTrieResult r = iter.nextForCodePoint(uch);
305             if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
306                 bestPosn = utext_getNativeIndex(fText.getAlias());
307                 bestValue = iter.getValue();
308             }
309             if(!USTRINGTRIE_HAS_NEXT(r)) {
310                 break;
311             }
312             //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
313         }
314     }
315 
316     //if(bestValue >= 0) {
317         //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
318     //}
319 
320     if(bestPosn>=0) {
321       //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
322 
323       //if(USTRINGTRIE_MATCHES(r)) {  // matched - so, now what?
324       //int32_t bestValue = iter.getValue();
325       ////if(debug2) u_printf("rev< /%C/ matched, skip..%d  bestValue=%d\n", (UChar)uch, r, bestValue);
326 
327       if(bestValue == kMATCH) { // exact match!
328         //if(debug2) u_printf(" exact backward match\n");
329         return kExceptionHere; // See if the next is another exception.
330       } else if(bestValue == kPARTIAL
331                 && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
332         //if(debug2) u_printf(" partial backward match\n");
333         // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
334         // to see if it matches something going forward.
335         UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
336         utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
337         //if(debug2) u_printf("Retrying at %d\n", bestPosn);
338         // Do not modify the shared trie!
339         UCharsTrie iter(fData->getForwardsPartialTrie());
340         UChar32 uch;
341         while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
342               USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
343           //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
344         }
345         if(USTRINGTRIE_MATCHES(rfwd)) {
346           //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
347           // only full matches here, nothing to check
348           // skip the next:
349             return kExceptionHere;
350         } else {
351           //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
352           // no match (no exception) -return the 'underlying' break
353           return kNoExceptionHere;
354         }
355       } else {
356         return kNoExceptionHere; // internal error and/or no forwards trie
357       }
358     } else {
359       //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r);  // no best match
360       return kNoExceptionHere; // No match - so exit. Not an exception.
361     }
362 }
363 
364 // the workhorse single next.
365 int32_t
internalNext(int32_t n)366 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
367   if(n == UBRK_DONE || // at end  or
368     !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
369       return n;
370   }
371   // OK, do we need to break here?
372   UErrorCode status = U_ZERO_ERROR;
373   // refresh text
374   resetState(status);
375   if(U_FAILURE(status)) return UBRK_DONE; // bail out
376   int64_t utextLen = utext_nativeLength(fText.getAlias());
377 
378   //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
379   while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
380     SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
381 
382     switch(m) {
383     case kExceptionHere:
384       n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
385       continue;
386 
387     default:
388     case kNoExceptionHere:
389       return n;
390     }
391   }
392   return n;
393 }
394 
395 int32_t
internalPrev(int32_t n)396 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
397   if(n == 0 || n == UBRK_DONE || // at end  or
398     !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
399       return n;
400   }
401   // OK, do we need to break here?
402   UErrorCode status = U_ZERO_ERROR;
403   // refresh text
404   resetState(status);
405   if(U_FAILURE(status)) return UBRK_DONE; // bail out
406 
407   //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
408   while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
409     SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
410 
411     switch(m) {
412     case kExceptionHere:
413       n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
414       continue;
415 
416     default:
417     case kNoExceptionHere:
418       return n;
419     }
420   }
421   return n;
422 }
423 
424 
425 int32_t
next()426 SimpleFilteredSentenceBreakIterator::next() {
427   return internalNext(fDelegate->next());
428 }
429 
430 int32_t
first(void)431 SimpleFilteredSentenceBreakIterator::first(void) {
432   // Don't suppress a break opportunity at the beginning of text.
433   return fDelegate->first();
434 }
435 
436 int32_t
preceding(int32_t offset)437 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
438   return internalPrev(fDelegate->preceding(offset));
439 }
440 
441 int32_t
previous(void)442 SimpleFilteredSentenceBreakIterator::previous(void) {
443   return internalPrev(fDelegate->previous());
444 }
445 
isBoundary(int32_t offset)446 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
447   if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
448 
449   if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
450 
451   UErrorCode status = U_ZERO_ERROR;
452   resetState(status);
453 
454   SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
455 
456   switch(m) {
457   case kExceptionHere:
458     return false;
459   default:
460   case kNoExceptionHere:
461     return true;
462   }
463 }
464 
465 int32_t
next(int32_t offset)466 SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
467   return internalNext(fDelegate->next(offset));
468 }
469 
470 int32_t
following(int32_t offset)471 SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
472   return internalNext(fDelegate->following(offset));
473 }
474 
475 int32_t
last(void)476 SimpleFilteredSentenceBreakIterator::last(void) {
477   // Don't suppress a break opportunity at the end of text.
478   return fDelegate->last();
479 }
480 
481 
482 /**
483  * Concrete implementation of builder class.
484  */
485 class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
486 public:
487   virtual ~SimpleFilteredBreakIteratorBuilder();
488   SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
489   SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
490   virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
491   virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override;
492   virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override;
493 private:
494   UStringSet fSet;
495 };
496 
~SimpleFilteredBreakIteratorBuilder()497 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
498 {
499 }
500 
SimpleFilteredBreakIteratorBuilder(UErrorCode & status)501 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
502   : fSet(status)
503 {
504 }
505 
SimpleFilteredBreakIteratorBuilder(const Locale & fromLocale,UErrorCode & status)506 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
507   : fSet(status)
508 {
509   if(U_SUCCESS(status)) {
510     UErrorCode subStatus = U_ZERO_ERROR;
511     LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
512     if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
513       status = subStatus; // copy the failing status
514 #if FB_DEBUG
515       fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
516 #endif
517       return;  // leaves the builder empty, if you try to use it.
518     }
519     LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus));
520     if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
521       status = subStatus; // copy the failing status
522 #if FB_DEBUG
523       fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
524 #endif
525       return;  // leaves the builder empty, if you try to use it.
526     }
527     LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus));
528 
529 #if FB_DEBUG
530     {
531       UErrorCode subsub = subStatus;
532       fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
533     }
534 #endif
535 
536     if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
537       status = subStatus; // copy the failing status
538 #if FB_DEBUG
539       fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
540 #endif
541       return;  // leaves the builder empty, if you try to use it.
542     }
543 
544     LocalUResourceBundlePointer strs;
545     subStatus = status; // Pick up inherited warning status now
546     do {
547       strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
548       if(strs.isValid() && U_SUCCESS(subStatus)) {
549         UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
550         suppressBreakAfter(str, status); // load the string
551       }
552     } while (strs.isValid() && U_SUCCESS(subStatus));
553     if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
554       status = subStatus;
555     }
556   }
557 }
558 
559 UBool
suppressBreakAfter(const UnicodeString & exception,UErrorCode & status)560 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
561 {
562   UBool r = fSet.add(exception, status);
563   FB_TRACE("suppressBreakAfter",&exception,r,0);
564   return r;
565 }
566 
567 UBool
unsuppressBreakAfter(const UnicodeString & exception,UErrorCode & status)568 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
569 {
570   UBool r = fSet.remove(exception, status);
571   FB_TRACE("unsuppressBreakAfter",&exception,r,0);
572   return r;
573 }
574 
575 /**
576  * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
577  * Work around this.
578  *
579  * Note: "new UnicodeString[subCount]" ends up calling global operator new
580  * on MSVC2012 for some reason.
581  */
newUnicodeStringArray(size_t count)582 static inline UnicodeString* newUnicodeStringArray(size_t count) {
583     return new UnicodeString[count ? count : 1];
584 }
585 
586 BreakIterator *
build(BreakIterator * adoptBreakIterator,UErrorCode & status)587 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
588   LocalPointer<BreakIterator> adopt(adoptBreakIterator);
589 
590   LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
591   LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
592   if(U_FAILURE(status)) {
593     return NULL;
594   }
595 
596   int32_t revCount = 0;
597   int32_t fwdCount = 0;
598 
599   int32_t subCount = fSet.size();
600 
601   UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
602 
603   LocalArray<UnicodeString> ustrs(ustrs_ptr);
604 
605   LocalMemory<int> partials;
606   partials.allocateInsteadAndReset(subCount);
607 
608   LocalPointer<UCharsTrie>    backwardsTrie; //  i.e. ".srM" for Mrs.
609   LocalPointer<UCharsTrie>    forwardsPartialTrie; //  Has ".a" for "a.M."
610 
611   int n=0;
612   for ( int32_t i = 0;
613         i<fSet.size();
614         i++) {
615     const UnicodeString *abbr = fSet.getStringAt(i);
616     if(abbr) {
617       FB_TRACE("build",abbr,TRUE,i);
618       ustrs[n] = *abbr; // copy by value
619       FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
620     } else {
621       FB_TRACE("build",abbr,FALSE,i);
622       status = U_MEMORY_ALLOCATION_ERROR;
623       return NULL;
624     }
625     partials[n] = 0; // default: not partial
626     n++;
627   }
628   // first pass - find partials.
629   for(int i=0;i<subCount;i++) {
630     int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
631     if(nn>-1 && (nn+1)!=ustrs[i].length()) {
632       FB_TRACE("partial",&ustrs[i],FALSE,i);
633       // is partial.
634       // is it unique?
635       int sameAs = -1;
636       for(int j=0;j<subCount;j++) {
637         if(j==i) continue;
638         if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
639           FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
640           //UBool otherIsPartial = ((nn+1)!=ustrs[j].length());  // true if ustrs[j] doesn't end at nn
641           if(partials[j]==0) { // hasn't been processed yet
642             partials[j] = kSuppressInReverse | kAddToForward;
643             FB_TRACE("suppressing",&ustrs[j],FALSE,j);
644           } else if(partials[j] & kSuppressInReverse) {
645             sameAs = j; // the other entry is already in the reverse table.
646           }
647         }
648       }
649       FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
650       FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
651       UnicodeString prefix(ustrs[i], 0, nn+1);
652       if(sameAs == -1 && partials[i] == 0) {
653         // first one - add the prefix to the reverse table.
654         prefix.reverse();
655         builder->add(prefix, kPARTIAL, status);
656         revCount++;
657         FB_TRACE("Added partial",&prefix,FALSE, i);
658         FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
659         partials[i] = kSuppressInReverse | kAddToForward;
660       } else {
661         FB_TRACE("NOT adding partial",&prefix,FALSE, i);
662         FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
663       }
664     }
665   }
666   for(int i=0;i<subCount;i++) {
667     if(partials[i]==0) {
668       ustrs[i].reverse();
669       builder->add(ustrs[i], kMATCH, status);
670       revCount++;
671       FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
672     } else {
673       FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
674 
675       // an optimization would be to only add the portion after the '.'
676       // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
677       // instead of "Ph.D." since we already know the "Ph." part is a match.
678       // would need the trie to be able to hold 0-length strings, though.
679       builder2->add(ustrs[i], kMATCH, status); // forward
680       fwdCount++;
681       //ustrs[i].reverse();
682       ////if(debug2) u_printf("SUPPRESS- not Added(%d):  /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
683     }
684   }
685   FB_TRACE("AbbrCount",NULL,FALSE, subCount);
686 
687   if(revCount>0) {
688     backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
689     if(U_FAILURE(status)) {
690       FB_TRACE(u_errorName(status),NULL,FALSE, -1);
691       return NULL;
692     }
693   }
694 
695   if(fwdCount>0) {
696     forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
697     if(U_FAILURE(status)) {
698       FB_TRACE(u_errorName(status),NULL,FALSE, -1);
699       return NULL;
700     }
701   }
702 
703   return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
704 }
705 
706 
707 // ----------- Base class implementation
708 
FilteredBreakIteratorBuilder()709 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
710 }
711 
~FilteredBreakIteratorBuilder()712 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
713 }
714 
715 FilteredBreakIteratorBuilder *
createInstance(const Locale & where,UErrorCode & status)716 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
717   if(U_FAILURE(status)) return NULL;
718   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
719   return (U_SUCCESS(status))? ret.orphan(): NULL;
720 }
721 
722 FilteredBreakIteratorBuilder *
createInstance(UErrorCode & status)723 FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) {
724   return createEmptyInstance(status);
725 }
726 
727 FilteredBreakIteratorBuilder *
createEmptyInstance(UErrorCode & status)728 FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) {
729   if(U_FAILURE(status)) return NULL;
730   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
731   return (U_SUCCESS(status))? ret.orphan(): NULL;
732 }
733 
734 U_NAMESPACE_END
735 
736 #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
737