1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1997-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 *
9 * File brkiter.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 02/18/97 aliu Converted from OpenClass. Added DONE.
15 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
16 *****************************************************************************************
17 */
18
19 // *****************************************************************************
20 // This file was generated from the java source file BreakIterator.java
21 // *****************************************************************************
22
23 #include "unicode/utypes.h"
24
25 #if !UCONFIG_NO_BREAK_ITERATION
26
27 #include "unicode/rbbi.h"
28 #include "unicode/brkiter.h"
29 #include "unicode/udata.h"
30 #include "unicode/uloc.h"
31 #include "unicode/ures.h"
32 #include "unicode/ustring.h"
33 #include "unicode/filteredbrk.h"
34 #include "bytesinkutil.h"
35 #include "ucln_cmn.h"
36 #include "cstring.h"
37 #include "umutex.h"
38 #include "servloc.h"
39 #include "locbased.h"
40 #include "uresimp.h"
41 #include "uassert.h"
42 #include "ubrkimpl.h"
43 #include "utracimp.h"
44 #include "charstr.h"
45
46 // *****************************************************************************
47 // class BreakIterator
48 // This class implements methods for finding the location of boundaries in text.
49 // Instances of BreakIterator maintain a current position and scan over text
50 // returning the index of characters where boundaries occur.
51 // *****************************************************************************
52
53 U_NAMESPACE_BEGIN
54
55 // -------------------------------------
56
57 BreakIterator*
buildInstance(const Locale & loc,const char * type,UErrorCode & status)58 BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
59 {
60 char fnbuff[256];
61 char ext[4]={'\0'};
62 CharString actualLocale;
63 int32_t size;
64 const char16_t* brkfname = nullptr;
65 UResourceBundle brkRulesStack;
66 UResourceBundle brkNameStack;
67 UResourceBundle *brkRules = &brkRulesStack;
68 UResourceBundle *brkName = &brkNameStack;
69 RuleBasedBreakIterator *result = nullptr;
70
71 if (U_FAILURE(status))
72 return nullptr;
73
74 ures_initStackObject(brkRules);
75 ures_initStackObject(brkName);
76
77 // Get the locale
78 UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);
79
80 // Get the "boundaries" array.
81 if (U_SUCCESS(status)) {
82 brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
83 // Get the string object naming the rules file
84 brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
85 // Get the actual string
86 brkfname = ures_getString(brkName, &size, &status);
87 U_ASSERT((size_t)size<sizeof(fnbuff));
88 if ((size_t)size>=sizeof(fnbuff)) {
89 size=0;
90 if (U_SUCCESS(status)) {
91 status = U_BUFFER_OVERFLOW_ERROR;
92 }
93 }
94
95 // Use the string if we found it
96 if (U_SUCCESS(status) && brkfname) {
97 actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);
98
99 char16_t* extStart=u_strchr(brkfname, 0x002e);
100 int len = 0;
101 if (extStart != nullptr){
102 len = (int)(extStart-brkfname);
103 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
104 u_UCharsToChars(brkfname, fnbuff, len);
105 }
106 fnbuff[len]=0; // nul terminate
107 }
108 }
109
110 ures_close(brkRules);
111 ures_close(brkName);
112
113 UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
114 if (U_FAILURE(status)) {
115 ures_close(b);
116 return nullptr;
117 }
118
119 // Create a RuleBasedBreakIterator
120 result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
121
122 // If there is a result, set the valid locale and actual locale, and the kind
123 if (U_SUCCESS(status) && result != nullptr) {
124 U_LOCALE_BASED(locBased, *(BreakIterator*)result);
125
126 locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
127 actualLocale.data());
128 uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
129 result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
130 }
131
132 ures_close(b);
133
134 if (U_FAILURE(status) && result != nullptr) { // Sometimes redundant check, but simple
135 delete result;
136 return nullptr;
137 }
138
139 if (result == nullptr) {
140 udata_close(file);
141 if (U_SUCCESS(status)) {
142 status = U_MEMORY_ALLOCATION_ERROR;
143 }
144 }
145
146 return result;
147 }
148
149 // Creates a break iterator for word breaks.
150 BreakIterator* U_EXPORT2
createWordInstance(const Locale & key,UErrorCode & status)151 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
152 {
153 return createInstance(key, UBRK_WORD, status);
154 }
155
156 // -------------------------------------
157
158 // Creates a break iterator for line breaks.
159 BreakIterator* U_EXPORT2
createLineInstance(const Locale & key,UErrorCode & status)160 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
161 {
162 return createInstance(key, UBRK_LINE, status);
163 }
164
165 // -------------------------------------
166
167 // Creates a break iterator for character breaks.
168 BreakIterator* U_EXPORT2
createCharacterInstance(const Locale & key,UErrorCode & status)169 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
170 {
171 return createInstance(key, UBRK_CHARACTER, status);
172 }
173
174 // -------------------------------------
175
176 // Creates a break iterator for sentence breaks.
177 BreakIterator* U_EXPORT2
createSentenceInstance(const Locale & key,UErrorCode & status)178 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
179 {
180 return createInstance(key, UBRK_SENTENCE, status);
181 }
182
183 // -------------------------------------
184
185 // Creates a break iterator for title casing breaks.
186 BreakIterator* U_EXPORT2
createTitleInstance(const Locale & key,UErrorCode & status)187 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
188 {
189 return createInstance(key, UBRK_TITLE, status);
190 }
191
192 // -------------------------------------
193
194 // Gets all the available locales that has localized text boundary data.
195 const Locale* U_EXPORT2
getAvailableLocales(int32_t & count)196 BreakIterator::getAvailableLocales(int32_t& count)
197 {
198 return Locale::getAvailableLocales(count);
199 }
200
201 // ------------------------------------------
202 //
203 // Constructors, destructor and assignment operator
204 //
205 //-------------------------------------------
206
BreakIterator()207 BreakIterator::BreakIterator()
208 {
209 *validLocale = *actualLocale = *requestLocale = 0;
210 }
211
// Copy constructor: duplicates the three locale ID buffers of `other`.
BreakIterator::BreakIterator(const BreakIterator &other)
    : UObject(other)
{
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
}
217
// Copy assignment: takes over the locale ID buffers of `other`.
// Self-assignment leaves the object untouched.
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    return *this;
}
226
~BreakIterator()227 BreakIterator::~BreakIterator()
228 {
229 }
230
231 // ------------------------------------------
232 //
233 // Registration
234 //
235 //-------------------------------------------
236 #if !UCONFIG_NO_SERVICE
237
238 // -------------------------------------
239
240 class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
241 public:
242 virtual ~ICUBreakIteratorFactory();
243 protected:
handleCreate(const Locale & loc,int32_t kind,const ICUService *,UErrorCode & status) const244 virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override {
245 return BreakIterator::makeInstance(loc, kind, status);
246 }
247 };
248
~ICUBreakIteratorFactory()249 ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {}
250
251 // -------------------------------------
252
253 class ICUBreakIteratorService : public ICULocaleService {
254 public:
ICUBreakIteratorService()255 ICUBreakIteratorService()
256 : ICULocaleService(UNICODE_STRING("Break Iterator", 14))
257 {
258 UErrorCode status = U_ZERO_ERROR;
259 registerFactory(new ICUBreakIteratorFactory(), status);
260 }
261
262 virtual ~ICUBreakIteratorService();
263
cloneInstance(UObject * instance) const264 virtual UObject* cloneInstance(UObject* instance) const override {
265 return ((BreakIterator*)instance)->clone();
266 }
267
handleDefault(const ICUServiceKey & key,UnicodeString *,UErrorCode & status) const268 virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override {
269 LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key));
270 int32_t kind = lkey.kind();
271 Locale loc;
272 lkey.currentLocale(loc);
273 return BreakIterator::makeInstance(loc, kind, status);
274 }
275
isDefault() const276 virtual UBool isDefault() const override {
277 return countFactories() == 1;
278 }
279 };
280
~ICUBreakIteratorService()281 ICUBreakIteratorService::~ICUBreakIteratorService() {}
282
283 // -------------------------------------
284
285 // defined in ucln_cmn.h
286 U_NAMESPACE_END
287
288 static icu::UInitOnce gInitOnceBrkiter {};
289 static icu::ICULocaleService* gService = nullptr;
290
291
292
293 /**
294 * Release all static memory held by breakiterator.
295 */
296 U_CDECL_BEGIN
breakiterator_cleanup()297 static UBool U_CALLCONV breakiterator_cleanup() {
298 #if !UCONFIG_NO_SERVICE
299 if (gService) {
300 delete gService;
301 gService = nullptr;
302 }
303 gInitOnceBrkiter.reset();
304 #endif
305 return true;
306 }
307 U_CDECL_END
308 U_NAMESPACE_BEGIN
309
310 static void U_CALLCONV
initService()311 initService() {
312 gService = new ICUBreakIteratorService();
313 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
314 }
315
316 static ICULocaleService*
getService()317 getService()
318 {
319 umtx_initOnce(gInitOnceBrkiter, &initService);
320 return gService;
321 }
322
323
324 // -------------------------------------
325
326 static inline UBool
hasService()327 hasService()
328 {
329 return !gInitOnceBrkiter.isReset() && getService() != nullptr;
330 }
331
332 // -------------------------------------
333
334 URegistryKey U_EXPORT2
registerInstance(BreakIterator * toAdopt,const Locale & locale,UBreakIteratorType kind,UErrorCode & status)335 BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
336 {
337 ICULocaleService *service = getService();
338 if (service == nullptr) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return nullptr;
341 }
342 return service->registerInstance(toAdopt, locale, kind, status);
343 }
344
345 // -------------------------------------
346
347 UBool U_EXPORT2
unregister(URegistryKey key,UErrorCode & status)348 BreakIterator::unregister(URegistryKey key, UErrorCode& status)
349 {
350 if (U_SUCCESS(status)) {
351 if (hasService()) {
352 return gService->unregister(key, status);
353 }
354 status = U_MEMORY_ALLOCATION_ERROR;
355 }
356 return false;
357 }
358
359 // -------------------------------------
360
361 StringEnumeration* U_EXPORT2
getAvailableLocales()362 BreakIterator::getAvailableLocales()
363 {
364 ICULocaleService *service = getService();
365 if (service == nullptr) {
366 return nullptr;
367 }
368 return service->getAvailableLocales();
369 }
370 #endif /* UCONFIG_NO_SERVICE */
371
372 // -------------------------------------
373
374 BreakIterator*
createInstance(const Locale & loc,int32_t kind,UErrorCode & status)375 BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
376 {
377 if (U_FAILURE(status)) {
378 return nullptr;
379 }
380
381 #if !UCONFIG_NO_SERVICE
382 if (hasService()) {
383 Locale actualLoc("");
384 BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
385 // TODO: The way the service code works in ICU 2.8 is that if
386 // there is a real registered break iterator, the actualLoc
387 // will be populated, but if the handleDefault path is taken
388 // (because nothing is registered that can handle the
389 // requested locale) then the actualLoc comes back empty. In
390 // that case, the returned object already has its actual/valid
391 // locale data populated (by makeInstance, which is what
392 // handleDefault calls), so we don't touch it. YES, A COMMENT
393 // THIS LONG is a sign of bad code -- so the action item is to
394 // revisit this in ICU 3.0 and clean it up/fix it/remove it.
395 if (U_SUCCESS(status) && (result != nullptr) && *actualLoc.getName() != 0) {
396 U_LOCALE_BASED(locBased, *result);
397 locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
398 }
399 return result;
400 }
401 else
402 #endif
403 {
404 return makeInstance(loc, kind, status);
405 }
406 }
407
408 // -------------------------------------
409 enum { kKeyValueLenMax = 32 };
410
411 BreakIterator*
makeInstance(const Locale & loc,int32_t kind,UErrorCode & status)412 BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
413 {
414
415 if (U_FAILURE(status)) {
416 return nullptr;
417 }
418
419 BreakIterator *result = nullptr;
420 switch (kind) {
421 case UBRK_CHARACTER:
422 {
423 UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
424 result = BreakIterator::buildInstance(loc, "grapheme", status);
425 UTRACE_EXIT_STATUS(status);
426 }
427 break;
428 case UBRK_WORD:
429 {
430 UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
431 result = BreakIterator::buildInstance(loc, "word", status);
432 UTRACE_EXIT_STATUS(status);
433 }
434 break;
435 case UBRK_LINE:
436 {
437 char lb_lw[kKeyValueLenMax];
438 UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
439 uprv_strcpy(lb_lw, "line");
440 UErrorCode kvStatus = U_ZERO_ERROR;
441 CharString value;
442 CharStringByteSink valueSink(&value);
443 loc.getKeywordValue("lb", valueSink, kvStatus);
444 if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
445 uprv_strcat(lb_lw, "_");
446 uprv_strcat(lb_lw, value.data());
447 }
448 // lw=phrase is only supported in Japanese and Korean
449 if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
450 value.clear();
451 loc.getKeywordValue("lw", valueSink, kvStatus);
452 if (U_SUCCESS(kvStatus) && value == "phrase") {
453 uprv_strcat(lb_lw, "_");
454 uprv_strcat(lb_lw, value.data());
455 }
456 }
457 result = BreakIterator::buildInstance(loc, lb_lw, status);
458
459 UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
460 UTRACE_EXIT_STATUS(status);
461 }
462 break;
463 case UBRK_SENTENCE:
464 {
465 UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
466 result = BreakIterator::buildInstance(loc, "sentence", status);
467 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
468 char ssKeyValue[kKeyValueLenMax] = {0};
469 UErrorCode kvStatus = U_ZERO_ERROR;
470 int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
471 if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
472 FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
473 if (U_SUCCESS(kvStatus)) {
474 result = fbiBuilder->build(result, status);
475 delete fbiBuilder;
476 }
477 }
478 #endif
479 UTRACE_EXIT_STATUS(status);
480 }
481 break;
482 case UBRK_TITLE:
483 {
484 UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
485 result = BreakIterator::buildInstance(loc, "title", status);
486 UTRACE_EXIT_STATUS(status);
487 }
488 break;
489 default:
490 status = U_ILLEGAL_ARGUMENT_ERROR;
491 }
492
493 if (U_FAILURE(status)) {
494 return nullptr;
495 }
496
497 return result;
498 }
499
500 Locale
getLocale(ULocDataLocaleType type,UErrorCode & status) const501 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
502 if (type == ULOC_REQUESTED_LOCALE) {
503 return Locale(requestLocale);
504 }
505 U_LOCALE_BASED(locBased, *this);
506 return locBased.getLocale(type, status);
507 }
508
509 const char *
getLocaleID(ULocDataLocaleType type,UErrorCode & status) const510 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
511 if (type == ULOC_REQUESTED_LOCALE) {
512 return requestLocale;
513 }
514 U_LOCALE_BASED(locBased, *this);
515 return locBased.getLocaleID(type, status);
516 }
517
518
519 // This implementation of getRuleStatus is a do-nothing stub, here to
520 // provide a default implementation for any derived BreakIterator classes that
521 // do not implement it themselves.
getRuleStatus() const522 int32_t BreakIterator::getRuleStatus() const {
523 return 0;
524 }
525
526 // This implementation of getRuleStatusVec is a do-nothing stub, here to
527 // provide a default implementation for any derived BreakIterator classes that
528 // do not implement it themselves.
getRuleStatusVec(int32_t * fillInVec,int32_t capacity,UErrorCode & status)529 int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
530 if (U_FAILURE(status)) {
531 return 0;
532 }
533 if (capacity < 1) {
534 status = U_BUFFER_OVERFLOW_ERROR;
535 return 1;
536 }
537 *fillInVec = 0;
538 return 1;
539 }
540
BreakIterator(const Locale & valid,const Locale & actual)541 BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
542 U_LOCALE_BASED(locBased, (*this));
543 locBased.setLocaleIDs(valid, actual);
544 }
545
546 U_NAMESPACE_END
547
548 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
549
550 //eof
551