1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1997-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 *
9 * File brkiter.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 02/18/97 aliu Converted from OpenClass. Added DONE.
15 * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
16 *****************************************************************************************
17 */
18
19 // *****************************************************************************
20 // This file was generated from the java source file BreakIterator.java
21 // *****************************************************************************
22
23 #include "unicode/utypes.h"
24
25 #if !UCONFIG_NO_BREAK_ITERATION
26
27 #include "unicode/rbbi.h"
28 #include "unicode/brkiter.h"
29 #include "unicode/udata.h"
30 #include "unicode/uloc.h"
31 #include "unicode/ures.h"
32 #include "unicode/ustring.h"
33 #include "unicode/filteredbrk.h"
34 #include "bytesinkutil.h"
35 #include "ucln_cmn.h"
36 #include "cstring.h"
37 #include "umutex.h"
38 #include "servloc.h"
39 #include "locbased.h"
40 #include "uresimp.h"
41 #include "uassert.h"
42 #include "ubrkimpl.h"
43 #include "utracimp.h"
44 #include "charstr.h"
45
46 // *****************************************************************************
47 // class BreakIterator
48 // This class implements methods for finding the location of boundaries in text.
49 // Instances of BreakIterator maintain a current position and scan over text
50 // returning the index of characters where boundaries occur.
51 // *****************************************************************************
52
53 U_NAMESPACE_BEGIN
54
55 // -------------------------------------
56
57 BreakIterator*
buildInstance(const Locale & loc,const char * type,UErrorCode & status)58 BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
59 {
60 char fnbuff[256];
61 char ext[4]={'\0'};
62 CharString actualLocale;
63 int32_t size;
64 const char16_t* brkfname = nullptr;
65 UResourceBundle brkRulesStack;
66 UResourceBundle brkNameStack;
67 UResourceBundle *brkRules = &brkRulesStack;
68 UResourceBundle *brkName = &brkNameStack;
69 RuleBasedBreakIterator *result = nullptr;
70
71 if (U_FAILURE(status))
72 return nullptr;
73
74 ures_initStackObject(brkRules);
75 ures_initStackObject(brkName);
76
77 // Get the locale
78 UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);
79
80 // Get the "boundaries" array.
81 if (U_SUCCESS(status)) {
82 brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
83 // Get the string object naming the rules file
84 brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
85 // Get the actual string
86 brkfname = ures_getString(brkName, &size, &status);
87 U_ASSERT((size_t)size<sizeof(fnbuff));
88 if (static_cast<size_t>(size) >= sizeof(fnbuff)) {
89 size=0;
90 if (U_SUCCESS(status)) {
91 status = U_BUFFER_OVERFLOW_ERROR;
92 }
93 }
94
95 // Use the string if we found it
96 if (U_SUCCESS(status) && brkfname) {
97 actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);
98
99 char16_t* extStart=u_strchr(brkfname, 0x002e);
100 int len = 0;
101 if (extStart != nullptr){
102 len = static_cast<int>(extStart - brkfname);
103 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
104 u_UCharsToChars(brkfname, fnbuff, len);
105 }
106 fnbuff[len]=0; // nul terminate
107 }
108 }
109
110 ures_close(brkRules);
111 ures_close(brkName);
112
113 UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
114 if (U_FAILURE(status)) {
115 ures_close(b);
116 return nullptr;
117 }
118
119 // Create a RuleBasedBreakIterator
120 result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
121
122 // If there is a result, set the valid locale and actual locale, and the kind
123 if (U_SUCCESS(status) && result != nullptr) {
124 U_LOCALE_BASED(locBased, *(BreakIterator*)result);
125
126 locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
127 actualLocale.data());
128 uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
129 result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
130 }
131
132 ures_close(b);
133
134 if (U_FAILURE(status) && result != nullptr) { // Sometimes redundant check, but simple
135 delete result;
136 return nullptr;
137 }
138
139 if (result == nullptr) {
140 udata_close(file);
141 if (U_SUCCESS(status)) {
142 status = U_MEMORY_ALLOCATION_ERROR;
143 }
144 }
145
146 return result;
147 }
148
149 // Creates a break iterator for word breaks.
150 BreakIterator* U_EXPORT2
createWordInstance(const Locale & key,UErrorCode & status)151 BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
152 {
153 return createInstance(key, UBRK_WORD, status);
154 }
155
156 // -------------------------------------
157
158 // Creates a break iterator for line breaks.
159 BreakIterator* U_EXPORT2
createLineInstance(const Locale & key,UErrorCode & status)160 BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
161 {
162 return createInstance(key, UBRK_LINE, status);
163 }
164
165 // -------------------------------------
166
167 // Creates a break iterator for character breaks.
168 BreakIterator* U_EXPORT2
createCharacterInstance(const Locale & key,UErrorCode & status)169 BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
170 {
171 return createInstance(key, UBRK_CHARACTER, status);
172 }
173
174 // -------------------------------------
175
176 // Creates a break iterator for sentence breaks.
177 BreakIterator* U_EXPORT2
createSentenceInstance(const Locale & key,UErrorCode & status)178 BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
179 {
180 return createInstance(key, UBRK_SENTENCE, status);
181 }
182
183 // -------------------------------------
184
185 // Creates a break iterator for title casing breaks.
186 BreakIterator* U_EXPORT2
createTitleInstance(const Locale & key,UErrorCode & status)187 BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
188 {
189 return createInstance(key, UBRK_TITLE, status);
190 }
191
192 // -------------------------------------
193
194 // Gets all the available locales that has localized text boundary data.
195 const Locale* U_EXPORT2
getAvailableLocales(int32_t & count)196 BreakIterator::getAvailableLocales(int32_t& count)
197 {
198 return Locale::getAvailableLocales(count);
199 }
200
201 // ------------------------------------------
202 //
203 // Constructors, destructor and assignment operator
204 //
205 //-------------------------------------------
206
BreakIterator()207 BreakIterator::BreakIterator()
208 {
209 *validLocale = *actualLocale = *requestLocale = 0;
210 }
211
// Copy constructor: copies the requested, valid and actual locale IDs of
// `other` into this object's buffers.
BreakIterator::BreakIterator(const BreakIterator &other)
    : UObject(other)
{
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
}
217
// Copy assignment: takes over the three locale IDs of `other`.
// Assigning an object to itself leaves it untouched.
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    return *this;
}
226
~BreakIterator()227 BreakIterator::~BreakIterator()
228 {
229 }
230
231 // ------------------------------------------
232 //
233 // Registration
234 //
235 //-------------------------------------------
236 #if !UCONFIG_NO_SERVICE
237
238 // -------------------------------------
239
240 class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
241 public:
242 virtual ~ICUBreakIteratorFactory();
243 protected:
handleCreate(const Locale & loc,int32_t kind,const ICUService *,UErrorCode & status) const244 virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override {
245 return BreakIterator::makeInstance(loc, kind, status);
246 }
247 };
248
~ICUBreakIteratorFactory()249 ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {}
250
251 // -------------------------------------
252
253 class ICUBreakIteratorService : public ICULocaleService {
254 public:
ICUBreakIteratorService()255 ICUBreakIteratorService()
256 : ICULocaleService(UNICODE_STRING("Break Iterator", 14))
257 {
258 UErrorCode status = U_ZERO_ERROR;
259 registerFactory(new ICUBreakIteratorFactory(), status);
260 }
261
262 virtual ~ICUBreakIteratorService();
263
cloneInstance(UObject * instance) const264 virtual UObject* cloneInstance(UObject* instance) const override {
265 return ((BreakIterator*)instance)->clone();
266 }
267
handleDefault(const ICUServiceKey & key,UnicodeString *,UErrorCode & status) const268 virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override {
269 LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key));
270 int32_t kind = lkey.kind();
271 Locale loc;
272 lkey.currentLocale(loc);
273 return BreakIterator::makeInstance(loc, kind, status);
274 }
275
isDefault() const276 virtual UBool isDefault() const override {
277 return countFactories() == 1;
278 }
279 };
280
~ICUBreakIteratorService()281 ICUBreakIteratorService::~ICUBreakIteratorService() {}
282
283 // -------------------------------------
284
285 // defined in ucln_cmn.h
286 U_NAMESPACE_END
287
288 static icu::UInitOnce gInitOnceBrkiter {};
289 static icu::ICULocaleService* gService = nullptr;
290
291
292
293 /**
294 * Release all static memory held by breakiterator.
295 */
296 U_CDECL_BEGIN
breakiterator_cleanup()297 static UBool U_CALLCONV breakiterator_cleanup() {
298 #if !UCONFIG_NO_SERVICE
299 if (gService) {
300 delete gService;
301 gService = nullptr;
302 }
303 gInitOnceBrkiter.reset();
304 #endif
305 return true;
306 }
307 U_CDECL_END
308 U_NAMESPACE_BEGIN
309
310 static void U_CALLCONV
initService()311 initService() {
312 gService = new ICUBreakIteratorService();
313 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
314 }
315
316 static ICULocaleService*
getService()317 getService()
318 {
319 umtx_initOnce(gInitOnceBrkiter, &initService);
320 return gService;
321 }
322
323
324 // -------------------------------------
325
326 static inline UBool
hasService()327 hasService()
328 {
329 return !gInitOnceBrkiter.isReset() && getService() != nullptr;
330 }
331
332 // -------------------------------------
333
334 URegistryKey U_EXPORT2
registerInstance(BreakIterator * toAdopt,const Locale & locale,UBreakIteratorType kind,UErrorCode & status)335 BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
336 {
337 ICULocaleService *service = getService();
338 if (service == nullptr) {
339 status = U_MEMORY_ALLOCATION_ERROR;
340 return nullptr;
341 }
342 return service->registerInstance(toAdopt, locale, kind, status);
343 }
344
345 // -------------------------------------
346
347 UBool U_EXPORT2
unregister(URegistryKey key,UErrorCode & status)348 BreakIterator::unregister(URegistryKey key, UErrorCode& status)
349 {
350 if (U_SUCCESS(status)) {
351 if (hasService()) {
352 return gService->unregister(key, status);
353 }
354 status = U_MEMORY_ALLOCATION_ERROR;
355 }
356 return false;
357 }
358
359 // -------------------------------------
360
361 StringEnumeration* U_EXPORT2
getAvailableLocales()362 BreakIterator::getAvailableLocales()
363 {
364 ICULocaleService *service = getService();
365 if (service == nullptr) {
366 return nullptr;
367 }
368 return service->getAvailableLocales();
369 }
370 #endif /* UCONFIG_NO_SERVICE */
371
372 // -------------------------------------
373
374 BreakIterator*
createInstance(const Locale & loc,int32_t kind,UErrorCode & status)375 BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
376 {
377 if (U_FAILURE(status)) {
378 return nullptr;
379 }
380
381 #if !UCONFIG_NO_SERVICE
382 if (hasService()) {
383 Locale actualLoc("");
384 BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
385 // TODO: The way the service code works in ICU 2.8 is that if
386 // there is a real registered break iterator, the actualLoc
387 // will be populated, but if the handleDefault path is taken
388 // (because nothing is registered that can handle the
389 // requested locale) then the actualLoc comes back empty. In
390 // that case, the returned object already has its actual/valid
391 // locale data populated (by makeInstance, which is what
392 // handleDefault calls), so we don't touch it. YES, A COMMENT
393 // THIS LONG is a sign of bad code -- so the action item is to
394 // revisit this in ICU 3.0 and clean it up/fix it/remove it.
395 if (U_SUCCESS(status) && (result != nullptr) && *actualLoc.getName() != 0) {
396 U_LOCALE_BASED(locBased, *result);
397 locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
398 }
399 return result;
400 }
401 else
402 #endif
403 {
404 return makeInstance(loc, kind, status);
405 }
406 }
407
408 // -------------------------------------
409 enum { kKeyValueLenMax = 32 };
410
411 BreakIterator*
makeInstance(const Locale & loc,int32_t kind,UErrorCode & status)412 BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
413 {
414
415 if (U_FAILURE(status)) {
416 return nullptr;
417 }
418
419 BreakIterator *result = nullptr;
420 switch (kind) {
421 case UBRK_CHARACTER:
422 {
423 UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
424 result = BreakIterator::buildInstance(loc, "grapheme", status);
425 UTRACE_EXIT_STATUS(status);
426 }
427 break;
428 case UBRK_WORD:
429 {
430 UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
431 result = BreakIterator::buildInstance(loc, "word", status);
432 UTRACE_EXIT_STATUS(status);
433 }
434 break;
435 case UBRK_LINE:
436 {
437 char lb_lw[kKeyValueLenMax];
438 UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
439 uprv_strcpy(lb_lw, "line");
440 UErrorCode kvStatus = U_ZERO_ERROR;
441 auto value = loc.getKeywordValue<CharString>("lb", kvStatus);
442 if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
443 uprv_strcat(lb_lw, "_");
444 uprv_strcat(lb_lw, value.data());
445 }
446 // lw=phrase is only supported in Japanese and Korean
447 if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
448 value = loc.getKeywordValue<CharString>("lw", kvStatus);
449 if (U_SUCCESS(kvStatus) && value == "phrase") {
450 uprv_strcat(lb_lw, "_");
451 uprv_strcat(lb_lw, value.data());
452 }
453 }
454 result = BreakIterator::buildInstance(loc, lb_lw, status);
455
456 UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
457 UTRACE_EXIT_STATUS(status);
458 }
459 break;
460 case UBRK_SENTENCE:
461 {
462 UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
463 result = BreakIterator::buildInstance(loc, "sentence", status);
464 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
465 char ssKeyValue[kKeyValueLenMax] = {0};
466 UErrorCode kvStatus = U_ZERO_ERROR;
467 int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
468 if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
469 FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
470 if (U_SUCCESS(kvStatus)) {
471 result = fbiBuilder->build(result, status);
472 delete fbiBuilder;
473 }
474 }
475 #endif
476 UTRACE_EXIT_STATUS(status);
477 }
478 break;
479 case UBRK_TITLE:
480 {
481 UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
482 result = BreakIterator::buildInstance(loc, "title", status);
483 UTRACE_EXIT_STATUS(status);
484 }
485 break;
486 default:
487 status = U_ILLEGAL_ARGUMENT_ERROR;
488 }
489
490 if (U_FAILURE(status)) {
491 return nullptr;
492 }
493
494 return result;
495 }
496
497 Locale
getLocale(ULocDataLocaleType type,UErrorCode & status) const498 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
499 if (type == ULOC_REQUESTED_LOCALE) {
500 return {requestLocale};
501 }
502 U_LOCALE_BASED(locBased, *this);
503 return locBased.getLocale(type, status);
504 }
505
506 const char *
getLocaleID(ULocDataLocaleType type,UErrorCode & status) const507 BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
508 if (type == ULOC_REQUESTED_LOCALE) {
509 return requestLocale;
510 }
511 U_LOCALE_BASED(locBased, *this);
512 return locBased.getLocaleID(type, status);
513 }
514
515
516 // This implementation of getRuleStatus is a do-nothing stub, here to
517 // provide a default implementation for any derived BreakIterator classes that
518 // do not implement it themselves.
getRuleStatus() const519 int32_t BreakIterator::getRuleStatus() const {
520 return 0;
521 }
522
523 // This implementation of getRuleStatusVec is a do-nothing stub, here to
524 // provide a default implementation for any derived BreakIterator classes that
525 // do not implement it themselves.
getRuleStatusVec(int32_t * fillInVec,int32_t capacity,UErrorCode & status)526 int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
527 if (U_FAILURE(status)) {
528 return 0;
529 }
530 if (capacity < 1) {
531 status = U_BUFFER_OVERFLOW_ERROR;
532 return 1;
533 }
534 *fillInVec = 0;
535 return 1;
536 }
537
BreakIterator(const Locale & valid,const Locale & actual)538 BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
539 U_LOCALE_BASED(locBased, (*this));
540 locBased.setLocaleIDs(valid, actual);
541 }
542
543 U_NAMESPACE_END
544
545 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
546
547 //eof
548