1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ************************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ************************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_BREAK_ITERATION
13
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24
25 #include "brkeng.h"
26 #include "cmemory.h"
27 #include "dictbe.h"
28 #include "charstr.h"
29 #include "dictionarydata.h"
30 #include "mutex.h"
31 #include "uvector.h"
32 #include "umutex.h"
33 #include "uresimp.h"
34 #include "ubrkimpl.h"
35
36 U_NAMESPACE_BEGIN
37
38 /*
39 ******************************************************************
40 */
41
LanguageBreakEngine()42 LanguageBreakEngine::LanguageBreakEngine() {
43 }
44
~LanguageBreakEngine()45 LanguageBreakEngine::~LanguageBreakEngine() {
46 }
47
48 /*
49 ******************************************************************
50 */
51
LanguageBreakFactory()52 LanguageBreakFactory::LanguageBreakFactory() {
53 }
54
~LanguageBreakFactory()55 LanguageBreakFactory::~LanguageBreakFactory() {
56 }
57
58 /*
59 ******************************************************************
60 */
61
UnhandledEngine(UErrorCode & status)62 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
63 (void)status;
64 }
65
~UnhandledEngine()66 UnhandledEngine::~UnhandledEngine() {
67 delete fHandled;
68 fHandled = nullptr;
69 }
70
71 UBool
handles(UChar32 c) const72 UnhandledEngine::handles(UChar32 c) const {
73 return fHandled && fHandled->contains(c);
74 }
75
76 int32_t
findBreaks(UText * text,int32_t,int32_t endPos,UVector32 &) const77 UnhandledEngine::findBreaks( UText *text,
78 int32_t /* startPos */,
79 int32_t endPos,
80 UVector32 &/*foundBreaks*/ ) const {
81 UChar32 c = utext_current32(text);
82 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
83 utext_next32(text); // TODO: recast loop to work with post-increment operations.
84 c = utext_current32(text);
85 }
86 return 0;
87 }
88
89 void
handleCharacter(UChar32 c)90 UnhandledEngine::handleCharacter(UChar32 c) {
91 if (fHandled == nullptr) {
92 fHandled = new UnicodeSet();
93 if (fHandled == nullptr) {
94 return;
95 }
96 }
97 if (!fHandled->contains(c)) {
98 UErrorCode status = U_ZERO_ERROR;
99 // Apply the entire script of the character.
100 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
101 fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
102 }
103 }
104
105 /*
106 ******************************************************************
107 */
108
ICULanguageBreakFactory(UErrorCode &)109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
110 fEngines = 0;
111 }
112
~ICULanguageBreakFactory()113 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
114 if (fEngines != 0) {
115 delete fEngines;
116 }
117 }
118
119 U_NAMESPACE_END
120 U_CDECL_BEGIN
_deleteEngine(void * obj)121 static void U_CALLCONV _deleteEngine(void *obj) {
122 delete (const icu::LanguageBreakEngine *) obj;
123 }
124 U_CDECL_END
125 U_NAMESPACE_BEGIN
126
// Mutex held by ICULanguageBreakFactory::getEngineFor() while it searches,
// creates, and appends to the fEngines cache.
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
128
129 const LanguageBreakEngine *
getEngineFor(UChar32 c)130 ICULanguageBreakFactory::getEngineFor(UChar32 c) {
131 const LanguageBreakEngine *lbe = NULL;
132 UErrorCode status = U_ZERO_ERROR;
133
134 Mutex m(&gBreakEngineMutex);
135
136 if (fEngines == NULL) {
137 UStack *engines = new UStack(_deleteEngine, NULL, status);
138 if (U_FAILURE(status) || engines == NULL) {
139 // Note: no way to return error code to caller.
140 delete engines;
141 return NULL;
142 }
143 fEngines = engines;
144 } else {
145 int32_t i = fEngines->size();
146 while (--i >= 0) {
147 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
148 if (lbe != NULL && lbe->handles(c)) {
149 return lbe;
150 }
151 }
152 }
153
154 // We didn't find an engine. Create one.
155 lbe = loadEngineFor(c);
156 if (lbe != NULL) {
157 fEngines->push((void *)lbe, status);
158 }
159 return lbe;
160 }
161
162 const LanguageBreakEngine *
loadEngineFor(UChar32 c)163 ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
164 UErrorCode status = U_ZERO_ERROR;
165 UScriptCode code = uscript_getScript(c, &status);
166 if (U_SUCCESS(status)) {
167 DictionaryMatcher *m = loadDictionaryMatcherFor(code);
168 if (m != NULL) {
169 const LanguageBreakEngine *engine = NULL;
170 switch(code) {
171 case USCRIPT_THAI:
172 engine = new ThaiBreakEngine(m, status);
173 break;
174 case USCRIPT_LAO:
175 engine = new LaoBreakEngine(m, status);
176 break;
177 case USCRIPT_MYANMAR:
178 engine = new BurmeseBreakEngine(m, status);
179 break;
180 case USCRIPT_KHMER:
181 engine = new KhmerBreakEngine(m, status);
182 break;
183
184 #if !UCONFIG_NO_NORMALIZATION
185 // CJK not available w/o normalization
186 case USCRIPT_HANGUL:
187 engine = new CjkBreakEngine(m, kKorean, status);
188 break;
189
190 // use same BreakEngine and dictionary for both Chinese and Japanese
191 case USCRIPT_HIRAGANA:
192 case USCRIPT_KATAKANA:
193 case USCRIPT_HAN:
194 engine = new CjkBreakEngine(m, kChineseJapanese, status);
195 break;
196 #if 0
197 // TODO: Have to get some characters with script=common handled
198 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
199 // them to CjkBreakEngine does not work. The engine has to
200 // special-case them.
201 case USCRIPT_COMMON:
202 {
203 UBlockCode block = ublock_getCode(code);
204 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
205 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
206 break;
207 }
208 #endif
209 #endif
210
211 default:
212 break;
213 }
214 if (engine == NULL) {
215 delete m;
216 }
217 else if (U_FAILURE(status)) {
218 delete engine;
219 engine = NULL;
220 }
221 return engine;
222 }
223 }
224 return NULL;
225 }
226
227 DictionaryMatcher *
loadDictionaryMatcherFor(UScriptCode script)228 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
229 UErrorCode status = U_ZERO_ERROR;
230 // open root from brkitr tree.
231 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
232 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
233 int32_t dictnlength = 0;
234 const UChar *dictfname =
235 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
236 if (U_FAILURE(status)) {
237 ures_close(b);
238 return NULL;
239 }
240 CharString dictnbuf;
241 CharString ext;
242 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
243 if (extStart != NULL) {
244 int32_t len = (int32_t)(extStart - dictfname);
245 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
246 dictnlength = len;
247 }
248 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
249 ures_close(b);
250
251 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
252 if (U_SUCCESS(status)) {
253 // build trie
254 const uint8_t *data = (const uint8_t *)udata_getMemory(file);
255 const int32_t *indexes = (const int32_t *)data;
256 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
257 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
258 DictionaryMatcher *m = NULL;
259 if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
260 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
261 const char *characters = (const char *)(data + offset);
262 m = new BytesDictionaryMatcher(characters, transform, file);
263 }
264 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
265 const UChar *characters = (const UChar *)(data + offset);
266 m = new UCharsDictionaryMatcher(characters, file);
267 }
268 if (m == NULL) {
269 // no matcher exists to take ownership - either we are an invalid
270 // type or memory allocation failed
271 udata_close(file);
272 }
273 return m;
274 } else if (dictfname != NULL) {
275 // we don't have a dictionary matcher.
276 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
277 status = U_ZERO_ERROR;
278 return NULL;
279 }
280 return NULL;
281 }
282
283 U_NAMESPACE_END
284
285 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
286