1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ************************************************************************************
5 * Copyright (C) 2006-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ************************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_BREAK_ITERATION
13
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24
25 #include "brkeng.h"
26 #include "cmemory.h"
27 #include "dictbe.h"
28 #include "charstr.h"
29 #include "dictionarydata.h"
30 #include "mutex.h"
31 #include "uvector.h"
32 #include "umutex.h"
33 #include "uresimp.h"
34 #include "ubrkimpl.h"
35
36 U_NAMESPACE_BEGIN
37
38 /*
39 ******************************************************************
40 */
41
LanguageBreakEngine()42 LanguageBreakEngine::LanguageBreakEngine() {
43 }
44
~LanguageBreakEngine()45 LanguageBreakEngine::~LanguageBreakEngine() {
46 }
47
48 /*
49 ******************************************************************
50 */
51
LanguageBreakFactory()52 LanguageBreakFactory::LanguageBreakFactory() {
53 }
54
~LanguageBreakFactory()55 LanguageBreakFactory::~LanguageBreakFactory() {
56 }
57
58 /*
59 ******************************************************************
60 */
61
UnhandledEngine(UErrorCode &)62 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
63 for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
64 fHandled[i] = 0;
65 }
66 }
67
~UnhandledEngine()68 UnhandledEngine::~UnhandledEngine() {
69 for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
70 if (fHandled[i] != 0) {
71 delete fHandled[i];
72 }
73 }
74 }
75
76 UBool
handles(UChar32 c,int32_t breakType) const77 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
78 return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
79 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
80 }
81
82 int32_t
findBreaks(UText * text,int32_t,int32_t endPos,int32_t breakType,UVector32 &) const83 UnhandledEngine::findBreaks( UText *text,
84 int32_t /* startPos */,
85 int32_t endPos,
86 int32_t breakType,
87 UVector32 &/*foundBreaks*/ ) const {
88 if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
89 UChar32 c = utext_current32(text);
90 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
91 utext_next32(text); // TODO: recast loop to work with post-increment operations.
92 c = utext_current32(text);
93 }
94 }
95 return 0;
96 }
97
98 void
handleCharacter(UChar32 c,int32_t breakType)99 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
100 if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
101 if (fHandled[breakType] == 0) {
102 fHandled[breakType] = new UnicodeSet();
103 if (fHandled[breakType] == 0) {
104 return;
105 }
106 }
107 if (!fHandled[breakType]->contains(c)) {
108 UErrorCode status = U_ZERO_ERROR;
109 // Apply the entire script of the character.
110 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
111 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
112 }
113 }
114 }
115
116 /*
117 ******************************************************************
118 */
119
ICULanguageBreakFactory(UErrorCode &)120 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
121 fEngines = 0;
122 }
123
~ICULanguageBreakFactory()124 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
125 if (fEngines != 0) {
126 delete fEngines;
127 }
128 }
129
130 U_NAMESPACE_END
131 U_CDECL_BEGIN
_deleteEngine(void * obj)132 static void U_CALLCONV _deleteEngine(void *obj) {
133 delete (const icu::LanguageBreakEngine *) obj;
134 }
135 U_CDECL_END
136 U_NAMESPACE_BEGIN
137
138 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
139
140 const LanguageBreakEngine *
getEngineFor(UChar32 c,int32_t breakType)141 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
142 const LanguageBreakEngine *lbe = NULL;
143 UErrorCode status = U_ZERO_ERROR;
144
145 Mutex m(&gBreakEngineMutex);
146
147 if (fEngines == NULL) {
148 UStack *engines = new UStack(_deleteEngine, NULL, status);
149 if (U_FAILURE(status) || engines == NULL) {
150 // Note: no way to return error code to caller.
151 delete engines;
152 return NULL;
153 }
154 fEngines = engines;
155 } else {
156 int32_t i = fEngines->size();
157 while (--i >= 0) {
158 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
159 if (lbe != NULL && lbe->handles(c, breakType)) {
160 return lbe;
161 }
162 }
163 }
164
165 // We didn't find an engine. Create one.
166 lbe = loadEngineFor(c, breakType);
167 if (lbe != NULL) {
168 fEngines->push((void *)lbe, status);
169 }
170 return lbe;
171 }
172
173 const LanguageBreakEngine *
loadEngineFor(UChar32 c,int32_t breakType)174 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
175 UErrorCode status = U_ZERO_ERROR;
176 UScriptCode code = uscript_getScript(c, &status);
177 if (U_SUCCESS(status)) {
178 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
179 if (m != NULL) {
180 const LanguageBreakEngine *engine = NULL;
181 switch(code) {
182 case USCRIPT_THAI:
183 engine = new ThaiBreakEngine(m, status);
184 break;
185 case USCRIPT_LAO:
186 engine = new LaoBreakEngine(m, status);
187 break;
188 case USCRIPT_MYANMAR:
189 engine = new BurmeseBreakEngine(m, status);
190 break;
191 case USCRIPT_KHMER:
192 engine = new KhmerBreakEngine(m, status);
193 break;
194
195 #if !UCONFIG_NO_NORMALIZATION
196 // CJK not available w/o normalization
197 case USCRIPT_HANGUL:
198 engine = new CjkBreakEngine(m, kKorean, status);
199 break;
200
201 // use same BreakEngine and dictionary for both Chinese and Japanese
202 case USCRIPT_HIRAGANA:
203 case USCRIPT_KATAKANA:
204 case USCRIPT_HAN:
205 engine = new CjkBreakEngine(m, kChineseJapanese, status);
206 break;
207 #if 0
208 // TODO: Have to get some characters with script=common handled
209 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
210 // them to CjkBreakEngine does not work. The engine has to
211 // special-case them.
212 case USCRIPT_COMMON:
213 {
214 UBlockCode block = ublock_getCode(code);
215 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
216 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
217 break;
218 }
219 #endif
220 #endif
221
222 default:
223 break;
224 }
225 if (engine == NULL) {
226 delete m;
227 }
228 else if (U_FAILURE(status)) {
229 delete engine;
230 engine = NULL;
231 }
232 return engine;
233 }
234 }
235 return NULL;
236 }
237
238 DictionaryMatcher *
loadDictionaryMatcherFor(UScriptCode script,int32_t)239 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
240 UErrorCode status = U_ZERO_ERROR;
241 // open root from brkitr tree.
242 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
243 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
244 int32_t dictnlength = 0;
245 const UChar *dictfname =
246 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
247 if (U_FAILURE(status)) {
248 ures_close(b);
249 return NULL;
250 }
251 CharString dictnbuf;
252 CharString ext;
253 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
254 if (extStart != NULL) {
255 int32_t len = (int32_t)(extStart - dictfname);
256 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
257 dictnlength = len;
258 }
259 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
260 ures_close(b);
261
262 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
263 if (U_SUCCESS(status)) {
264 // build trie
265 const uint8_t *data = (const uint8_t *)udata_getMemory(file);
266 const int32_t *indexes = (const int32_t *)data;
267 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
268 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
269 DictionaryMatcher *m = NULL;
270 if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
271 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
272 const char *characters = (const char *)(data + offset);
273 m = new BytesDictionaryMatcher(characters, transform, file);
274 }
275 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
276 const UChar *characters = (const UChar *)(data + offset);
277 m = new UCharsDictionaryMatcher(characters, file);
278 }
279 if (m == NULL) {
280 // no matcher exists to take ownership - either we are an invalid
281 // type or memory allocation failed
282 udata_close(file);
283 }
284 return m;
285 } else if (dictfname != NULL) {
286 // we don't have a dictionary matcher.
287 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
288 status = U_ZERO_ERROR;
289 return NULL;
290 }
291 return NULL;
292 }
293
294 U_NAMESPACE_END
295
296 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
297