• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ************************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  ************************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24 
25 #include "brkeng.h"
26 #include "cmemory.h"
27 #include "dictbe.h"
28 #include "charstr.h"
29 #include "dictionarydata.h"
30 #include "mutex.h"
31 #include "uvector.h"
32 #include "umutex.h"
33 #include "uresimp.h"
34 #include "ubrkimpl.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 /*
39  ******************************************************************
40  */
41 
LanguageBreakEngine()42 LanguageBreakEngine::LanguageBreakEngine() {
43 }
44 
~LanguageBreakEngine()45 LanguageBreakEngine::~LanguageBreakEngine() {
46 }
47 
48 /*
49  ******************************************************************
50  */
51 
LanguageBreakFactory()52 LanguageBreakFactory::LanguageBreakFactory() {
53 }
54 
~LanguageBreakFactory()55 LanguageBreakFactory::~LanguageBreakFactory() {
56 }
57 
58 /*
59  ******************************************************************
60  */
61 
UnhandledEngine(UErrorCode &)62 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
63     for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
64         fHandled[i] = 0;
65     }
66 }
67 
~UnhandledEngine()68 UnhandledEngine::~UnhandledEngine() {
69     for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
70         if (fHandled[i] != 0) {
71             delete fHandled[i];
72         }
73     }
74 }
75 
76 UBool
handles(UChar32 c,int32_t breakType) const77 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
78     return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
79         && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
80 }
81 
82 int32_t
findBreaks(UText * text,int32_t,int32_t endPos,int32_t breakType,UVector32 &) const83 UnhandledEngine::findBreaks( UText *text,
84                              int32_t /* startPos */,
85                              int32_t endPos,
86                              int32_t breakType,
87                              UVector32 &/*foundBreaks*/ ) const {
88     if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
89         UChar32 c = utext_current32(text);
90         while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
91             utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
92             c = utext_current32(text);
93         }
94     }
95     return 0;
96 }
97 
98 void
handleCharacter(UChar32 c,int32_t breakType)99 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
100     if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
101         if (fHandled[breakType] == 0) {
102             fHandled[breakType] = new UnicodeSet();
103             if (fHandled[breakType] == 0) {
104                 return;
105             }
106         }
107         if (!fHandled[breakType]->contains(c)) {
108             UErrorCode status = U_ZERO_ERROR;
109             // Apply the entire script of the character.
110             int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
111             fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
112         }
113     }
114 }
115 
116 /*
117  ******************************************************************
118  */
119 
ICULanguageBreakFactory(UErrorCode &)120 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
121     fEngines = 0;
122 }
123 
~ICULanguageBreakFactory()124 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
125     if (fEngines != 0) {
126         delete fEngines;
127     }
128 }
129 
130 U_NAMESPACE_END
131 U_CDECL_BEGIN
_deleteEngine(void * obj)132 static void U_CALLCONV _deleteEngine(void *obj) {
133     delete (const icu::LanguageBreakEngine *) obj;
134 }
135 U_CDECL_END
136 U_NAMESPACE_BEGIN
137 
138 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
139 
140 const LanguageBreakEngine *
getEngineFor(UChar32 c,int32_t breakType)141 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
142     const LanguageBreakEngine *lbe = NULL;
143     UErrorCode  status = U_ZERO_ERROR;
144 
145     Mutex m(&gBreakEngineMutex);
146 
147     if (fEngines == NULL) {
148         UStack  *engines = new UStack(_deleteEngine, NULL, status);
149         if (U_FAILURE(status) || engines == NULL) {
150             // Note: no way to return error code to caller.
151             delete engines;
152             return NULL;
153         }
154         fEngines = engines;
155     } else {
156         int32_t i = fEngines->size();
157         while (--i >= 0) {
158             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
159             if (lbe != NULL && lbe->handles(c, breakType)) {
160                 return lbe;
161             }
162         }
163     }
164 
165     // We didn't find an engine. Create one.
166     lbe = loadEngineFor(c, breakType);
167     if (lbe != NULL) {
168         fEngines->push((void *)lbe, status);
169     }
170     return lbe;
171 }
172 
173 const LanguageBreakEngine *
loadEngineFor(UChar32 c,int32_t breakType)174 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
175     UErrorCode status = U_ZERO_ERROR;
176     UScriptCode code = uscript_getScript(c, &status);
177     if (U_SUCCESS(status)) {
178         DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
179         if (m != NULL) {
180             const LanguageBreakEngine *engine = NULL;
181             switch(code) {
182             case USCRIPT_THAI:
183                 engine = new ThaiBreakEngine(m, status);
184                 break;
185             case USCRIPT_LAO:
186                 engine = new LaoBreakEngine(m, status);
187                 break;
188             case USCRIPT_MYANMAR:
189                 engine = new BurmeseBreakEngine(m, status);
190                 break;
191             case USCRIPT_KHMER:
192                 engine = new KhmerBreakEngine(m, status);
193                 break;
194 
195 #if !UCONFIG_NO_NORMALIZATION
196                 // CJK not available w/o normalization
197             case USCRIPT_HANGUL:
198                 engine = new CjkBreakEngine(m, kKorean, status);
199                 break;
200 
201             // use same BreakEngine and dictionary for both Chinese and Japanese
202             case USCRIPT_HIRAGANA:
203             case USCRIPT_KATAKANA:
204             case USCRIPT_HAN:
205                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
206                 break;
207 #if 0
208             // TODO: Have to get some characters with script=common handled
209             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
210             // them to CjkBreakEngine does not work. The engine has to
211             // special-case them.
212             case USCRIPT_COMMON:
213             {
214                 UBlockCode block = ublock_getCode(code);
215                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
216                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
217                 break;
218             }
219 #endif
220 #endif
221 
222             default:
223                 break;
224             }
225             if (engine == NULL) {
226                 delete m;
227             }
228             else if (U_FAILURE(status)) {
229                 delete engine;
230                 engine = NULL;
231             }
232             return engine;
233         }
234     }
235     return NULL;
236 }
237 
238 DictionaryMatcher *
loadDictionaryMatcherFor(UScriptCode script,int32_t)239 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
240     UErrorCode status = U_ZERO_ERROR;
241     // open root from brkitr tree.
242     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
243     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
244     int32_t dictnlength = 0;
245     const UChar *dictfname =
246         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
247     if (U_FAILURE(status)) {
248         ures_close(b);
249         return NULL;
250     }
251     CharString dictnbuf;
252     CharString ext;
253     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
254     if (extStart != NULL) {
255         int32_t len = (int32_t)(extStart - dictfname);
256         ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
257         dictnlength = len;
258     }
259     dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
260     ures_close(b);
261 
262     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
263     if (U_SUCCESS(status)) {
264         // build trie
265         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
266         const int32_t *indexes = (const int32_t *)data;
267         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
268         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
269         DictionaryMatcher *m = NULL;
270         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
271             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
272             const char *characters = (const char *)(data + offset);
273             m = new BytesDictionaryMatcher(characters, transform, file);
274         }
275         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
276             const UChar *characters = (const UChar *)(data + offset);
277             m = new UCharsDictionaryMatcher(characters, file);
278         }
279         if (m == NULL) {
280             // no matcher exists to take ownership - either we are an invalid
281             // type or memory allocation failed
282             udata_close(file);
283         }
284         return m;
285     } else if (dictfname != NULL) {
286         // we don't have a dictionary matcher.
287         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
288         status = U_ZERO_ERROR;
289         return NULL;
290     }
291     return NULL;
292 }
293 
294 U_NAMESPACE_END
295 
296 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
297