• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ************************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  ************************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/uchar.h"
15 #include "unicode/uniset.h"
16 #include "unicode/chariter.h"
17 #include "unicode/ures.h"
18 #include "unicode/udata.h"
19 #include "unicode/putil.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uscript.h"
22 #include "unicode/ucharstrie.h"
23 #include "unicode/bytestrie.h"
24 
25 #include "brkeng.h"
26 #include "cmemory.h"
27 #include "dictbe.h"
28 #include "lstmbe.h"
29 #include "charstr.h"
30 #include "dictionarydata.h"
31 #include "mutex.h"
32 #include "uvector.h"
33 #include "umutex.h"
34 #include "uresimp.h"
35 #include "ubrkimpl.h"
36 
37 U_NAMESPACE_BEGIN
38 
39 /*
40  ******************************************************************
41  */
42 
LanguageBreakEngine()43 LanguageBreakEngine::LanguageBreakEngine() {
44 }
45 
~LanguageBreakEngine()46 LanguageBreakEngine::~LanguageBreakEngine() {
47 }
48 
49 /*
50  ******************************************************************
51  */
52 
LanguageBreakFactory()53 LanguageBreakFactory::LanguageBreakFactory() {
54 }
55 
~LanguageBreakFactory()56 LanguageBreakFactory::~LanguageBreakFactory() {
57 }
58 
59 /*
60  ******************************************************************
61  */
62 
UnhandledEngine(UErrorCode & status)63 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
64     (void)status;
65 }
66 
~UnhandledEngine()67 UnhandledEngine::~UnhandledEngine() {
68     delete fHandled;
69     fHandled = nullptr;
70 }
71 
72 UBool
handles(UChar32 c) const73 UnhandledEngine::handles(UChar32 c) const {
74     return fHandled && fHandled->contains(c);
75 }
76 
77 int32_t
findBreaks(UText * text,int32_t,int32_t endPos,UVector32 &,UBool,UErrorCode & status) const78 UnhandledEngine::findBreaks( UText *text,
79                              int32_t /* startPos */,
80                              int32_t endPos,
81                              UVector32 &/*foundBreaks*/,
82                              UBool /* isPhraseBreaking */,
83                              UErrorCode &status) const {
84     if (U_FAILURE(status)) return 0;
85     UChar32 c = utext_current32(text);
86     while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
87         utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
88         c = utext_current32(text);
89     }
90     return 0;
91 }
92 
93 void
handleCharacter(UChar32 c)94 UnhandledEngine::handleCharacter(UChar32 c) {
95     if (fHandled == nullptr) {
96         fHandled = new UnicodeSet();
97         if (fHandled == nullptr) {
98             return;
99         }
100     }
101     if (!fHandled->contains(c)) {
102         UErrorCode status = U_ZERO_ERROR;
103         // Apply the entire script of the character.
104         int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
105         fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
106     }
107 }
108 
109 /*
110  ******************************************************************
111  */
112 
ICULanguageBreakFactory(UErrorCode &)113 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
114     fEngines = 0;
115 }
116 
~ICULanguageBreakFactory()117 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
118     if (fEngines != 0) {
119         delete fEngines;
120     }
121 }
122 
123 U_NAMESPACE_END
124 U_CDECL_BEGIN
_deleteEngine(void * obj)125 static void U_CALLCONV _deleteEngine(void *obj) {
126     delete (const icu::LanguageBreakEngine *) obj;
127 }
128 U_CDECL_END
129 U_NAMESPACE_BEGIN
130 
131 const LanguageBreakEngine *
getEngineFor(UChar32 c)132 ICULanguageBreakFactory::getEngineFor(UChar32 c) {
133     const LanguageBreakEngine *lbe = NULL;
134     UErrorCode  status = U_ZERO_ERROR;
135 
136     static UMutex gBreakEngineMutex;
137     Mutex m(&gBreakEngineMutex);
138 
139     if (fEngines == nullptr) {
140         LocalPointer<UStack>  engines(new UStack(_deleteEngine, nullptr, status), status);
141         if (U_FAILURE(status) ) {
142             // Note: no way to return error code to caller.
143             return nullptr;
144         }
145         fEngines = engines.orphan();
146     } else {
147         int32_t i = fEngines->size();
148         while (--i >= 0) {
149             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
150             if (lbe != NULL && lbe->handles(c)) {
151                 return lbe;
152             }
153         }
154     }
155 
156     // We didn't find an engine. Create one.
157     lbe = loadEngineFor(c);
158     if (lbe != nullptr) {
159         fEngines->push((void *)lbe, status);
160     }
161     return U_SUCCESS(status) ? lbe : nullptr;
162 }
163 
164 const LanguageBreakEngine *
loadEngineFor(UChar32 c)165 ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
166     UErrorCode status = U_ZERO_ERROR;
167     UScriptCode code = uscript_getScript(c, &status);
168     if (U_SUCCESS(status)) {
169         const LanguageBreakEngine *engine = nullptr;
170         // Try to use LSTM first
171         const LSTMData *data = CreateLSTMDataForScript(code, status);
172         if (U_SUCCESS(status)) {
173             if (data != nullptr) {
174                 engine = CreateLSTMBreakEngine(code, data, status);
175                 if (U_SUCCESS(status) && engine != nullptr) {
176                     return engine;
177                 }
178                 if (engine != nullptr) {
179                     delete engine;
180                     engine = nullptr;
181                 } else {
182                     DeleteLSTMData(data);
183                 }
184             }
185         }
186         status = U_ZERO_ERROR;  // fallback to dictionary based
187         DictionaryMatcher *m = loadDictionaryMatcherFor(code);
188         if (m != NULL) {
189             switch(code) {
190             case USCRIPT_THAI:
191                 engine = new ThaiBreakEngine(m, status);
192                 break;
193             case USCRIPT_LAO:
194                 engine = new LaoBreakEngine(m, status);
195                 break;
196             case USCRIPT_MYANMAR:
197                 engine = new BurmeseBreakEngine(m, status);
198                 break;
199             case USCRIPT_KHMER:
200                 engine = new KhmerBreakEngine(m, status);
201                 break;
202 
203 #if !UCONFIG_NO_NORMALIZATION
204                 // CJK not available w/o normalization
205             case USCRIPT_HANGUL:
206                 engine = new CjkBreakEngine(m, kKorean, status);
207                 break;
208 
209             // use same BreakEngine and dictionary for both Chinese and Japanese
210             case USCRIPT_HIRAGANA:
211             case USCRIPT_KATAKANA:
212             case USCRIPT_HAN:
213                 engine = new CjkBreakEngine(m, kChineseJapanese, status);
214                 break;
215 #if 0
216             // TODO: Have to get some characters with script=common handled
217             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
218             // them to CjkBreakEngine does not work. The engine has to
219             // special-case them.
220             case USCRIPT_COMMON:
221             {
222                 UBlockCode block = ublock_getCode(code);
223                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
224                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
225                 break;
226             }
227 #endif
228 #endif
229 
230             default:
231                 break;
232             }
233             if (engine == NULL) {
234                 delete m;
235             }
236             else if (U_FAILURE(status)) {
237                 delete engine;
238                 engine = NULL;
239             }
240             return engine;
241         }
242     }
243     return NULL;
244 }
245 
246 DictionaryMatcher *
loadDictionaryMatcherFor(UScriptCode script)247 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
248     UErrorCode status = U_ZERO_ERROR;
249     // open root from brkitr tree.
250     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
251     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
252     int32_t dictnlength = 0;
253     const UChar *dictfname =
254         ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
255     if (U_FAILURE(status)) {
256         ures_close(b);
257         return NULL;
258     }
259     CharString dictnbuf;
260     CharString ext;
261     const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength);  // last dot
262     if (extStart != NULL) {
263         int32_t len = (int32_t)(extStart - dictfname);
264         ext.appendInvariantChars(UnicodeString(false, extStart + 1, dictnlength - len - 1), status);
265         dictnlength = len;
266     }
267     dictnbuf.appendInvariantChars(UnicodeString(false, dictfname, dictnlength), status);
268     ures_close(b);
269 
270     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
271     if (U_SUCCESS(status)) {
272         // build trie
273         const uint8_t *data = (const uint8_t *)udata_getMemory(file);
274         const int32_t *indexes = (const int32_t *)data;
275         const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
276         const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
277         DictionaryMatcher *m = NULL;
278         if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
279             const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
280             const char *characters = (const char *)(data + offset);
281             m = new BytesDictionaryMatcher(characters, transform, file);
282         }
283         else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
284             const UChar *characters = (const UChar *)(data + offset);
285             m = new UCharsDictionaryMatcher(characters, file);
286         }
287         if (m == NULL) {
288             // no matcher exists to take ownership - either we are an invalid
289             // type or memory allocation failed
290             udata_close(file);
291         }
292         return m;
293     } else if (dictfname != NULL) {
294         // we don't have a dictionary matcher.
295         // returning NULL here will cause us to fail to find a dictionary break engine, as expected
296         status = U_ZERO_ERROR;
297         return NULL;
298     }
299     return NULL;
300 }
301 
302 U_NAMESPACE_END
303 
304 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
305