• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  ************************************************************************************
3  * Copyright (C) 2006-2009, International Business Machines Corporation and others. *
4  * All Rights Reserved.                                                             *
5  ************************************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_BREAK_ITERATION
11 
12 #include "brkeng.h"
13 #include "dictbe.h"
14 #include "triedict.h"
15 #include "unicode/uchar.h"
16 #include "unicode/uniset.h"
17 #include "unicode/chariter.h"
18 #include "unicode/ures.h"
19 #include "unicode/udata.h"
20 #include "unicode/putil.h"
21 #include "unicode/ustring.h"
22 #include "unicode/uscript.h"
23 #include "uvector.h"
24 #include "umutex.h"
25 #include "uresimp.h"
26 #include "ubrkimpl.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 /*
31  ******************************************************************
32  */
33 
LanguageBreakEngine()34 LanguageBreakEngine::LanguageBreakEngine() {
35 }
36 
~LanguageBreakEngine()37 LanguageBreakEngine::~LanguageBreakEngine() {
38 }
39 
40 /*
41  ******************************************************************
42  */
43 
LanguageBreakFactory()44 LanguageBreakFactory::LanguageBreakFactory() {
45 }
46 
~LanguageBreakFactory()47 LanguageBreakFactory::~LanguageBreakFactory() {
48 }
49 
50 /*
51  ******************************************************************
52  */
53 
UnhandledEngine(UErrorCode &)54 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
55     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
56         fHandled[i] = 0;
57     }
58 }
59 
~UnhandledEngine()60 UnhandledEngine::~UnhandledEngine() {
61     for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
62         if (fHandled[i] != 0) {
63             delete fHandled[i];
64         }
65     }
66 }
67 
68 UBool
handles(UChar32 c,int32_t breakType) const69 UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
70     return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
71         && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
72 }
73 
74 int32_t
findBreaks(UText * text,int32_t startPos,int32_t endPos,UBool reverse,int32_t breakType,UStack &) const75 UnhandledEngine::findBreaks( UText *text,
76                                  int32_t startPos,
77                                  int32_t endPos,
78                                  UBool reverse,
79                                  int32_t breakType,
80                                  UStack &/*foundBreaks*/ ) const {
81     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
82         UChar32 c = utext_current32(text);
83         if (reverse) {
84             while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
85                 c = utext_previous32(text);
86             }
87         }
88         else {
89             while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
90                 utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
91                 c = utext_current32(text);
92             }
93         }
94     }
95     return 0;
96 }
97 
98 void
handleCharacter(UChar32 c,int32_t breakType)99 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
100     if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
101         if (fHandled[breakType] == 0) {
102             fHandled[breakType] = new UnicodeSet();
103             if (fHandled[breakType] == 0) {
104                 return;
105             }
106         }
107         if (!fHandled[breakType]->contains(c)) {
108             UErrorCode status = U_ZERO_ERROR;
109             // Apply the entire script of the character.
110             int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
111             fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
112         }
113     }
114 }
115 
116 /*
117  ******************************************************************
118  */
119 
ICULanguageBreakFactory(UErrorCode &)120 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
121     fEngines = 0;
122 }
123 
~ICULanguageBreakFactory()124 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
125     if (fEngines != 0) {
126         delete fEngines;
127     }
128 }
129 
130 U_NAMESPACE_END
131 U_CDECL_BEGIN
_deleteEngine(void * obj)132 static void U_CALLCONV _deleteEngine(void *obj) {
133     delete (const U_NAMESPACE_QUALIFIER LanguageBreakEngine *) obj;
134 }
135 U_CDECL_END
136 U_NAMESPACE_BEGIN
137 
138 const LanguageBreakEngine *
getEngineFor(UChar32 c,int32_t breakType)139 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
140     UBool       needsInit;
141     int32_t     i;
142     const LanguageBreakEngine *lbe = NULL;
143     UErrorCode  status = U_ZERO_ERROR;
144 
145     // TODO: The global mutex should not be used.
146     // The global mutex should only be used for short periods.
147     // A ICULanguageBreakFactory specific mutex should be used.
148     umtx_lock(NULL);
149     needsInit = (UBool)(fEngines == NULL);
150     if (!needsInit) {
151         i = fEngines->size();
152         while (--i >= 0) {
153             lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
154             if (lbe != NULL && lbe->handles(c, breakType)) {
155                 break;
156             }
157             lbe = NULL;
158         }
159     }
160     umtx_unlock(NULL);
161 
162     if (lbe != NULL) {
163         return lbe;
164     }
165 
166     if (needsInit) {
167         UStack  *engines = new UStack(_deleteEngine, NULL, status);
168         if (U_SUCCESS(status) && engines == NULL) {
169             status = U_MEMORY_ALLOCATION_ERROR;
170         }
171         else if (U_FAILURE(status)) {
172             delete engines;
173             engines = NULL;
174         }
175         else {
176             umtx_lock(NULL);
177             if (fEngines == NULL) {
178                 fEngines = engines;
179                 engines = NULL;
180             }
181             umtx_unlock(NULL);
182             delete engines;
183         }
184     }
185 
186     if (fEngines == NULL) {
187         return NULL;
188     }
189 
190     // We didn't find an engine the first time through, or there was no
191     // stack. Create an engine.
192     const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
193 
194     // Now get the lock, and see if someone else has created it in the
195     // meantime
196     umtx_lock(NULL);
197     i = fEngines->size();
198     while (--i >= 0) {
199         lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
200         if (lbe != NULL && lbe->handles(c, breakType)) {
201             break;
202         }
203         lbe = NULL;
204     }
205     if (lbe == NULL && newlbe != NULL) {
206         fEngines->push((void *)newlbe, status);
207         lbe = newlbe;
208         newlbe = NULL;
209     }
210     umtx_unlock(NULL);
211 
212     delete newlbe;
213 
214     return lbe;
215 }
216 
217 const LanguageBreakEngine *
loadEngineFor(UChar32 c,int32_t breakType)218 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
219     UErrorCode status = U_ZERO_ERROR;
220     UScriptCode code = uscript_getScript(c, &status);
221     if (U_SUCCESS(status)) {
222         const CompactTrieDictionary *dict = loadDictionaryFor(code, breakType);
223         if (dict != NULL) {
224             const LanguageBreakEngine *engine = NULL;
225             switch(code) {
226             case USCRIPT_THAI:
227                 engine = new ThaiBreakEngine(dict, status);
228                 break;
229 
230             case USCRIPT_HANGUL:
231                 engine = new CjkBreakEngine(dict, kKorean, status);
232                 break;
233 
234             // use same BreakEngine and dictionary for both Chinese and Japanese
235             case USCRIPT_HIRAGANA:
236             case USCRIPT_KATAKANA:
237             case USCRIPT_HAN:
238                 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
239                 break;
240 #if 0
241             // TODO: Have to get some characters with script=common handled
242             // by CjkBreakEngine (e.g. U+309B). Simply subjecting
243             // them to CjkBreakEngine does not work. The engine has to
244             // special-case them.
245             case USCRIPT_COMMON:
246             {
247                 UBlockCode block = ublock_getCode(code);
248                 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
249                    engine = new CjkBreakEngine(dict, kChineseJapanese, status);
250                 break;
251             }
252 #endif
253             default:
254                 break;
255             }
256             if (engine == NULL) {
257                 delete dict;
258             }
259             else if (U_FAILURE(status)) {
260                 delete engine;
261                 engine = NULL;
262             }
263             return engine;
264         }
265     }
266     return NULL;
267 }
268 
269 const CompactTrieDictionary *
loadDictionaryFor(UScriptCode script,int32_t)270 ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) {
271     UErrorCode status = U_ZERO_ERROR;
272     // Open root from brkitr tree.
273     char dictnbuff[256];
274     char ext[4]={'\0'};
275 
276     UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
277     b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
278     b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status);
279     int32_t dictnlength = 0;
280     const UChar *dictfname = ures_getString(b, &dictnlength, &status);
281     if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
282         dictnlength = 0;
283         status = U_BUFFER_OVERFLOW_ERROR;
284     }
285     if (U_SUCCESS(status) && dictfname) {
286         UChar* extStart=u_strchr(dictfname, 0x002e);
287         int len = 0;
288         if(extStart!=NULL){
289             len = (int)(extStart-dictfname);
290             u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
291             u_UCharsToChars(dictfname, dictnbuff, len);
292         }
293         dictnbuff[len]=0; // nul terminate
294     }
295     ures_close(b);
296     UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status);
297     if (U_SUCCESS(status)) {
298         const CompactTrieDictionary *dict = new CompactTrieDictionary(
299             file, status);
300         if (U_SUCCESS(status) && dict == NULL) {
301             status = U_MEMORY_ALLOCATION_ERROR;
302         }
303         if (U_FAILURE(status)) {
304             delete dict;
305             dict = NULL;
306         }
307         return dict;
308     } else if (dictfname != NULL){
309         //create dummy dict if dictionary filename not valid
310         UChar c = 0x0020;
311         status = U_ZERO_ERROR;
312         MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE);
313         mtd->addWord(&c, 1, status, 1);
314         return new CompactTrieDictionary(*mtd, status);
315     }
316     return NULL;
317 }
318 
319 U_NAMESPACE_END
320 
321 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
322