• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 
10 #ifndef DICTBE_H
11 #define DICTBE_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
16 
17 #include "brkeng.h"
18 #include "hash.h"
19 #include "mlbe.h"
20 #include "uvectr32.h"
21 
22 U_NAMESPACE_BEGIN
23 
24 class DictionaryMatcher;
25 class MlBreakEngine;
26 class Normalizer2;
27 
28 /*******************************************************************
29  * DictionaryBreakEngine
30  */
31 
32 /**
33  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
34  * dictionary to determine language-specific breaks.</p>
35  *
36  * <p>After it is constructed a DictionaryBreakEngine may be shared between
37  * threads without synchronization.</p>
38  */
39 class DictionaryBreakEngine : public LanguageBreakEngine {
40  private:
41     /**
42      * The set of characters handled by this engine (see setCharacters()).
43      * @internal
44      */
45 
46   UnicodeSet    fSet;
47 
48  public:
49 
50   /**
51    * <p>Default constructor.</p>
52    */
53   DictionaryBreakEngine();
54 
55   /**
56    * <p>Virtual destructor.</p>
57    */
58   virtual ~DictionaryBreakEngine();
59 
60   /**
61    * <p>Indicate whether this engine handles a particular character. The
62    * character is the only argument; no break type is passed.</p>
63    *
64    * @param c A character which begins a run that the engine might handle
65    * @return true if this engine handles the particular character,
66    * otherwise false.
67    */
68   virtual UBool handles(UChar32 c) const override;
69 
70   /**
71    * <p>Find any breaks within a run in the supplied text.</p>
72    *
73    * @param text A UText representing the text. The iterator is left at
74    * the end of the run of characters which the engine is capable of handling
75    * that starts from the first character in the range.
76    * @param startPos The start of the run within the supplied text.
77    * @param endPos The end of the run within the supplied text.
78    * @param foundBreaks vector of int32_t to receive the break positions
      * @param isPhraseBreaking Presumably true when phrase-based breaking is
      * requested; its exact effect is not visible in this header -- TODO confirm.
79    * @param status Information on any errors encountered.
80    * @return The number of breaks found.
81    */
82   virtual int32_t findBreaks( UText *text,
83                               int32_t startPos,
84                               int32_t endPos,
85                               UVector32 &foundBreaks,
86                               UBool isPhraseBreaking,
87                               UErrorCode& status ) const override;
88 
89  protected:
90 
91  /**
92   * <p>Set the character set handled by this engine.</p>
93   *
94   * @param set A UnicodeSet of the set of characters handled by the engine
95   */
96   virtual void setCharacters( const UnicodeSet &set );
97 
98  /**
99   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
      *
      * <p>Pure virtual: every concrete engine declared in this header overrides it.</p>
100   *
101   * @param text A UText representing the text
102   * @param rangeStart The start of the range of dictionary characters
103   * @param rangeEnd The end of the range of dictionary characters
104   * @param foundBreaks A UVector32 to receive the break positions
      * @param isPhraseBreaking Presumably true when phrase-based breaking is
      * requested; its exact effect is not visible in this header -- TODO confirm.
105   * @param status Information on any errors encountered.
106   * @return The number of breaks found
107   */
108   virtual int32_t divideUpDictionaryRange( UText *text,
109                                            int32_t rangeStart,
110                                            int32_t rangeEnd,
111                                            UVector32 &foundBreaks,
112                                            UBool isPhraseBreaking,
113                                            UErrorCode& status) const = 0;
114 
115 };
116 
117 /*******************************************************************
118  * ThaiBreakEngine
119  */
120 
121 /**
122  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
123  * dictionary and heuristics to determine Thai-specific breaks.</p>
124  *
125  * <p>After it is constructed a ThaiBreakEngine may be shared between
126  * threads without synchronization.</p>
127  */
128 class ThaiBreakEngine : public DictionaryBreakEngine {
129  private:
130     /**
131      * Engine-private data: auxiliary character sets and the dictionary matcher.
132      * NOTE(review): the roles suggested by the member names (end-of-word,
         * begin-of-word, suffix, mark) and whether fDictionary holds the adopted
         * dictionary are not visible in this header -- confirm in the implementation.
132      * @internal
133      */
134 
135   UnicodeSet                fEndWordSet;
136   UnicodeSet                fBeginWordSet;
137   UnicodeSet                fSuffixSet;
138   UnicodeSet                fMarkSet;
139   DictionaryMatcher  *fDictionary;
140 
141  public:
142 
143   /**
144    * <p>Constructor.</p>
145    *
146    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
147    * engine is deleted.
      * @param status Information on any errors encountered.
148    */
149   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
150 
151   /**
152    * <p>Virtual destructor.</p>
153    */
154   virtual ~ThaiBreakEngine();
155 
156  protected:
157  /**
158   * <p>Divide up a range of known dictionary characters handled by this break engine.
      * Overrides the pure virtual declared in DictionaryBreakEngine.</p>
159   *
160   * @param text A UText representing the text
161   * @param rangeStart The start of the range of dictionary characters
162   * @param rangeEnd The end of the range of dictionary characters
163   * @param foundBreaks A UVector32 to receive the break positions
      * @param isPhraseBreaking Presumably true when phrase-based breaking is
      * requested; its exact effect is not visible in this header -- TODO confirm.
164   * @param status Information on any errors encountered.
165   * @return The number of breaks found
166   */
167   virtual int32_t divideUpDictionaryRange( UText *text,
168                                            int32_t rangeStart,
169                                            int32_t rangeEnd,
170                                            UVector32 &foundBreaks,
171                                            UBool isPhraseBreaking,
172                                            UErrorCode& status) const override;
173 
174 };
175 
176 /*******************************************************************
177  * LaoBreakEngine
178  */
179 
180 /**
181  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
182  * dictionary and heuristics to determine Lao-specific breaks.</p>
183  *
184  * <p>After it is constructed a LaoBreakEngine may be shared between
185  * threads without synchronization.</p>
186  */
187 class LaoBreakEngine : public DictionaryBreakEngine {
188  private:
189     /**
190      * Engine-private data: auxiliary character sets and the dictionary matcher.
         * NOTE(review): the roles suggested by the member names (end-of-word,
         * begin-of-word, mark) and whether fDictionary holds the adopted
         * dictionary are not visible in this header -- confirm in the implementation.
191      * @internal
192      */
193 
194   UnicodeSet                fEndWordSet;
195   UnicodeSet                fBeginWordSet;
196   UnicodeSet                fMarkSet;
197   DictionaryMatcher  *fDictionary;
198 
199  public:
200 
201   /**
202    * <p>Constructor.</p>
203    *
204    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
205    * engine is deleted.
      * @param status Information on any errors encountered.
206    */
207   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
208 
209   /**
210    * <p>Virtual destructor.</p>
211    */
212   virtual ~LaoBreakEngine();
213 
214  protected:
215  /**
216   * <p>Divide up a range of known dictionary characters handled by this break engine.
      * Overrides the pure virtual declared in DictionaryBreakEngine.</p>
217   *
218   * @param text A UText representing the text
219   * @param rangeStart The start of the range of dictionary characters
220   * @param rangeEnd The end of the range of dictionary characters
221   * @param foundBreaks A UVector32 to receive the break positions
      * @param isPhraseBreaking Presumably true when phrase-based breaking is
      * requested; its exact effect is not visible in this header -- TODO confirm.
222   * @param status Information on any errors encountered.
223   * @return The number of breaks found
224   */
225   virtual int32_t divideUpDictionaryRange( UText *text,
226                                            int32_t rangeStart,
227                                            int32_t rangeEnd,
228                                            UVector32 &foundBreaks,
229                                            UBool isPhraseBreaking,
230                                            UErrorCode& status) const override;
231 
232 };
233 
234 /*******************************************************************
235  * BurmeseBreakEngine
236  */
237 
238 /**
239  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
240  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
241  *
242  * <p>After it is constructed a BurmeseBreakEngine may be shared between
243  * threads without synchronization.</p>
244  */
245 class BurmeseBreakEngine : public DictionaryBreakEngine {
246  private:
247     /**
248      * Engine-private data: auxiliary character sets and the dictionary matcher.
         * NOTE(review): the roles suggested by the member names (end-of-word,
         * begin-of-word, mark) and whether fDictionary holds the adopted
         * dictionary are not visible in this header -- confirm in the implementation.
249      * @internal
250      */
251 
252   UnicodeSet                fEndWordSet;
253   UnicodeSet                fBeginWordSet;
254   UnicodeSet                fMarkSet;
255   DictionaryMatcher  *fDictionary;
256 
257  public:
258 
259   /**
260    * <p>Constructor.</p>
261    *
262    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
263    * engine is deleted.
      * @param status Information on any errors encountered.
264    */
265   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
266 
267   /**
268    * <p>Virtual destructor.</p>
269    */
270   virtual ~BurmeseBreakEngine();
271 
272  protected:
273  /**
274   * <p>Divide up a range of known dictionary characters.
      * Overrides the pure virtual declared in DictionaryBreakEngine.</p>
275   *
276   * @param text A UText representing the text
277   * @param rangeStart The start of the range of dictionary characters
278   * @param rangeEnd The end of the range of dictionary characters
279   * @param foundBreaks A UVector32 to receive the break positions
      * @param isPhraseBreaking Presumably true when phrase-based breaking is
      * requested; its exact effect is not visible in this header -- TODO confirm.
280   * @param status Information on any errors encountered.
281   * @return The number of breaks found
282   */
283   virtual int32_t divideUpDictionaryRange( UText *text,
284                                            int32_t rangeStart,
285                                            int32_t rangeEnd,
286                                            UVector32 &foundBreaks,
287                                            UBool isPhraseBreaking,
288                                            UErrorCode& status) const override;
289 
290 };
291 
292 /*******************************************************************
293  * KhmerBreakEngine
294  */
295 
296 /**
297  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
298  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
299  *
300  * <p>After it is constructed a KhmerBreakEngine may be shared between
301  * threads without synchronization.</p>
302  */
303 class KhmerBreakEngine : public DictionaryBreakEngine {
304  private:
305     /**
306      * Engine-private data: auxiliary character sets and the dictionary matcher.
         * NOTE(review): the roles suggested by the member names (end-of-word,
         * begin-of-word, mark) and whether fDictionary holds the adopted
         * dictionary are not visible in this header -- confirm in the implementation.
307      * @internal
308      */
309 
310   UnicodeSet                fEndWordSet;
311   UnicodeSet                fBeginWordSet;
312   UnicodeSet                fMarkSet;
313   DictionaryMatcher  *fDictionary;
314 
315  public:
316 
317   /**
318    * <p>Constructor.</p>
319    *
320    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
321    * engine is deleted.
      * @param status Information on any errors encountered.
322    */
323   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
324 
325   /**
326    * <p>Virtual destructor.</p>
327    */
328   virtual ~KhmerBreakEngine();
329 
330  protected:
331  /**
332   * <p>Divide up a range of known dictionary characters.
      * Overrides the pure virtual declared in DictionaryBreakEngine.</p>
333   *
334   * @param text A UText representing the text
335   * @param rangeStart The start of the range of dictionary characters
336   * @param rangeEnd The end of the range of dictionary characters
337   * @param foundBreaks A UVector32 to receive the break positions
      * @param isPhraseBreaking Presumably true when phrase-based breaking is
      * requested; its exact effect is not visible in this header -- TODO confirm.
338   * @param status Information on any errors encountered.
339   * @return The number of breaks found
340   */
341   virtual int32_t divideUpDictionaryRange( UText *text,
342                                            int32_t rangeStart,
343                                            int32_t rangeEnd,
344                                            UVector32 &foundBreaks,
345                                            UBool isPhraseBreaking,
346                                            UErrorCode& status) const override;
347 
348 };
349 
350 #if !UCONFIG_NO_NORMALIZATION
351 
352 /*******************************************************************
353  * CjkBreakEngine
354  */
355 
356 // Indicates the language/script that a CjkBreakEngine will handle (the `type` argument of its constructor).
357 enum LanguageType {
358     kKorean,
359     kChineseJapanese
360 };
361 
362 /**
363  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
364  * dictionary with costs associated with each word and
365  * Viterbi decoding to determine CJK-specific breaks.</p>
366  */
367 class CjkBreakEngine : public DictionaryBreakEngine {
368  protected:
369     /**
370      * Engine data visible to subclasses: auxiliary character sets, the
         * dictionary matcher, a normalizer, an MlBreakEngine and a mode flag.
         * NOTE(review): the roles suggested by the member names and the ownership
         * of the pointer members are not visible in this header -- confirm in
         * the implementation.
371      * @internal
372      */
373   UnicodeSet                fHangulWordSet;
374   UnicodeSet                fDigitOrOpenPunctuationOrAlphabetSet;
375   UnicodeSet                fClosePunctuationSet;
376 
377   DictionaryMatcher        *fDictionary;
378   const Normalizer2        *nfkcNorm2;
379   MlBreakEngine            *fMlBreakEngine;
380   bool                      isCj;  // NOTE(review): presumably true for kChineseJapanese -- confirm
381 
382  private:
383   // Load Japanese extensions. Presumably reports failure through `error` -- confirm.
384   void loadJapaneseExtensions(UErrorCode& error);
385   // Load Japanese Hiragana. Presumably reports failure through `error` -- confirm.
386   void loadHiragana(UErrorCode& error);
387   // Initialize fSkipSet by loading Japanese Hiragana and extensions.
388   void initJapanesePhraseParameter(UErrorCode& error);
389 
390   Hashtable fSkipSet;  // initialized by initJapanesePhraseParameter(), per its comment
391 
392  public:
393 
394     /**
395      * <p>Constructor.</p>
396      *
397      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
398      * engine is deleted. The DictionaryMatcher must contain costs for each word
399      * in order for the dictionary to work properly.
         * @param type The language/script this engine will handle: kKorean or
         * kChineseJapanese.
         * @param status Information on any errors encountered.
400      */
401   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
402 
403     /**
404      * <p>Virtual destructor.</p>
405      */
406   virtual ~CjkBreakEngine();
407 
408  protected:
409     /**
410      * <p>Divide up a range of known dictionary characters handled by this break engine.
         * Overrides the pure virtual declared in DictionaryBreakEngine.</p>
411      *
412      * @param text A UText representing the text
413      * @param rangeStart The start of the range of dictionary characters
414      * @param rangeEnd The end of the range of dictionary characters
415      * @param foundBreaks A UVector32 to receive the break positions
         * @param isPhraseBreaking Presumably true when phrase-based breaking is
         * requested; its exact effect is not visible in this header -- TODO confirm.
416      * @param status Information on any errors encountered.
417      * @return The number of breaks found
418      */
419   virtual int32_t divideUpDictionaryRange( UText *text,
420           int32_t rangeStart,
421           int32_t rangeEnd,
422           UVector32 &foundBreaks,
423           UBool isPhraseBreaking,
424           UErrorCode& status) const override;
425 
426 };
427 
428 #endif
429 
430 U_NAMESPACE_END
431 
432     /* DICTBE_H */
433 #endif
434