• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 
10 #ifndef DICTBE_H
11 #define DICTBE_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
16 
17 #include "brkeng.h"
18 #include "uvectr32.h"
19 
20 U_NAMESPACE_BEGIN
21 
22 class DictionaryMatcher;
23 class Normalizer2;
24 
25 /*******************************************************************
26  * DictionaryBreakEngine
27  */
28 
29 /**
30  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
31  * dictionary to determine language-specific breaks. It is an abstract base:
32  * concrete engines implement divideUpDictionaryRange().</p>
33  * <p>After it is constructed a DictionaryBreakEngine may be shared between
34  * threads without synchronization.</p>
35  */
36 class DictionaryBreakEngine : public LanguageBreakEngine {
37  private:
38     /**
39      * The set of characters handled by this engine.
40      * @internal
41      */
42 
43   UnicodeSet    fSet;
44 
45     /**
46      * The break types handled by this engine, as a bitmap (cf. the breakTypes constructor argument).
47      * @internal
48      */
49 
50   uint32_t      fTypes;
51 
52   /**
53    * <p>Default constructor.</p>
54    * Declared private, so it cannot be called by subclasses or by outside code.
55    */
56   DictionaryBreakEngine();
57 
58  public:
59 
60   /**
61    * <p>Constructor setting the break types handled.</p>
62    *
63    * @param breakTypes A bitmap of types handled by the engine.
64    */
65   DictionaryBreakEngine( uint32_t breakTypes );
66 
67   /**
68    * <p>Virtual destructor.</p>
69    */
70   virtual ~DictionaryBreakEngine();
71 
72   /**
73    * <p>Indicate whether this engine handles a particular character for
74    * a particular kind of break.</p>
75    *
76    * @param c A character which begins a run that the engine might handle
77    * @param breakType The type of text break which the caller wants to determine
78    * @return TRUE if this engine handles the particular character and break
79    * type.
80    */
81   virtual UBool handles( UChar32 c, int32_t breakType ) const;
82 
83   /**
84    * <p>Find any breaks within a run in the supplied text.</p>
85    *
86    * @param text A UText representing the text. The iterator is left at
87    * the end of the run of characters which the engine is capable of handling
88    * that starts from the first character in the range.
89    * @param startPos The start of the run within the supplied text.
90    * @param endPos The end of the run within the supplied text.
91    * @param breakType The type of break desired, or -1.
92    * @param foundBreaks A UVector32 (vector of int32_t) that receives the break positions.
93    * @return The number of breaks found.
94    */
95   virtual int32_t findBreaks( UText *text,
96                               int32_t startPos,
97                               int32_t endPos,
98                               int32_t breakType,
99                               UVector32 &foundBreaks ) const;
100 
101  protected:
102 
103  /**
104   * <p>Set the character set handled by this engine.</p>
105   *
106   * @param set A UnicodeSet of the set of characters handled by the engine
107   */
108   virtual void setCharacters( const UnicodeSet &set );
109 
110  /**
111   * <p>Set the break types handled by this engine.</p>
112   * NOTE: the declaration below is commented out, so this setter is not part of the class.
113   * @param breakTypes A bitmap of types handled by the engine.
114   */
115 //  virtual void setBreakTypes( uint32_t breakTypes );
116 
117  /**
118   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
119   * Pure virtual: every concrete engine must provide the implementation.
120   * @param text A UText representing the text
121   * @param rangeStart The start of the range of dictionary characters
122   * @param rangeEnd The end of the range of dictionary characters
123   * @param foundBreaks A UVector32 that receives the break positions (passed by reference, so never null)
124   * @return The number of breaks found
125   */
126   virtual int32_t divideUpDictionaryRange( UText *text,
127                                            int32_t rangeStart,
128                                            int32_t rangeEnd,
129                                            UVector32 &foundBreaks ) const = 0;
130 
131 };
132 
133 /*******************************************************************
134  * ThaiBreakEngine
135  */
136 
137 /**
138  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
139  * dictionary and heuristics to determine Thai-specific breaks.</p>
140  *
141  * <p>After it is constructed a ThaiBreakEngine may be shared between
142  * threads without synchronization.</p>
143  */
144 class ThaiBreakEngine : public DictionaryBreakEngine {
145  private:
146     /**
147      * Character sets and the dictionary matcher kept as state by this engine.
148      * @internal
149      */
150 
151   UnicodeSet                fThaiWordSet;
152   UnicodeSet                fEndWordSet;
153   UnicodeSet                fBeginWordSet;
154   UnicodeSet                fSuffixSet;
155   UnicodeSet                fMarkSet;
156   DictionaryMatcher  *fDictionary;  // NOTE(review): presumably the adopted (owned) matcher; no copy operations are declared here -- confirm copying is prevented.
157 
158  public:
159 
160   /**
161    * <p>Constructs the engine around an adopted dictionary.</p>
162    *
163    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
164    * @param status Error code, passed by reference; how it is set is defined in the implementation file.
165    */
166   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
167 
168   /**
169    * <p>Virtual destructor.</p>
170    */
171   virtual ~ThaiBreakEngine();
172 
173  protected:
174  /**
175   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
176   * Implements the pure virtual function declared in DictionaryBreakEngine.
177   * @param text A UText representing the text
178   * @param rangeStart The start of the range of dictionary characters
179   * @param rangeEnd The end of the range of dictionary characters
180   * @param foundBreaks A UVector32 that receives the break positions (passed by reference, so never null)
181   * @return The number of breaks found
182   */
183   virtual int32_t divideUpDictionaryRange( UText *text,
184                                            int32_t rangeStart,
185                                            int32_t rangeEnd,
186                                            UVector32 &foundBreaks ) const;
187 
188 };
189 
190 /*******************************************************************
191  * LaoBreakEngine
192  */
193 
194 /**
195  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
196  * dictionary and heuristics to determine Lao-specific breaks.</p>
197  *
198  * <p>After it is constructed a LaoBreakEngine may be shared between
199  * threads without synchronization.</p>
200  */
201 class LaoBreakEngine : public DictionaryBreakEngine {
202  private:
203     /**
204      * Character sets and the dictionary matcher kept as state by this engine.
205      * @internal
206      */
207 
208   UnicodeSet                fLaoWordSet;
209   UnicodeSet                fEndWordSet;
210   UnicodeSet                fBeginWordSet;
211   UnicodeSet                fMarkSet;
212   DictionaryMatcher  *fDictionary;  // NOTE(review): presumably the adopted (owned) matcher; no copy operations are declared here -- confirm copying is prevented.
213 
214  public:
215 
216   /**
217    * <p>Constructs the engine around an adopted dictionary.</p>
218    *
219    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
220    * @param status Error code, passed by reference; how it is set is defined in the implementation file.
221    */
222   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
223 
224   /**
225    * <p>Virtual destructor.</p>
226    */
227   virtual ~LaoBreakEngine();
228 
229  protected:
230  /**
231   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
232   * Implements the pure virtual function declared in DictionaryBreakEngine.
233   * @param text A UText representing the text
234   * @param rangeStart The start of the range of dictionary characters
235   * @param rangeEnd The end of the range of dictionary characters
236   * @param foundBreaks A UVector32 that receives the break positions (passed by reference, so never null)
237   * @return The number of breaks found
238   */
239   virtual int32_t divideUpDictionaryRange( UText *text,
240                                            int32_t rangeStart,
241                                            int32_t rangeEnd,
242                                            UVector32 &foundBreaks ) const;
243 
244 };
245 
246 /*******************************************************************
247  * BurmeseBreakEngine
248  */
249 
250 /**
251  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
252  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
253  *
254  * <p>After it is constructed a BurmeseBreakEngine may be shared between
255  * threads without synchronization.</p>
256  */
257 class BurmeseBreakEngine : public DictionaryBreakEngine {
258  private:
259     /**
260      * Character sets and the dictionary matcher kept as state by this engine.
261      * @internal
262      */
263 
264   UnicodeSet                fBurmeseWordSet;
265   UnicodeSet                fEndWordSet;
266   UnicodeSet                fBeginWordSet;
267   UnicodeSet                fMarkSet;
268   DictionaryMatcher  *fDictionary;  // NOTE(review): presumably the adopted (owned) matcher; no copy operations are declared here -- confirm copying is prevented.
269 
270  public:
271 
272   /**
273    * <p>Constructs the engine around an adopted dictionary.</p>
274    *
275    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
276    * @param status Error code, passed by reference; how it is set is defined in the implementation file.
277    */
278   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
279 
280   /**
281    * <p>Virtual destructor.</p>
282    */
283   virtual ~BurmeseBreakEngine();
284 
285  protected:
286  /**
287   * <p>Divide up a range of known dictionary characters.</p>
288   * Implements the pure virtual function declared in DictionaryBreakEngine.
289   * @param text A UText representing the text
290   * @param rangeStart The start of the range of dictionary characters
291   * @param rangeEnd The end of the range of dictionary characters
292   * @param foundBreaks A UVector32 that receives the break positions (passed by reference, so never null)
293   * @return The number of breaks found
294   */
295   virtual int32_t divideUpDictionaryRange( UText *text,
296                                            int32_t rangeStart,
297                                            int32_t rangeEnd,
298                                            UVector32 &foundBreaks ) const;
299 
300 };
301 
302 /*******************************************************************
303  * KhmerBreakEngine
304  */
305 
306 /**
307  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
308  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
309  *
310  * <p>After it is constructed a KhmerBreakEngine may be shared between
311  * threads without synchronization.</p>
312  */
313 class KhmerBreakEngine : public DictionaryBreakEngine {
314  private:
315     /**
316      * Character sets and the dictionary matcher kept as state by this engine.
317      * @internal
318      */
319 
320   UnicodeSet                fKhmerWordSet;
321   UnicodeSet                fEndWordSet;
322   UnicodeSet                fBeginWordSet;
323   UnicodeSet                fMarkSet;
324   DictionaryMatcher  *fDictionary;  // NOTE(review): presumably the adopted (owned) matcher; no copy operations are declared here -- confirm copying is prevented.
325 
326  public:
327 
328   /**
329    * <p>Constructs the engine around an adopted dictionary.</p>
330    *
331    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
332    * @param status Error code, passed by reference; how it is set is defined in the implementation file.
333    */
334   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
335 
336   /**
337    * <p>Virtual destructor.</p>
338    */
339   virtual ~KhmerBreakEngine();
340 
341  protected:
342  /**
343   * <p>Divide up a range of known dictionary characters.</p>
344   * Implements the pure virtual function declared in DictionaryBreakEngine.
345   * @param text A UText representing the text
346   * @param rangeStart The start of the range of dictionary characters
347   * @param rangeEnd The end of the range of dictionary characters
348   * @param foundBreaks A UVector32 that receives the break positions (passed by reference, so never null)
349   * @return The number of breaks found
350   */
351   virtual int32_t divideUpDictionaryRange( UText *text,
352                                            int32_t rangeStart,
353                                            int32_t rangeEnd,
354                                            UVector32 &foundBreaks ) const;
355 
356 };
357 
358 #if !UCONFIG_NO_NORMALIZATION
359 
360 /*******************************************************************
361  * CjkBreakEngine
362  */
363 
364 // Indicates the language/script that a CjkBreakEngine will handle; passed as the `type` argument of its constructor.
365 enum LanguageType {
366     kKorean,            // Korean (presumably Hangul text -- confirm in the implementation file)
367     kChineseJapanese    // Chinese and Japanese, selected together by a single value
368 };
369 
370 /**
371  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
372  * dictionary with costs associated with each word and
373  * Viterbi decoding to determine CJK-specific breaks.</p>
374  */
375 class CjkBreakEngine : public DictionaryBreakEngine {
376  protected:
377     /**
378      * Character sets, dictionary matcher and normalizer kept as state by this engine.
379      * @internal
380      */
381   UnicodeSet                fHangulWordSet;
382   UnicodeSet                fHanWordSet;
383   UnicodeSet                fKatakanaWordSet;
384   UnicodeSet                fHiraganaWordSet;
385 
386   DictionaryMatcher        *fDictionary;  // NOTE(review): presumably the adopted (owned) matcher; no copy operations are declared here -- confirm copying is prevented.
387   const Normalizer2        *nfkcNorm2;    // presumably an NFKC normalizer, judging by the name -- ownership is not visible here; confirm.
388 
389  public:
390 
391     /**
392      * <p>Constructs the engine around an adopted dictionary that carries word costs.</p>
393      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
394      * The DictionaryMatcher must contain costs for each word in order for the dictionary to work properly.
395      * @param type The language/script (a LanguageType value) that this engine will handle.
396      * @param status Error code, passed by reference; how it is set is defined in the implementation file.
397      */
398   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
399 
400     /**
401      * <p>Virtual destructor.</p>
402      */
403   virtual ~CjkBreakEngine();
404 
405  protected:
406     /**
407      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
408      * Implements the pure virtual function declared in DictionaryBreakEngine.
409      * @param text A UText representing the text
410      * @param rangeStart The start of the range of dictionary characters
411      * @param rangeEnd The end of the range of dictionary characters
412      * @param foundBreaks A UVector32 that receives the break positions (passed by reference, so never null)
413      * @return The number of breaks found
414      */
415   virtual int32_t divideUpDictionaryRange( UText *text,
416           int32_t rangeStart,
417           int32_t rangeEnd,
418           UVector32 &foundBreaks ) const;
419 
420 };
421 
422 #endif
423 
424 U_NAMESPACE_END
425 
426     /* DICTBE_H */
427 #endif
428