• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  *******************************************************************************
3  * Copyright (C) 2006,2012-2013, International Business Machines Corporation   *
4  * and others. All Rights Reserved.                                            *
5  *******************************************************************************
6  */
7 
8 #ifndef DICTBE_H
9 #define DICTBE_H
10 
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
14 
15 #include "brkeng.h"
16 
17 U_NAMESPACE_BEGIN
18 
19 class DictionaryMatcher;
20 
21 /*******************************************************************
22  * DictionaryBreakEngine
23  */
24 
25 /**
26  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27  * dictionary to determine language-specific breaks.</p>
28  *
29  * <p>After it is constructed a DictionaryBreakEngine may be shared between
30  * threads without synchronization.</p>
31  */
32 class DictionaryBreakEngine : public LanguageBreakEngine {
33  private:
34     /**
35      * The set of characters handled by this engine
36      * @internal
37      */
38 
39   UnicodeSet    fSet;
40 
41     /**
42      * A bitmap of the break types handled by this engine
43      * @internal
44      */
45 
46   uint32_t      fTypes;
47 
48   /**
49    * <p>Default constructor.</p>
50    * Declared private, so it is not accessible to subclasses or other callers.
51    */
52   DictionaryBreakEngine();
53 
54  public:
55 
56   /**
57    * <p>Constructor setting the break types handled.</p>
58    *
59    * @param breakTypes A bitmap of types handled by the engine.
60    */
61   DictionaryBreakEngine( uint32_t breakTypes );
62 
63   /**
64    * <p>Virtual destructor.</p>
65    */
66   virtual ~DictionaryBreakEngine();
67 
68   /**
69    * <p>Indicate whether this engine handles a particular character for
70    * a particular kind of break.</p>
71    *
72    * @param c A character which begins a run that the engine might handle
73    * @param breakType The type of text break which the caller wants to determine
74    * @return TRUE if this engine handles the particular character and break
75    * type.
76    */
77   virtual UBool handles( UChar32 c, int32_t breakType ) const;
78 
79   /**
80    * <p>Find any breaks within a run in the supplied text.</p>
81    *
82    * @param text A UText representing the text. The iterator is left at
83    * the end of the run of characters which the engine is capable of handling
84    * that starts from the first (or last) character in the range.
85    * @param startPos The start of the run within the supplied text.
86    * @param endPos The end of the run within the supplied text.
87    * @param reverse Whether the caller is looking for breaks in a reverse
88    * direction.
89    * @param breakType The type of break desired, or -1.
90    * @param foundBreaks Output: the UStack that receives the breaks found, if any
91    * @return The number of breaks found.
92    */
93   virtual int32_t findBreaks( UText *text,
94                               int32_t startPos,
95                               int32_t endPos,
96                               UBool reverse,
97                               int32_t breakType,
98                               UStack &foundBreaks ) const;
99 
100  protected:
101 
102  /**
103   * <p>Set the character set handled by this engine.</p>
104   *
105   * @param set A UnicodeSet of the set of characters handled by the engine
106   */
107   virtual void setCharacters( const UnicodeSet &set );
108 
109  /**
110   * <p>Set the break types handled by this engine.</p>
111   * (Not currently available: the declaration below is commented out.)
112   * @param breakTypes A bitmap of types handled by the engine.
113   */
114 //  virtual void setBreakTypes( uint32_t breakTypes );
115 
116  /**
117   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
118   * Pure virtual: declared here and overridden by each engine subclass in this header.
119   * @param text A UText representing the text
120   * @param rangeStart The start of the range of dictionary characters
121   * @param rangeEnd The end of the range of dictionary characters
122   * @param foundBreaks Output: the UStack that receives the break positions found
123   * @return The number of breaks found
124   */
125   virtual int32_t divideUpDictionaryRange( UText *text,
126                                            int32_t rangeStart,
127                                            int32_t rangeEnd,
128                                            UStack &foundBreaks ) const = 0;
129 
130 };
131 
132 /*******************************************************************
133  * ThaiBreakEngine
134  */
135 
136 /**
137  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138  * dictionary and heuristics to determine Thai-specific breaks.</p>
139  *
140  * <p>After it is constructed a ThaiBreakEngine may be shared between
141  * threads without synchronization.</p>
142  */
143 class ThaiBreakEngine : public DictionaryBreakEngine {
144  private:
145     /**
146      * Character sets and the DictionaryMatcher pointer held by this engine
147      * @internal
148      */
149 
150   UnicodeSet                fThaiWordSet;
151   UnicodeSet                fEndWordSet;
152   UnicodeSet                fBeginWordSet;
153   UnicodeSet                fSuffixSet;
154   UnicodeSet                fMarkSet;
155   DictionaryMatcher  *fDictionary;
156 
157  public:
158 
159   /**
160    * <p>Constructor.</p>
161    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
162    * engine is deleted.
163    * @param status UErrorCode reference; presumably reports setup errors (TODO confirm).
164    */
165   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
166 
167   /**
168    * <p>Virtual destructor.</p>
169    */
170   virtual ~ThaiBreakEngine();
171 
172  protected:
173  /**
174   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
175   *
176   * @param text A UText representing the text
177   * @param rangeStart The start of the range of dictionary characters
178   * @param rangeEnd The end of the range of dictionary characters
179   * @param foundBreaks Output: the UStack that receives the break positions found
180   * @return The number of breaks found
181   */
182   virtual int32_t divideUpDictionaryRange( UText *text,
183                                            int32_t rangeStart,
184                                            int32_t rangeEnd,
185                                            UStack &foundBreaks ) const;
186 
187 };
188 
189 /*******************************************************************
190  * LaoBreakEngine
191  */
192 
193 /**
194  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
195  * dictionary and heuristics to determine Lao-specific breaks.</p>
196  *
197  * <p>After it is constructed a LaoBreakEngine may be shared between
198  * threads without synchronization.</p>
199  */
200 class LaoBreakEngine : public DictionaryBreakEngine {
201  private:
202     /**
203      * Character sets and the DictionaryMatcher pointer held by this engine
204      * @internal
205      */
206 
207   UnicodeSet                fLaoWordSet;
208   UnicodeSet                fEndWordSet;
209   UnicodeSet                fBeginWordSet;
210   UnicodeSet                fMarkSet;
211   DictionaryMatcher  *fDictionary;
212 
213  public:
214 
215   /**
216    * <p>Constructor.</p>
217    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
218    * engine is deleted.
219    * @param status UErrorCode reference; presumably reports setup errors (TODO confirm).
220    */
221   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
222 
223   /**
224    * <p>Virtual destructor.</p>
225    */
226   virtual ~LaoBreakEngine();
227 
228  protected:
229  /**
230   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
231   *
232   * @param text A UText representing the text
233   * @param rangeStart The start of the range of dictionary characters
234   * @param rangeEnd The end of the range of dictionary characters
235   * @param foundBreaks Output: the UStack that receives the break positions found
236   * @return The number of breaks found
237   */
238   virtual int32_t divideUpDictionaryRange( UText *text,
239                                            int32_t rangeStart,
240                                            int32_t rangeEnd,
241                                            UStack &foundBreaks ) const;
242 
243 };
244 
245 /*******************************************************************
246  * KhmerBreakEngine
247  */
248 
249 /**
250  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
251  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
252  *
253  * <p>After it is constructed a KhmerBreakEngine may be shared between
254  * threads without synchronization.</p>
255  */
256 class KhmerBreakEngine : public DictionaryBreakEngine {
257  private:
258     /**
259      * Character sets and the DictionaryMatcher pointer held by this engine
260      * @internal
261      */
262 
263   UnicodeSet                fKhmerWordSet;
264   UnicodeSet                fEndWordSet;
265   UnicodeSet                fBeginWordSet;
266   UnicodeSet                fMarkSet;
267   DictionaryMatcher  *fDictionary;
268 
269  public:
270 
271   /**
272    * <p>Constructor.</p>
273    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
274    * engine is deleted.
275    * @param status UErrorCode reference; presumably reports setup errors (TODO confirm).
276    */
277   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
278 
279   /**
280    * <p>Virtual destructor.</p>
281    */
282   virtual ~KhmerBreakEngine();
283 
284  protected:
285  /**
286   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
287   *
288   * @param text A UText representing the text
289   * @param rangeStart The start of the range of dictionary characters
290   * @param rangeEnd The end of the range of dictionary characters
291   * @param foundBreaks Output: the UStack that receives the break positions found
292   * @return The number of breaks found
293   */
294   virtual int32_t divideUpDictionaryRange( UText *text,
295                                            int32_t rangeStart,
296                                            int32_t rangeEnd,
297                                            UStack &foundBreaks ) const;
298 
299 };
300 
301 #if !UCONFIG_NO_NORMALIZATION
302 
303 /*******************************************************************
304  * CjkBreakEngine
305  */
306 
307 // Selects the language/script a CjkBreakEngine will handle; passed to its constructor.
308 enum LanguageType {
309     kKorean,
310     kChineseJapanese
311 };
312 
313 /**
314  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
315  * dictionary with costs associated with each word and
316  * Viterbi decoding to determine CJK-specific breaks.</p>
317  */
318 class CjkBreakEngine : public DictionaryBreakEngine {
319  protected:
320     /**
321      * Character sets and the DictionaryMatcher pointer held by this engine
322      * @internal
323      */
324   UnicodeSet                fHangulWordSet;
325   UnicodeSet                fHanWordSet;
326   UnicodeSet                fKatakanaWordSet;
327   UnicodeSet                fHiraganaWordSet;
328 
329   DictionaryMatcher  *fDictionary;
330 
331  public:
332 
333     /**
334      * <p>Constructor.</p>
335      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
336      * It must contain costs for each word in order for the dictionary to work properly.
337      * @param type Language/script this engine will handle (kKorean or kChineseJapanese).
338      * @param status UErrorCode reference; presumably reports setup errors (TODO confirm).
339      */
340   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
341 
342     /**
343      * <p>Virtual destructor.</p>
344      */
345   virtual ~CjkBreakEngine();
346 
347  protected:
348     /**
349      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
350      *
351      * @param text A UText representing the text
352      * @param rangeStart The start of the range of dictionary characters
353      * @param rangeEnd The end of the range of dictionary characters
354      * @param foundBreaks Output: the UStack that receives the break positions found
355      * @return The number of breaks found
356      */
357   virtual int32_t divideUpDictionaryRange( UText *text,
358           int32_t rangeStart,
359           int32_t rangeEnd,
360           UStack &foundBreaks ) const;
361 
362 };
363 
364 #endif
365 
366 U_NAMESPACE_END
367 
368     /* DICTBE_H */
369 #endif
370