• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 
10 #ifndef DICTBE_H
11 #define DICTBE_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
16 
17 #include "brkeng.h"
18 #include "uvectr32.h"
19 
20 U_NAMESPACE_BEGIN
21 
22 class DictionaryMatcher;
23 class Normalizer2;
24 
25 /*******************************************************************
26  * DictionaryBreakEngine
27  */
28 
29 /**
30  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
31  * dictionary to determine language-specific breaks.</p>
32  *
33  * <p>After it is constructed a DictionaryBreakEngine may be shared between
34  * threads without synchronization.</p>
35  */
36 class DictionaryBreakEngine : public LanguageBreakEngine {
37  private:
38     /**
39      * The set of characters handled by this engine (presumably the set given to setCharacters() - confirm in the implementation).
40      * @internal
41      */
42 
43   UnicodeSet    fSet;
44 
45  public:
46 
47   /**
48    * <p>Default constructor.</p>
49    */
50   DictionaryBreakEngine();
51 
52   /**
53    * <p>Virtual destructor.</p>
54    */
55   virtual ~DictionaryBreakEngine();
56 
57   /**
58    * <p>Indicate whether this engine handles a particular character.</p>
59    * <p>Overrides LanguageBreakEngine::handles().</p>
60    *
61    * @param c A character which begins a run that the engine might handle
62    * @return true if this engine handles the particular character,
63    * false otherwise.
64    */
65   virtual UBool handles(UChar32 c) const override;
66 
67   /**
68    * <p>Find any breaks within a run in the supplied text. Overrides LanguageBreakEngine::findBreaks().</p>
69    *
70    * @param text A UText representing the text. The iterator is left at
71    * the end of the run of characters which the engine is capable of handling
72    * that starts from the first character in the range.
73    * @param startPos The start of the run within the supplied text.
74    * @param endPos The end of the run within the supplied text.
75    * @param foundBreaks vector of int32_t to receive the break positions
76    * @param status Information on any errors encountered.
77    * @return The number of breaks found.
78    */
79   virtual int32_t findBreaks( UText *text,
80                               int32_t startPos,
81                               int32_t endPos,
82                               UVector32 &foundBreaks,
83                               UErrorCode& status ) const override;
84 
85  protected:
86 
87  /**
88   * <p>Set the character set handled by this engine.</p>
89   *
90   * @param set A UnicodeSet of the set of characters handled by the engine
91   */
92   virtual void setCharacters( const UnicodeSet &set );
93 
94  /**
95   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
96   * <p>Pure virtual: each concrete subclass supplies its own implementation.</p>
97   * @param text A UText representing the text
98   * @param rangeStart The start of the range of dictionary characters
99   * @param rangeEnd The end of the range of dictionary characters
100   * @param foundBreaks vector of int32_t to receive the break positions
101   * @param status Information on any errors encountered.
102   * @return The number of breaks found
103   */
104   virtual int32_t divideUpDictionaryRange( UText *text,
105                                            int32_t rangeStart,
106                                            int32_t rangeEnd,
107                                            UVector32 &foundBreaks,
108                                            UErrorCode& status) const = 0;
109 
110 };
111 
112 /*******************************************************************
113  * ThaiBreakEngine
114  */
115 
116 /**
117  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
118  * dictionary and heuristics to determine Thai-specific breaks.</p>
119  *
120  * <p>After it is constructed a ThaiBreakEngine may be shared between
121  * threads without synchronization.</p>
122  */
123 class ThaiBreakEngine : public DictionaryBreakEngine {
124  private:
125     /**
126      * Unicode sets and the dictionary matcher pointer held by this engine.
127      * @internal
128      */
129 
130   UnicodeSet                fThaiWordSet;
131   UnicodeSet                fEndWordSet;
132   UnicodeSet                fBeginWordSet;
133   UnicodeSet                fSuffixSet;
134   UnicodeSet                fMarkSet;
135   DictionaryMatcher  *fDictionary;
136 
137  public:
138 
139   /**
140    * <p>Constructor.</p>
141    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
142    * engine is deleted.
143    * @param status Information on any errors encountered.
144    */
145   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
146 
147   /**
148    * <p>Virtual destructor.</p>
149    */
150   virtual ~ThaiBreakEngine();
151 
152  protected:
153  /**
154   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
155   * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange().</p>
156   * @param text A UText representing the text
157   * @param rangeStart The start of the range of dictionary characters
158   * @param rangeEnd The end of the range of dictionary characters
159   * @param foundBreaks vector of int32_t to receive the break positions
160   * @param status Information on any errors encountered.
161   * @return The number of breaks found
162   */
163   virtual int32_t divideUpDictionaryRange( UText *text,
164                                            int32_t rangeStart,
165                                            int32_t rangeEnd,
166                                            UVector32 &foundBreaks,
167                                            UErrorCode& status) const override;
168 
169 };
170 
171 /*******************************************************************
172  * LaoBreakEngine
173  */
174 
175 /**
176  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
177  * dictionary and heuristics to determine Lao-specific breaks.</p>
178  *
179  * <p>After it is constructed a LaoBreakEngine may be shared between
180  * threads without synchronization.</p>
181  */
182 class LaoBreakEngine : public DictionaryBreakEngine {
183  private:
184     /**
185      * Unicode sets and the dictionary matcher pointer held by this engine.
186      * @internal
187      */
188 
189   UnicodeSet                fLaoWordSet;
190   UnicodeSet                fEndWordSet;
191   UnicodeSet                fBeginWordSet;
192   UnicodeSet                fMarkSet;
193   DictionaryMatcher  *fDictionary;
194 
195  public:
196 
197   /**
198    * <p>Constructor.</p>
199    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
200    * engine is deleted.
201    * @param status Information on any errors encountered.
202    */
203   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
204 
205   /**
206    * <p>Virtual destructor.</p>
207    */
208   virtual ~LaoBreakEngine();
209 
210  protected:
211  /**
212   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
213   * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange().</p>
214   * @param text A UText representing the text
215   * @param rangeStart The start of the range of dictionary characters
216   * @param rangeEnd The end of the range of dictionary characters
217   * @param foundBreaks vector of int32_t to receive the break positions
218   * @param status Information on any errors encountered.
219   * @return The number of breaks found
220   */
221   virtual int32_t divideUpDictionaryRange( UText *text,
222                                            int32_t rangeStart,
223                                            int32_t rangeEnd,
224                                            UVector32 &foundBreaks,
225                                            UErrorCode& status) const override;
226 
227 };
228 
229 /*******************************************************************
230  * BurmeseBreakEngine
231  */
232 
233 /**
234  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
235  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
236  *
237  * <p>After it is constructed a BurmeseBreakEngine may be shared between
238  * threads without synchronization.</p>
239  */
240 class BurmeseBreakEngine : public DictionaryBreakEngine {
241  private:
242     /**
243      * Unicode sets and the dictionary matcher pointer held by this engine.
244      * @internal
245      */
246 
247   UnicodeSet                fBurmeseWordSet;
248   UnicodeSet                fEndWordSet;
249   UnicodeSet                fBeginWordSet;
250   UnicodeSet                fMarkSet;
251   DictionaryMatcher  *fDictionary;
252 
253  public:
254 
255   /**
256    * <p>Constructor.</p>
257    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
258    * engine is deleted.
259    * @param status Information on any errors encountered.
260    */
261   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
262 
263   /**
264    * <p>Virtual destructor.</p>
265    */
266   virtual ~BurmeseBreakEngine();
267 
268  protected:
269  /**
270   * <p>Divide up a range of known dictionary characters.</p>
271   * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange().</p>
272   * @param text A UText representing the text
273   * @param rangeStart The start of the range of dictionary characters
274   * @param rangeEnd The end of the range of dictionary characters
275   * @param foundBreaks vector of int32_t to receive the break positions
276   * @param status Information on any errors encountered.
277   * @return The number of breaks found
278   */
279   virtual int32_t divideUpDictionaryRange( UText *text,
280                                            int32_t rangeStart,
281                                            int32_t rangeEnd,
282                                            UVector32 &foundBreaks,
283                                            UErrorCode& status) const override;
284 
285 };
286 
287 /*******************************************************************
288  * KhmerBreakEngine
289  */
290 
291 /**
292  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
293  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
294  *
295  * <p>After it is constructed a KhmerBreakEngine may be shared between
296  * threads without synchronization.</p>
297  */
298 class KhmerBreakEngine : public DictionaryBreakEngine {
299  private:
300     /**
301      * Unicode sets and the dictionary matcher pointer held by this engine.
302      * @internal
303      */
304 
305   UnicodeSet                fKhmerWordSet;
306   UnicodeSet                fEndWordSet;
307   UnicodeSet                fBeginWordSet;
308   UnicodeSet                fMarkSet;
309   DictionaryMatcher  *fDictionary;
310 
311  public:
312 
313   /**
314    * <p>Constructor.</p>
315    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
316    * engine is deleted.
317    * @param status Information on any errors encountered.
318    */
319   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
320 
321   /**
322    * <p>Virtual destructor.</p>
323    */
324   virtual ~KhmerBreakEngine();
325 
326  protected:
327  /**
328   * <p>Divide up a range of known dictionary characters.</p>
329   * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange().</p>
330   * @param text A UText representing the text
331   * @param rangeStart The start of the range of dictionary characters
332   * @param rangeEnd The end of the range of dictionary characters
333   * @param foundBreaks vector of int32_t to receive the break positions
334   * @param status Information on any errors encountered.
335   * @return The number of breaks found
336   */
337   virtual int32_t divideUpDictionaryRange( UText *text,
338                                            int32_t rangeStart,
339                                            int32_t rangeEnd,
340                                            UVector32 &foundBreaks,
341                                            UErrorCode& status) const override;
342 
343 };
344 
345 #if !UCONFIG_NO_NORMALIZATION
346 
347 /*******************************************************************
348  * CjkBreakEngine
349  */
350 
351 // Indicates the language/script that a CjkBreakEngine will handle; passed as the `type` argument of its constructor.
352 enum LanguageType {
353     kKorean,
354     kChineseJapanese
355 };
356 
357 /**
358  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
359  * dictionary with costs associated with each word and
360  * Viterbi decoding to determine CJK-specific breaks.</p>
361  */
362 class CjkBreakEngine : public DictionaryBreakEngine {
363  protected:
364     /**
365      * Unicode sets, dictionary matcher pointer and Normalizer2 pointer held by this engine.
366      * @internal
367      */
368   UnicodeSet                fHangulWordSet;
369   UnicodeSet                fHanWordSet;
370   UnicodeSet                fKatakanaWordSet;
371   UnicodeSet                fHiraganaWordSet;
372 
373   DictionaryMatcher        *fDictionary;
374   const Normalizer2        *nfkcNorm2;
375 
376  public:
377 
378     /**
379      * <p>Constructor.</p>
380      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
381      * The DictionaryMatcher must contain costs for each word in order for the dictionary to work properly.
382      * @param type The language/script (a LanguageType value) that this engine will handle.
383      * @param status Information on any errors encountered.
384      */
385   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
386 
387     /**
388      * <p>Virtual destructor.</p>
389      */
390   virtual ~CjkBreakEngine();
391 
392  protected:
393     /**
394      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
395      * <p>Overrides DictionaryBreakEngine::divideUpDictionaryRange().</p>
396      * @param text A UText representing the text
397      * @param rangeStart The start of the range of dictionary characters
398      * @param rangeEnd The end of the range of dictionary characters
399      * @param foundBreaks vector of int32_t to receive the break positions
400      * @param status Information on any errors encountered.
401      * @return The number of breaks found
402      */
403   virtual int32_t divideUpDictionaryRange( UText *text,
404           int32_t rangeStart,
405           int32_t rangeEnd,
406           UVector32 &foundBreaks,
407           UErrorCode& status) const override;
408 
409 };
410 
411 #endif
412 
413 U_NAMESPACE_END
414 
415     /* DICTBE_H */
416 #endif
417