• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 
10 #ifndef DICTBE_H
11 #define DICTBE_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
16 
17 #include "brkeng.h"
18 #include "hash.h"
19 #include "uvectr32.h"
20 
21 U_NAMESPACE_BEGIN
22 
23 class DictionaryMatcher;
24 class Normalizer2;
25 
26 /*******************************************************************
27  * DictionaryBreakEngine
28  */
29 
30 /**
31  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
32  * dictionary to determine language-specific breaks.</p>
33  *
34  * <p>After it is constructed a DictionaryBreakEngine may be shared between
35  * threads without synchronization.</p>
36  */
37 class DictionaryBreakEngine : public LanguageBreakEngine {
38  private:
39     /**
40      * The set of characters handled by this engine
41      * @internal
42      */
43 
44   UnicodeSet    fSet;
45 
46  public:
47 
48   /**
49    * <p>Constructor </p>
50    */
51   DictionaryBreakEngine();
52 
53   /**
54    * <p>Virtual destructor.</p>
55    */
56   virtual ~DictionaryBreakEngine();
57 
58   /**
59    * <p>Indicate whether this engine handles a particular character for
60    * a particular kind of break.</p>
61    *
62    * @param c A character which begins a run that the engine might handle
63    * @return true if this engine handles the particular character and break
64    * type.
65    */
66   virtual UBool handles(UChar32 c) const override;
67 
68   /**
69    * <p>Find any breaks within a run in the supplied text.</p>
70    *
71    * @param text A UText representing the text. The iterator is left at
72    * the end of the run of characters which the engine is capable of handling
73    * that starts from the first character in the range.
74    * @param startPos The start of the run within the supplied text.
75    * @param endPos The end of the run within the supplied text.
76    * @param foundBreaks vector of int32_t to receive the break positions
77    * @param status Information on any errors encountered.
78    * @return The number of breaks found.
79    */
80   virtual int32_t findBreaks( UText *text,
81                               int32_t startPos,
82                               int32_t endPos,
83                               UVector32 &foundBreaks,
84                               UBool isPhraseBreaking,
85                               UErrorCode& status ) const override;
86 
87  protected:
88 
89  /**
90   * <p>Set the character set handled by this engine.</p>
91   *
92   * @param set A UnicodeSet of the set of characters handled by the engine
93   */
94   virtual void setCharacters( const UnicodeSet &set );
95 
96  /**
97   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
98   *
99   * @param text A UText representing the text
100   * @param rangeStart The start of the range of dictionary characters
101   * @param rangeEnd The end of the range of dictionary characters
102   * @param foundBreaks Output of C array of int32_t break positions, or 0
103   * @param status Information on any errors encountered.
104   * @return The number of breaks found
105   */
106   virtual int32_t divideUpDictionaryRange( UText *text,
107                                            int32_t rangeStart,
108                                            int32_t rangeEnd,
109                                            UVector32 &foundBreaks,
110                                            UBool isPhraseBreaking,
111                                            UErrorCode& status) const = 0;
112 
113 };
114 
115 /*******************************************************************
116  * ThaiBreakEngine
117  */
118 
119 /**
120  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
121  * dictionary and heuristics to determine Thai-specific breaks.</p>
122  *
123  * <p>After it is constructed a ThaiBreakEngine may be shared between
124  * threads without synchronization.</p>
125  */
126 class ThaiBreakEngine : public DictionaryBreakEngine {
127  private:
128     /**
129      * The set of characters handled by this engine
130      * @internal
131      */
132 
133   UnicodeSet                fEndWordSet;
134   UnicodeSet                fBeginWordSet;
135   UnicodeSet                fSuffixSet;
136   UnicodeSet                fMarkSet;
137   DictionaryMatcher  *fDictionary;
138 
139  public:
140 
141   /**
142    * <p>Default constructor.</p>
143    *
144    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
145    * engine is deleted.
146    */
147   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
148 
149   /**
150    * <p>Virtual destructor.</p>
151    */
152   virtual ~ThaiBreakEngine();
153 
154  protected:
155  /**
156   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
157   *
158   * @param text A UText representing the text
159   * @param rangeStart The start of the range of dictionary characters
160   * @param rangeEnd The end of the range of dictionary characters
161   * @param foundBreaks Output of C array of int32_t break positions, or 0
162   * @param status Information on any errors encountered.
163   * @return The number of breaks found
164   */
165   virtual int32_t divideUpDictionaryRange( UText *text,
166                                            int32_t rangeStart,
167                                            int32_t rangeEnd,
168                                            UVector32 &foundBreaks,
169                                            UBool isPhraseBreaking,
170                                            UErrorCode& status) const override;
171 
172 };
173 
174 /*******************************************************************
175  * LaoBreakEngine
176  */
177 
178 /**
179  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
180  * dictionary and heuristics to determine Lao-specific breaks.</p>
181  *
182  * <p>After it is constructed a LaoBreakEngine may be shared between
183  * threads without synchronization.</p>
184  */
185 class LaoBreakEngine : public DictionaryBreakEngine {
186  private:
187     /**
188      * The set of characters handled by this engine
189      * @internal
190      */
191 
192   UnicodeSet                fEndWordSet;
193   UnicodeSet                fBeginWordSet;
194   UnicodeSet                fMarkSet;
195   DictionaryMatcher  *fDictionary;
196 
197  public:
198 
199   /**
200    * <p>Default constructor.</p>
201    *
202    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
203    * engine is deleted.
204    */
205   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
206 
207   /**
208    * <p>Virtual destructor.</p>
209    */
210   virtual ~LaoBreakEngine();
211 
212  protected:
213  /**
214   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
215   *
216   * @param text A UText representing the text
217   * @param rangeStart The start of the range of dictionary characters
218   * @param rangeEnd The end of the range of dictionary characters
219   * @param foundBreaks Output of C array of int32_t break positions, or 0
220   * @param status Information on any errors encountered.
221   * @return The number of breaks found
222   */
223   virtual int32_t divideUpDictionaryRange( UText *text,
224                                            int32_t rangeStart,
225                                            int32_t rangeEnd,
226                                            UVector32 &foundBreaks,
227                                            UBool isPhraseBreaking,
228                                            UErrorCode& status) const override;
229 
230 };
231 
232 /*******************************************************************
233  * BurmeseBreakEngine
234  */
235 
236 /**
237  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
238  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
239  *
240  * <p>After it is constructed a BurmeseBreakEngine may be shared between
241  * threads without synchronization.</p>
242  */
243 class BurmeseBreakEngine : public DictionaryBreakEngine {
244  private:
245     /**
246      * The set of characters handled by this engine
247      * @internal
248      */
249 
250   UnicodeSet                fEndWordSet;
251   UnicodeSet                fBeginWordSet;
252   UnicodeSet                fMarkSet;
253   DictionaryMatcher  *fDictionary;
254 
255  public:
256 
257   /**
258    * <p>Default constructor.</p>
259    *
260    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
261    * engine is deleted.
262    */
263   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
264 
265   /**
266    * <p>Virtual destructor.</p>
267    */
268   virtual ~BurmeseBreakEngine();
269 
270  protected:
271  /**
272   * <p>Divide up a range of known dictionary characters.</p>
273   *
274   * @param text A UText representing the text
275   * @param rangeStart The start of the range of dictionary characters
276   * @param rangeEnd The end of the range of dictionary characters
277   * @param foundBreaks Output of C array of int32_t break positions, or 0
278   * @param status Information on any errors encountered.
279   * @return The number of breaks found
280   */
281   virtual int32_t divideUpDictionaryRange( UText *text,
282                                            int32_t rangeStart,
283                                            int32_t rangeEnd,
284                                            UVector32 &foundBreaks,
285                                            UBool isPhraseBreaking,
286                                            UErrorCode& status) const override;
287 
288 };
289 
290 /*******************************************************************
291  * KhmerBreakEngine
292  */
293 
294 /**
295  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
296  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
297  *
298  * <p>After it is constructed a KhmerBreakEngine may be shared between
299  * threads without synchronization.</p>
300  */
301 class KhmerBreakEngine : public DictionaryBreakEngine {
302  private:
303     /**
304      * The set of characters handled by this engine
305      * @internal
306      */
307 
308   UnicodeSet                fEndWordSet;
309   UnicodeSet                fBeginWordSet;
310   UnicodeSet                fMarkSet;
311   DictionaryMatcher  *fDictionary;
312 
313  public:
314 
315   /**
316    * <p>Default constructor.</p>
317    *
318    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
319    * engine is deleted.
320    */
321   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
322 
323   /**
324    * <p>Virtual destructor.</p>
325    */
326   virtual ~KhmerBreakEngine();
327 
328  protected:
329  /**
330   * <p>Divide up a range of known dictionary characters.</p>
331   *
332   * @param text A UText representing the text
333   * @param rangeStart The start of the range of dictionary characters
334   * @param rangeEnd The end of the range of dictionary characters
335   * @param foundBreaks Output of C array of int32_t break positions, or 0
336   * @param status Information on any errors encountered.
337   * @return The number of breaks found
338   */
339   virtual int32_t divideUpDictionaryRange( UText *text,
340                                            int32_t rangeStart,
341                                            int32_t rangeEnd,
342                                            UVector32 &foundBreaks,
343                                            UBool isPhraseBreaking,
344                                            UErrorCode& status) const override;
345 
346 };
347 
348 #if !UCONFIG_NO_NORMALIZATION
349 
350 /*******************************************************************
351  * CjkBreakEngine
352  */
353 
/**
 * Selects the language/script that a CjkBreakEngine instance will handle.
 */
enum LanguageType {
    /** The engine handles Korean. */
    kKorean = 0,
    /** The engine handles Chinese and Japanese. */
    kChineseJapanese = 1
};
359 
360 /**
361  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
362  * dictionary with costs associated with each word and
363  * Viterbi decoding to determine CJK-specific breaks.</p>
364  */
365 class CjkBreakEngine : public DictionaryBreakEngine {
366  protected:
367     /**
368      * The set of characters handled by this engine
369      * @internal
370      */
371   UnicodeSet                fHangulWordSet;
372   UnicodeSet                fDigitOrOpenPunctuationOrAlphabetSet;
373   UnicodeSet                fClosePunctuationSet;
374 
375   DictionaryMatcher        *fDictionary;
376   const Normalizer2        *nfkcNorm2;
377 
378  private:
379   // Load Japanese extensions.
380   void loadJapaneseExtensions(UErrorCode& error);
381   // Load Japanese Hiragana.
382   void loadHiragana(UErrorCode& error);
383   // Initialize fSkipSet by loading Japanese Hiragana and extensions.
384   void initJapanesePhraseParameter(UErrorCode& error);
385 
386   Hashtable fSkipSet;
387 
388  public:
389 
390     /**
391      * <p>Default constructor.</p>
392      *
393      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
394      * engine is deleted. The DictionaryMatcher must contain costs for each word
395      * in order for the dictionary to work properly.
396      */
397   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
398 
399     /**
400      * <p>Virtual destructor.</p>
401      */
402   virtual ~CjkBreakEngine();
403 
404  protected:
405     /**
406      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
407      *
408      * @param text A UText representing the text
409      * @param rangeStart The start of the range of dictionary characters
410      * @param rangeEnd The end of the range of dictionary characters
411      * @param foundBreaks Output of C array of int32_t break positions, or 0
412      * @param status Information on any errors encountered.
413      * @return The number of breaks found
414      */
415   virtual int32_t divideUpDictionaryRange( UText *text,
416           int32_t rangeStart,
417           int32_t rangeEnd,
418           UVector32 &foundBreaks,
419           UBool isPhraseBreaking,
420           UErrorCode& status) const override;
421 
422 };
423 
424 #endif
425 
426 U_NAMESPACE_END
427 
428     /* DICTBE_H */
429 #endif
430