• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  ************************************************************************************
5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6  * All Rights Reserved.                                                             *
7  ************************************************************************************
8  */
9 
10 #ifndef BRKENG_H
11 #define BRKENG_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
17 
18 U_NAMESPACE_BEGIN
19 
20 class UnicodeSet;         // used only by pointer: UnhandledEngine::fHandled
21 class UStack;             // used only by pointer: ICULanguageBreakFactory::fEngines
22 class UVector32;          // used only by reference: the foundBreaks parameter of findBreaks()
23 class DictionaryMatcher;  // used only by pointer: return type of loadDictionaryMatcherFor()
24 
25 /*******************************************************************
26  * LanguageBreakEngine
27  */
28 
29 /**
30  * <p>A LanguageBreakEngine implements language-specific knowledge for
31  * finding text boundaries within a run of characters belonging to a
32  * specific set. The boundaries will be of a specific kind, e.g. word,
33  * line, etc. The class is abstract: handles() and findBreaks() are pure virtual.</p>
34  *
35  * <p>LanguageBreakEngines should normally be implemented so as to
36  * be shared between threads without locking.</p>
37  */
38 class LanguageBreakEngine : public UMemory {
39  public:
40 
41   /**
42    * <p>Default constructor.</p>
43    * Takes no arguments.
44    */
45   LanguageBreakEngine();
46 
47   /**
48    * <p>Virtual destructor, so that an engine can be destroyed through a LanguageBreakEngine pointer.</p>
49    */
50   virtual ~LanguageBreakEngine();
51 
52  /**
53   * <p>Indicate whether this engine handles a particular character for
54   * a particular kind of break.</p>
55   * Pure virtual and const: each concrete engine supplies its own answer.
56   * @param c A character which begins a run that the engine might handle
57   * @param breakType The type of text break which the caller wants to determine
58   * @return TRUE if this engine handles the particular character and break
59   * type.
60   */
61   virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
62 
63  /**
64   * <p>Find any breaks within a run in the supplied text.</p>
65   * Pure virtual and const: each concrete engine supplies its own implementation.
66   * @param text A UText representing the text. Its
67   * iterator is left at the end of the run of characters which the engine
68   * is capable of handling.
69   * @param startPos The start of the run within the supplied text.
70   * @param endPos The end of the run within the supplied text.
71   * @param breakType The type of break desired, or -1.
72   * @param foundBreaks A UVector32 (vector of int32_t), passed by reference, that receives the breaks.
73   * @return The number of breaks found.
74   */
75   virtual int32_t findBreaks( UText *text,
76                               int32_t startPos,
77                               int32_t endPos,
78                               int32_t breakType,
79                               UVector32 &foundBreaks ) const = 0;
80 
81 };
82 
83 /*******************************************************************
84  * LanguageBreakFactory
85  */
86 
87 /**
88  * <p>A LanguageBreakFactory finds and returns a LanguageBreakEngine
89  * that can determine breaks for characters in a specific set, if
90  * such an object can be found. The class is abstract: getEngineFor() is pure virtual.</p>
91  *
92  * <p>If a LanguageBreakFactory is to be shared between threads,
93  * appropriate synchronization must be used; there is none internal
94  * to the factory.</p>
95  *
96  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
97  * normally be shared between threads without synchronization, unless
98  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
99  *
100  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
101  * it returns when it itself is deleted, unless the specific subclass of
102  * LanguageBreakFactory indicates otherwise. Consequently, the factory must
103  * not be deleted until the LanguageBreakEngines it has returned are no
104  * longer needed.</p>
105  */
106 class LanguageBreakFactory : public UMemory {
107  public:
108 
109   /**
110    * <p>Default constructor.</p>
111    * Takes no arguments.
112    */
113   LanguageBreakFactory();
114 
115   /**
116    * <p>Virtual destructor, so that a factory can be destroyed through a LanguageBreakFactory pointer.</p>
117    */
118   virtual ~LanguageBreakFactory();
119 
120  /**
121   * <p>Find and return a LanguageBreakEngine that can find the desired
122   * kind of break for the set of characters to which the supplied
123   * character belongs. It is up to the set of available engines to
124   * determine what the sets of characters are. Pure virtual and non-const.</p>
125   *
126   * @param c A character that begins a run for which a LanguageBreakEngine is
127   * sought.
128   * @param breakType The kind of text break for which a LanguageBreakEngine is
129   * sought.
130   * @return A pointer to a const LanguageBreakEngine with the desired characteristics, or 0 if there is none.
131   */
132   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
133 
134 };
135 
136 /*******************************************************************
137  * UnhandledEngine
138  */
139 
140 /**
141  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
142  * handles characters that no other LanguageBreakEngine is available to
143  * handle. It is told the character and the type of break; at its
144  * discretion it may handle more than the specified character (e.g.,
145  * the entire script to which that character belongs).</p>
146  *
147  * <p>UnhandledEngines may not be shared between threads without
148  * external synchronization.</p>
149  */
150 
151 class UnhandledEngine : public LanguageBreakEngine {
152  private:
153 
154     /**
155      * The sets of characters handled, for each break type (four slots; presumably indexed by breakType -- TODO confirm).
156      * @internal
157      */
158 
159   UnicodeSet    *fHandled[4];
160 
161  public:
162 
163   /**
164    * <p>Constructor.</p>
165    * @param status An error code passed by non-const reference (presumably set on failure -- TODO confirm).
166    */
167   UnhandledEngine(UErrorCode &status);
168 
169   /**
170    * <p>Virtual destructor.</p>
171    */
172   virtual ~UnhandledEngine();
173 
174  /**
175   * <p>Indicate whether this engine handles a particular character for
176   * a particular kind of break. Implements LanguageBreakEngine::handles().</p>
177   *
178   * @param c A character which begins a run that the engine might handle
179   * @param breakType The type of text break which the caller wants to determine
180   * @return TRUE if this engine handles the particular character and break
181   * type.
182   */
183   virtual UBool handles(UChar32 c, int32_t breakType) const;
184 
185  /**
186   * <p>Find any breaks within a run in the supplied text. Implements LanguageBreakEngine::findBreaks().</p>
187   *
188   * @param text A UText representing the text. Its
189   * iterator is left at the end of the run of characters which the engine
190   * is capable of handling.
191   * @param startPos The start of the run within the supplied text.
192   * @param endPos The end of the run within the supplied text.
193   * @param breakType The type of break desired, or -1.
194   * @param foundBreaks A UVector32, passed by reference, that receives the breaks found, if any
195   * @return The number of breaks found.
196   */
197   virtual int32_t findBreaks( UText *text,
198                               int32_t startPos,
199                               int32_t endPos,
200                               int32_t breakType,
201                               UVector32 &foundBreaks ) const;
202 
203  /**
204   * <p>Tell the engine to handle a particular character and break type.</p>
205   * Non-const, unlike handles() and findBreaks().
206   * @param c A character which the engine should handle
207   * @param breakType The type of text break for which the engine should handle c
208   */
209   virtual void handleCharacter(UChar32 c, int32_t breakType);
210 
211 };
212 
213 /*******************************************************************
214  * ICULanguageBreakFactory
215  */
216 
217 /**
218  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
219  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
220  * data in the ICU data file.</p>
221  */
222 class ICULanguageBreakFactory : public LanguageBreakFactory {
223  private:
224 
225     /**
226      * The stack of break engines created by this factory (held by pointer).
227      * @internal
228      */
229 
230   UStack    *fEngines;
231 
232  public:
233 
234   /**
235    * <p>Standard constructor.</p>
236    * @param status An error code passed by non-const reference (presumably set on failure -- TODO confirm).
237    */
238   ICULanguageBreakFactory(UErrorCode &status);
239 
240   /**
241    * <p>Virtual destructor.</p>
242    */
243   virtual ~ICULanguageBreakFactory();
244 
245  /**
246   * <p>Find and return a LanguageBreakEngine that can find the desired
247   * kind of break for the set of characters to which the supplied
248   * character belongs. It is up to the set of available engines to
249   * determine what the sets of characters are. Implements LanguageBreakFactory::getEngineFor().</p>
250   *
251   * @param c A character that begins a run for which a LanguageBreakEngine is
252   * sought.
253   * @param breakType The kind of text break for which a LanguageBreakEngine is
254   * sought.
255   * @return A pointer to a const LanguageBreakEngine with the desired characteristics, or 0 if there is none.
256   */
257   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
258 
259 protected:
260  /**
261   * <p>Create a LanguageBreakEngine for the set of characters to which
262   * the supplied character belongs, for the specified break type.</p>
263   * Protected and virtual, so a subclass can replace how engines are created.
264   * @param c A character that begins a run for which a LanguageBreakEngine is
265   * sought.
266   * @param breakType The kind of text break for which a LanguageBreakEngine is
267   * sought.
268   * @return A pointer to a const LanguageBreakEngine with the desired characteristics, or 0 if there is none.
269   */
270   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
271 
272   /**
273    * <p>Create a DictionaryMatcher for the specified script and break type. Protected and virtual.</p>
274    * @param script An ISO 15924 script code that identifies the dictionary to be
275    * created.
276    * @param breakType The kind of text break for which a dictionary is
277    * sought.
278    * @return A DictionaryMatcher with the desired characteristics, or NULL.
279    */
280   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
281 };
282 
283 U_NAMESPACE_END
284 
285     /* BRKENG_H */
286 #endif
287