• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  ************************************************************************************
5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6  * All Rights Reserved.                                                             *
7  ************************************************************************************
8  */
9 
10 #ifndef BRKENG_H
11 #define BRKENG_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
17 
18 U_NAMESPACE_BEGIN
19 
20 class UnicodeSet;
21 class UStack;
22 class UVector32;
23 class DictionaryMatcher;
24 
25 /*******************************************************************
26  * LanguageBreakEngine
27  */
28 
29 /**
30  * <p>LanguageBreakEngines implement language-specific knowledge for
31  * finding text boundaries within a run of characters belonging to a
32  * specific set. The boundaries will be of a specific kind, e.g. word,
33  * line, etc.</p>
34  *
35  * <p>LanguageBreakEngines should normally be implemented so as to
36  * be shared between threads without locking.</p>
37  */
38 class LanguageBreakEngine : public UMemory {
39  public:
40 
41   /**
42    * <p>Default constructor.</p>
43    *
44    */
45   LanguageBreakEngine();
46 
47   /**
48    * <p>Virtual destructor.</p>
49    */
50   virtual ~LanguageBreakEngine();
51 
52  /**
53   * <p>Indicate whether this engine handles a particular character.
54   * Pure virtual: each concrete engine supplies its own test.</p>
55   *
56   * @param c A character which begins a run that the engine might handle
57   * @return true if this engine handles the particular character. The
58   * character is the only input; no break type is passed.
59   */
60   virtual UBool handles(UChar32 c) const = 0;
61 
62  /**
63   * <p>Find any breaks within a run in the supplied text.</p>
64   *
65   * @param text A UText representing the text. The
66   * iterator is left at the end of the run of characters which the engine
67   * is capable of handling.
68   * @param startPos The start of the run within the supplied text.
69   * @param endPos The end of the run within the supplied text.
70   * @param foundBreaks A UVector32 (vector of int32_t) to receive the breaks.
71   * @param status Information on any errors encountered.
72   * @return The number of breaks found.
73   */
74   virtual int32_t findBreaks( UText *text,
75                               int32_t startPos,
76                               int32_t endPos,
77                               UVector32 &foundBreaks,
78                               UErrorCode &status) const = 0;
79 
80 };
81 
82 /*******************************************************************
83  * LanguageBreakFactory
84  */
85 
86 /**
87  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
88  * that can determine breaks for characters in a specific set, if
89  * such an object can be found.</p>
90  *
91  * <p>If a LanguageBreakFactory is to be shared between threads,
92  * appropriate synchronization must be used; there is none internal
93  * to the factory.</p>
94  *
95  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
96  * normally be shared between threads without synchronization, unless
97  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
98  *
99  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
100  * it returns when it itself is deleted, unless the specific subclass of
101  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
102  * not be deleted until the LanguageBreakEngines it has returned are no
103  * longer needed.</p>
104  */
105 class LanguageBreakFactory : public UMemory {
106  public:
107 
108   /**
109    * <p>Default constructor.</p>
110    *
111    */
112   LanguageBreakFactory();
113 
114   /**
115    * <p>Virtual destructor.</p>
116    */
117   virtual ~LanguageBreakFactory();
118 
119  /**
120   * <p>Find and return a LanguageBreakEngine that can find breaks
121   * for the set of characters to which the supplied
122   * character belongs. It is up to the set of available engines to
123   * determine what the sets of characters are.</p>
124   *
125   * @param c A character that begins a run for which a LanguageBreakEngine is
126   * sought.
127   * @return A LanguageBreakEngine for that character, or 0 if none can be found.
128   */
129   virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
130 
131 };
132 
133 /*******************************************************************
134  * UnhandledEngine
135  */
136 
137 /**
138  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
139  * handles characters that no other LanguageBreakEngine is available to
140  * handle. It is told which character to handle; at its
141  * discretion it may handle more than the specified character (e.g.,
142  * the entire script to which that character belongs).</p>
143  *
144  * <p>UnhandledEngines may not be shared between threads without
145  * external synchronization.</p>
146  */
147 
148 class UnhandledEngine : public LanguageBreakEngine {
149  private:
150 
151     /**
152      * The set of characters handled (pointer to a UnicodeSet).
153      * @internal
154      */
155 
156   UnicodeSet    *fHandled;
157 
158  public:
159 
160   /**
161    * <p>Constructor.</p>
162    * @param status Information on any errors encountered.
163    */
164   UnhandledEngine(UErrorCode &status);
165 
166   /**
167    * <p>Virtual destructor.</p>
168    */
169   virtual ~UnhandledEngine();
170 
171  /**
172   * <p>Indicate whether this engine handles a particular character.
173   * Overrides LanguageBreakEngine::handles.</p>
174   *
175   * @param c A character which begins a run that the engine might handle
176   * @return true if this engine handles the particular character. The
177   * character is the only input; no break type is passed.
178   */
179   virtual UBool handles(UChar32 c) const override;
180 
181  /**
182   * <p>Find any breaks within a run in the supplied text.</p>
183   *
184   * @param text A UText representing the text. The
185   * iterator is left at the end of the run of characters which the engine
186   * is capable of handling.
187   * @param startPos The start of the run within the supplied text.
188   * @param endPos The end of the run within the supplied text.
189   * @param foundBreaks A UVector32 (vector of int32_t) to receive the breaks.
190   * @param status Information on any errors encountered.
191   * @return The number of breaks found.
192   */
193   virtual int32_t findBreaks( UText *text,
194                               int32_t startPos,
195                               int32_t endPos,
196                               UVector32 &foundBreaks,
197                               UErrorCode &status) const override;
198 
199  /**
200   * <p>Tell the engine to handle a particular character.</p>
201   *
202   * @param c A character which the engine should handle
203   */
204   virtual void handleCharacter(UChar32 c);
205 
206 };
207 
208 /*******************************************************************
209  * ICULanguageBreakFactory
210  */
211 
212 /**
213  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
214  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
215  * data in the ICU data file.</p>
216  */
217 class ICULanguageBreakFactory : public LanguageBreakFactory {
218  private:
219 
220     /**
221      * The stack of break engines created by this factory (pointer to a UStack)
222      * @internal
223      */
224 
225   UStack    *fEngines;
226 
227  public:
228 
229   /**
230    * <p>Standard constructor.</p>
231    * @param status Information on any errors encountered.
232    */
233   ICULanguageBreakFactory(UErrorCode &status);
234 
235   /**
236    * <p>Virtual destructor.</p>
237    */
238   virtual ~ICULanguageBreakFactory();
239 
240  /**
241   * <p>Find and return a LanguageBreakEngine that can find breaks
242   * for the set of characters to which the supplied
243   * character belongs. It is up to the set of available engines to
244   * determine what the sets of characters are.</p>
245   *
246   * @param c A character that begins a run for which a LanguageBreakEngine is
247   * sought.
248   * @return A LanguageBreakEngine with the desired characteristics, or 0.
249   */
250   virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
251 
252 protected:
253  /**
254   * <p>Create a LanguageBreakEngine for the set of characters to which
255   * the supplied character belongs.</p>
256   *
257   * @param c A character that begins a run for which a LanguageBreakEngine is
258   * sought.
259   * @return A LanguageBreakEngine with the desired characteristics, or 0.
260   */
261   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
262 
263   /**
264    * <p>Create a DictionaryMatcher for the specified script.</p>
265    * @param script An ISO 15924 script code that identifies the dictionary to be
266    * created.
267    * @return A DictionaryMatcher with the desired characteristics, or NULL.
268    */
269   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
270 };
271 
272 U_NAMESPACE_END
273 
274     /* BRKENG_H */
275 #endif
276