// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  ************************************************************************************
5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6  * All Rights Reserved.                                                             *
7  ************************************************************************************
8  */
9 
10 #ifndef BRKENG_H
11 #define BRKENG_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
17 
18 U_NAMESPACE_BEGIN
19 
20 class UnicodeSet;
21 class UStack;
22 class UVector32;
23 class DictionaryMatcher;
24 
25 /*******************************************************************
26  * LanguageBreakEngine
27  */
28 
29 /**
30  * <p>LanguageBreakEngines implement language-specific knowledge for
31  * finding text boundaries within a run of characters belonging to a
32  * specific set. The boundaries will be of a specific kind, e.g. word,
33  * line, etc.</p>
34  *
35  * <p>LanguageBreakEngines should normally be implemented so as to
36  * be shared between threads without locking.</p>
37  */
38 class LanguageBreakEngine : public UMemory {
39  public:
40 
41   /**
42    * <p>Default constructor.</p>
43    *
44    */
45   LanguageBreakEngine();
46 
47   /**
48    * <p>Virtual destructor.</p>
49    */
50   virtual ~LanguageBreakEngine();
51 
52  /**
53   * <p>Indicate whether this engine handles a particular character for
54   * a particular kind of break.</p>
55   *
56   * @param c A character which begins a run that the engine might handle
57   * @return true if this engine handles the particular character and break
58   * type.
59   */
60   virtual UBool handles(UChar32 c) const = 0;
61 
62  /**
63   * <p>Find any breaks within a run in the supplied text.</p>
64   *
65   * @param text A UText representing the text. The
66   * iterator is left at the end of the run of characters which the engine
67   * is capable of handling.
68   * @param startPos The start of the run within the supplied text.
69   * @param endPos The end of the run within the supplied text.
70   * @param foundBreaks A Vector of int32_t to receive the breaks.
71   * @param status Information on any errors encountered.
72   * @return The number of breaks found.
73   */
74   virtual int32_t findBreaks( UText *text,
75                               int32_t startPos,
76                               int32_t endPos,
77                               UVector32 &foundBreaks,
78                               UBool isPhraseBreaking,
79                               UErrorCode &status) const = 0;
80 
81 };
82 
83 /*******************************************************************
84  * LanguageBreakFactory
85  */
86 
87 /**
88  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
89  * that can determine breaks for characters in a specific set, if
90  * such an object can be found.</p>
91  *
92  * <p>If a LanguageBreakFactory is to be shared between threads,
93  * appropriate synchronization must be used; there is none internal
94  * to the factory.</p>
95  *
96  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
97  * normally be shared between threads without synchronization, unless
98  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
99  *
100  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
101  * it returns when it itself is deleted, unless the specific subclass of
102  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
103  * not be deleted until the LanguageBreakEngines it has returned are no
104  * longer needed.</p>
105  */
106 class LanguageBreakFactory : public UMemory {
107  public:
108 
109   /**
110    * <p>Default constructor.</p>
111    *
112    */
113   LanguageBreakFactory();
114 
115   /**
116    * <p>Virtual destructor.</p>
117    */
118   virtual ~LanguageBreakFactory();
119 
120  /**
121   * <p>Find and return a LanguageBreakEngine that can find the desired
122   * kind of break for the set of characters to which the supplied
123   * character belongs. It is up to the set of available engines to
124   * determine what the sets of characters are.</p>
125   *
126   * @param c A character that begins a run for which a LanguageBreakEngine is
127   * sought.
128   * @return A LanguageBreakEngine with the desired characteristics, or 0.
129   */
130   virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
131 
132 };
133 
134 /*******************************************************************
135  * UnhandledEngine
136  */
137 
138 /**
139  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
140  * handles characters that no other LanguageBreakEngine is available to
141  * handle. It is told the character and the type of break; at its
142  * discretion it may handle more than the specified character (e.g.,
143  * the entire script to which that character belongs.</p>
144  *
145  * <p>UnhandledEngines may not be shared between threads without
146  * external synchronization.</p>
147  */
148 
149 class UnhandledEngine : public LanguageBreakEngine {
150  private:
151 
152     /**
153      * The sets of characters handled.
154      * @internal
155      */
156 
157   UnicodeSet    *fHandled;
158 
159  public:
160 
161   /**
162    * <p>Default constructor.</p>
163    *
164    */
165   UnhandledEngine(UErrorCode &status);
166 
167   /**
168    * <p>Virtual destructor.</p>
169    */
170   virtual ~UnhandledEngine();
171 
172  /**
173   * <p>Indicate whether this engine handles a particular character for
174   * a particular kind of break.</p>
175   *
176   * @param c A character which begins a run that the engine might handle
177   * @return true if this engine handles the particular character and break
178   * type.
179   */
180   virtual UBool handles(UChar32 c) const override;
181 
182  /**
183   * <p>Find any breaks within a run in the supplied text.</p>
184   *
185   * @param text A UText representing the text (TODO: UText). The
186   * iterator is left at the end of the run of characters which the engine
187   * is capable of handling.
188   * @param startPos The start of the run within the supplied text.
189   * @param endPos The end of the run within the supplied text.
190   * @param foundBreaks An allocated C array of the breaks found, if any
191   * @param status Information on any errors encountered.
192   * @return The number of breaks found.
193   */
194   virtual int32_t findBreaks( UText *text,
195                               int32_t startPos,
196                               int32_t endPos,
197                               UVector32 &foundBreaks,
198                               UBool isPhraseBreaking,
199                               UErrorCode &status) const override;
200 
201  /**
202   * <p>Tell the engine to handle a particular character and break type.</p>
203   *
204   * @param c A character which the engine should handle
205   */
206   virtual void handleCharacter(UChar32 c);
207 
208 };
209 
210 /*******************************************************************
211  * ICULanguageBreakFactory
212  */
213 
214 /**
215  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
216  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
217  * data in the ICU data file.</p>
218  */
219 class ICULanguageBreakFactory : public LanguageBreakFactory {
220  private:
221 
222     /**
223      * The stack of break engines created by this factory
224      * @internal
225      */
226 
227   UStack    *fEngines;
228 
229  public:
230 
231   /**
232    * <p>Standard constructor.</p>
233    *
234    */
235   ICULanguageBreakFactory(UErrorCode &status);
236 
237   /**
238    * <p>Virtual destructor.</p>
239    */
240   virtual ~ICULanguageBreakFactory();
241 
242  /**
243   * <p>Find and return a LanguageBreakEngine that can find the desired
244   * kind of break for the set of characters to which the supplied
245   * character belongs. It is up to the set of available engines to
246   * determine what the sets of characters are.</p>
247   *
248   * @param c A character that begins a run for which a LanguageBreakEngine is
249   * sought.
250   * @return A LanguageBreakEngine with the desired characteristics, or 0.
251   */
252   virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
253 
254 protected:
255  /**
256   * <p>Create a LanguageBreakEngine for the set of characters to which
257   * the supplied character belongs, for the specified break type.</p>
258   *
259   * @param c A character that begins a run for which a LanguageBreakEngine is
260   * sought.
261   * @return A LanguageBreakEngine with the desired characteristics, or 0.
262   */
263   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
264 
265   /**
266    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
267    * @param script An ISO 15924 script code that identifies the dictionary to be
268    * created.
269    * @return A DictionaryMatcher with the desired characteristics, or nullptr.
270    */
271   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
272 };
273 
274 U_NAMESPACE_END
275 
276     /* BRKENG_H */
277 #endif
278