• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  ******************************************************************************
3  *   Copyright (C) 1996-2012, International Business Machines                 *
4  *   Corporation and others.  All Rights Reserved.                            *
5  ******************************************************************************
6  */
7 
8 /**
9  * \file
10  * \brief Originally added as a C++ API for collation data used to compute minLengthInChars.
11  * \internal
12  */
13 
14 /*
15  * Note: This module was included in ICU 4.0.1 as an @internal technology preview for supporting
16  * the Boyer-Moore string search API. For now, only SSearchTest depends on this module. I temporarily
17  * moved the module from the i18n directory to intltest, because we have no plan to publish this
18  * as public API. (2012-12-18 yoshito)
19  */
20 
21 #ifndef COLL_DATA_H
22 #define COLL_DATA_H
23 
24 #include "unicode/utypes.h"
25 
26 #if !UCONFIG_NO_COLLATION
27 
28 #include "unicode/ucol.h"
29 #include "unicode/unistr.h"
30 
31  /**
32   * The size, in CEs, of the fixed buffer embedded in every <code>CEList</code> object (its <code>ceBuffer</code> member).
33   */
34 #define CELIST_BUFFER_SIZE 4
35 
36 /**
37  * \def INSTRUMENT_CELIST
38  * Define this to enable the <code>CEList</code> objects to collect
39  * statistics. NOTE(review): nothing in this header tests this macro; presumably the implementation does - confirm.
40  */
41 
42  /**
43   * The size of the initial list in a <code>StringList</code> object. NOTE(review): not referenced in this header; presumably used by the implementation - confirm.
44   */
45 #define STRING_LIST_BUFFER_SIZE 16
46 
47 U_NAMESPACE_USE // ICU macro defined outside this file; presumably makes the ICU namespace visible to the unqualified names below - confirm
48 
49  /**
50   * This object holds the list of collation elements (CEs) generated from a
51   * particular <code>UnicodeString</code> by the collator passed to the constructor.
52   *
53   * NOTE(review): a destructor and a raw pointer member are declared but no copy operations; copying is presumably unsafe - confirm. */
54 class CEList
55 {
56 public:
57     /**
58      * Construct a <code>CEList</code> object.
59      *
60      * @param coll - the Collator used to collect the CEs.
61      * @param string - the string for which to collect the CEs.
62      * @param status - will be set if any errors occur.
63      *
64      * Note: if, on return, status is set to an error code,
65      * the only safe thing to do with this object is to call
66      * the destructor.
67      */
68     CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
69 
70     /**
71      * The destructor.
72      */
73     ~CEList();
74 
75     /**
76      * Return the number of CEs in the list.
77      *
78      * @return the number of CEs in the list.
79      */
80     int32_t size() const;
81 
82     /**
83      * Get a particular CE from the list.
84      *
85      * @param index - the index of the CE to return
86      *
87      * @return the CE, or <code>0</code> if <code>index</code> is out of range
88      */
89     uint32_t get(int32_t index) const;
90 
91     /**
92      * Check if the CEs in another <code>CEList</code> match the
93      * suffix of this list starting at a given offset.
94      *
95      * @param offset - the offset of the suffix
96      * @param other - the other <code>CEList</code>
97      *
98      * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
99      */
100     UBool matchesAt(int32_t offset, const CEList *other) const;
101 
102     /**
103      * The index operator. NOTE(review): declared const yet returns a non-const reference,
104      * and no out-of-range behavior is documented (compare <code>get</code>) - confirm in the implementation.
105      * @param index - the index
106      *
107      * @return a reference to the given CE in the list
108      */
109     uint32_t &operator[](int32_t index) const;
110 
111 private:
112     void add(uint32_t ce, UErrorCode &status); // presumably appends one CE, reporting failure through status - confirm
113 
114     uint32_t ceBuffer[CELIST_BUFFER_SIZE]; // fixed-size CE storage embedded in the object
115     uint32_t *ces;                         // presumably the CE array in use (ceBuffer or a larger allocation) - confirm
116     int32_t listMax;                       // presumably the capacity of ces - confirm
117     int32_t listSize;                      // presumably the number of CEs stored, as reported by size() - confirm
118 };
119 
120 /**
121  * StringList
122  *
123  * This object holds a list of <code>UnicodeString</code> objects. NOTE(review): a destructor and a raw
124  * pointer member are declared but no copy operations; copying is presumably unsafe - confirm. */
125 class StringList
126 {
127 public:
128     /**
129      * Construct an empty <code>StringList</code>
130      *
131      * @param status - will be set if any errors occur.
132      *
133      * Note: if, on return, status is set to an error code,
134      * the only safe thing to do with this object is to call
135      * the destructor.
136      */
137     StringList(UErrorCode &status);
138 
139     /**
140      * The destructor.
141      */
142     ~StringList();
143 
144     /**
145      * Add a string to the list.
146      *
147      * @param string - the string to add
148      * @param status - will be set if any errors occur.
149      */
150     void add(const UnicodeString *string, UErrorCode &status);
151 
152     /**
153      * Add an array of <code>UChar</code>s to the list. NOTE(review): previously described as "code points"; presumably UTF-16 code units - confirm.
154      *
155      * @param chars - the address of the array of <code>UChar</code>s
156      * @param count - the number of <code>UChar</code>s in the array
157      * @param status - will be set if any errors occur.
158      */
159     void add(const UChar *chars, int32_t count, UErrorCode &status);
160 
161     /**
162      * Get a particular string from the list.
163      *
164      * @param index - the index of the string
165      *
166      * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
167      *         if <code>index</code> is out of bounds.
168      */
169     const UnicodeString *get(int32_t index) const;
170 
171     /**
172      * Get the number of strings in the list.
173      *
174      * @return the number of strings in the list.
175      */
176     int32_t size() const;
177 
178 private:
179     UnicodeString *strings; // presumably the array holding the stored strings - confirm
180     int32_t listMax;        // presumably the capacity of strings - confirm
181     int32_t listSize;       // presumably the number of strings stored, as reported by size() - confirm
182 };
183 
184 
185 /*
186  * Forward references to internal classes whose definitions are not part of this header.
187  */
188 class StringToCEsMap; // not referenced anywhere else in this header
189 class CEToStringsMap; // type of the CollData::ceToCharsStartingWith member
190 
191 /**
192  * CollData
193  *
194  * This class holds the Collator-specific data needed to
195  * compute the length of the shortest string that can
196  * generate a particular list of CEs.
197  *
198  * NOTE(review): earlier documentation described these objects as expensive to compute and
199  * therefore kept in a reference-counted cache managed through <code>CollData::open</code>,
200  * <code>CollData::close</code>, <code>CollData::flushCollDataCache</code> and
201  * <code>CollData::freeCollDataCache</code>. None of those functions is declared in this header;
202  * the only visible way to create and release a <code>CollData</code> is the public constructor
203  * and destructor below, so the caching description appears to be stale - confirm.
204  *
205  * NOTE(review): a destructor and raw pointer members are declared but no copy operations; copying is presumably unsafe - confirm.
206  */
207 class CollData
208 {
209 public:
210     /**
211      * Construct a <code>CollData</code> object.
212      *
213      * @param collator - the collator
214      * @param status - will be set if any errors occur.
215      */
216     CollData(UCollator *collator, UErrorCode &status);
217 
218     /**
219      * The destructor.
220      */
221     ~CollData();
222 
223     /**
224      * Get the <code>UCollator</code> object used to create this object.
225      * The object returned may not be the exact object that was used to
226      * create this object, but it will have the same behavior.
227      */
228     UCollator *getCollator() const;
229 
230     /**
231      * Get a list of all the strings which generate a list
232      * of CEs starting with a given CE.
233      * NOTE(review): ce is int32_t here while <code>CEList</code> exposes CEs as uint32_t - confirm the intended conversion.
234      * @param ce - the CE
235      *
236      * @return a <code>StringList</code> object containing all
237      *         the strings, or <code>NULL</code> if there are
238      *         no such strings.
239      */
240     const StringList *getStringList(int32_t ce) const;
241 
242     /**
243      * Get a list of the CEs generated by a particular string.
244      *
245      * @param string - the string
246      *
247      * @return a <code>CEList</code> object containing the CEs. You
248      *         must call <code>freeCEList</code> when you are finished
249      *         using the <code>CEList</code>.
250      */
251     const CEList *getCEList(const UnicodeString *string) const;
252 
253     /**
254      * Release a <code>CEList</code> returned by <code>getCEList</code>.
255      *
256      * @param list - the <code>CEList</code> to free.
257      */
258     void freeCEList(const CEList *list);
259 
260     /**
261      * Return the length of the shortest string that will generate
262      * the given list of CEs.
263      *
264      * @param ces - the CEs
265      * @param offset - the offset of the first CE in the list to use.
266      *
267      * @return the length of the shortest string.
268      */
269     int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
270 
271 
272     /**
273      * Return the length of the shortest string that will generate
274      * the given list of CEs.
275      *
276      * Note: the algorithm used to do this computation is recursive. To
277      * limit the amount of recursion, a "history" list is used to record
278      * the best answer starting at a particular offset in the list of CEs.
279      * If the same offset is visited again during the recursion, the answer
280      * in the history list is used.
281      *
282      * @param ces - the CEs
283      * @param offset - the offset of the first CE in the list to use.
284      * @param history - the history list. Must be at least as long as
285      *                 the number of CEs in the <code>CEList</code>
286      *
287      * @return the length of the shortest string.
288      */
289    int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
290 
291 private:
292     UCollator      *coll;                  // presumably the collator handed back by getCollator() - confirm
293     CEToStringsMap *ceToCharsStartingWith; // presumably the CE-to-strings map behind getStringList() - confirm
294 
295     uint32_t minHan; // presumably the lower bound of the CE range treated as Han - confirm
296     uint32_t maxHan; // presumably the upper bound of the CE range treated as Han - confirm
297 
298     uint32_t jamoLimits[4]; // presumably CE boundaries for Hangul Jamo; the meaning of each of the 4 entries is not visible here - confirm
299 };
300 
301 #endif // #if !UCONFIG_NO_COLLATION
302 #endif // #ifndef COLL_DATA_H
303