1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2012, International Business Machines * 4 * Corporation and others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 /** 9 * \file 10 * \brief Originally, added as C++ API for Collation data used to compute minLengthInChars 11 * \internal 12 */ 13 14 /* 15 * Note: This module was incldued in ICU 4.0.1 as @internal technology preview for supporting 16 * Boyer-Moore string search API. For now, only SSearchTest depends on this module. I temporaly 17 * moved the module from i18n directory to intltest, because we have no plan to publish this 18 * as public API. (2012-12-18 yoshito) 19 */ 20 21 #ifndef COLL_DATA_H 22 #define COLL_DATA_H 23 24 #include "unicode/utypes.h" 25 26 #if !UCONFIG_NO_COLLATION 27 28 #include "unicode/ucol.h" 29 #include "unicode/unistr.h" 30 31 /** 32 * The size of the internal CE buffer in a <code>CEList</code> object 33 */ 34 #define CELIST_BUFFER_SIZE 4 35 36 /** 37 * \def INSTRUMENT_CELIST 38 * Define this to enable the <code>CEList</code> objects to collect 39 * statistics. 40 */ 41 42 /** 43 * The size of the initial list in a <code>StringList</code> object. 44 */ 45 #define STRING_LIST_BUFFER_SIZE 16 46 47 U_NAMESPACE_USE 48 49 /** 50 * This object holds a list of CEs generated from a particular 51 * <code>UnicodeString</code> 52 * 53 */ 54 class CEList 55 { 56 public: 57 /** 58 * Construct a <code>CEList</code> object. 59 * 60 * @param coll - the Collator used to collect the CEs. 61 * @param string - the string for which to collect the CEs. 62 * @param status - will be set if any errors occur. 63 * 64 * Note: if on return, status is set to an error code, 65 * the only safe thing to do with this object is to call 66 * the destructor. 67 */ 68 CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status); 69 70 /** 71 * The destructor. 72 */ 73 ~CEList(); 74 75 /** 76 * Return the number of CEs in the list. 77 * 78 * @return the number of CEs in the list. 79 */ 80 int32_t size() const; 81 82 /** 83 * Get a particular CE from the list. 84 * 85 * @param index - the index of the CE to return 86 * 87 * @return the CE, or <code>0</code> if <code>index</code> is out of range 88 */ 89 uint32_t get(int32_t index) const; 90 91 /** 92 * Check if the CEs in another <code>CEList</code> match the 93 * suffix of this list starting at a give offset. 94 * 95 * @param offset - the offset of the suffix 96 * @param other - the other <code>CEList</code> 97 * 98 * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise. 99 */ 100 UBool matchesAt(int32_t offset, const CEList *other) const; 101 102 /** 103 * The index operator. 104 * 105 * @param index - the index 106 * 107 * @return a reference to the given CE in the list 108 */ 109 uint32_t &operator[](int32_t index) const; 110 111 private: 112 void add(uint32_t ce, UErrorCode &status); 113 114 uint32_t ceBuffer[CELIST_BUFFER_SIZE]; 115 uint32_t *ces; 116 int32_t listMax; 117 int32_t listSize; 118 }; 119 120 /** 121 * StringList 122 * 123 * This object holds a list of <code>UnicodeString</code> objects. 124 */ 125 class StringList 126 { 127 public: 128 /** 129 * Construct an empty <code>StringList</code> 130 * 131 * @param status - will be set if any errors occur. 132 * 133 * Note: if on return, status is set to an error code, 134 * the only safe thing to do with this object is to call 135 * the destructor. 136 */ 137 StringList(UErrorCode &status); 138 139 /** 140 * The destructor. 141 */ 142 ~StringList(); 143 144 /** 145 * Add a string to the list. 146 * 147 * @param string - the string to add 148 * @param status - will be set if any errors occur. 149 */ 150 void add(const UnicodeString *string, UErrorCode &status); 151 152 /** 153 * Add an array of Unicode code points to the list. 154 * 155 * @param chars - the address of the array of code points 156 * @param count - the number of code points in the array 157 * @param status - will be set if any errors occur. 158 */ 159 void add(const UChar *chars, int32_t count, UErrorCode &status); 160 161 /** 162 * Get a particular string from the list. 163 * 164 * @param index - the index of the string 165 * 166 * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code> 167 * if <code>index</code> is out of bounds. 168 */ 169 const UnicodeString *get(int32_t index) const; 170 171 /** 172 * Get the number of stings in the list. 173 * 174 * @return the number of strings in the list. 175 */ 176 int32_t size() const; 177 178 private: 179 UnicodeString *strings; 180 int32_t listMax; 181 int32_t listSize; 182 }; 183 184 185 /* 186 * Forward references to internal classes. 187 */ 188 class StringToCEsMap; 189 class CEToStringsMap; 190 191 /** 192 * CollData 193 * 194 * This class holds the Collator-specific data needed to 195 * compute the length of the shortest string that can 196 * generate a partcular list of CEs. 197 * 198 * <code>CollData</code> objects are quite expensive to compute. Because 199 * of this, they are cached. When you call <code>CollData::open</code> it 200 * returns a reference counted cached object. When you call <code>CollData::close</code> 201 * the reference count on the object is decremented but the object is not deleted. 202 * 203 * If you do not need to reuse any unreferenced objects in the cache, you can call 204 * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code> 205 * objects, you can call <code>CollData::freeCollDataCache</code> 206 */ 207 class CollData 208 { 209 public: 210 /** 211 * Construct a <code>CollData</code> object. 212 * 213 * @param collator - the collator 214 * @param status - will be set if any errors occur. 215 */ 216 CollData(UCollator *collator, UErrorCode &status); 217 218 /** 219 * The destructor. 220 */ 221 ~CollData(); 222 223 /** 224 * Get the <code>UCollator</code> object used to create this object. 225 * The object returned may not be the exact object that was used to 226 * create this object, but it will have the same behavior. 227 */ 228 UCollator *getCollator() const; 229 230 /** 231 * Get a list of all the strings which generate a list 232 * of CEs starting with a given CE. 233 * 234 * @param ce - the CE 235 * 236 * return a <code>StringList</code> object containing all 237 * the stirngs, or <code>NULL</code> if there are 238 * no such strings. 239 */ 240 const StringList *getStringList(int32_t ce) const; 241 242 /** 243 * Get a list of the CEs generated by a partcular stirng. 244 * 245 * @param string - the string 246 * 247 * @return a <code>CEList</code> object containt the CEs. You 248 * must call <code>freeCEList</code> when you are finished 249 * using the <code>CEList</code>/ 250 */ 251 const CEList *getCEList(const UnicodeString *string) const; 252 253 /** 254 * Release a <code>CEList</code> returned by <code>getCEList</code>. 255 * 256 * @param list - the <code>CEList</code> to free. 257 */ 258 void freeCEList(const CEList *list); 259 260 /** 261 * Return the length of the shortest string that will generate 262 * the given list of CEs. 263 * 264 * @param ces - the CEs 265 * @param offset - the offset of the first CE in the list to use. 266 * 267 * @return the length of the shortest string. 268 */ 269 int32_t minLengthInChars(const CEList *ces, int32_t offset) const; 270 271 272 /** 273 * Return the length of the shortest string that will generate 274 * the given list of CEs. 275 * 276 * Note: the algorithm used to do this computation is recursive. To 277 * limit the amount of recursion, a "history" list is used to record 278 * the best answer starting at a particular offset in the list of CEs. 279 * If the same offset is visited again during the recursion, the answer 280 * in the history list is used. 281 * 282 * @param ces - the CEs 283 * @param offset - the offset of the first CE in the list to use. 284 * @param history - the history list. Must be at least as long as 285 * the number of cEs in the <code>CEList</code> 286 * 287 * @return the length of the shortest string. 288 */ 289 int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const; 290 291 private: 292 UCollator *coll; 293 CEToStringsMap *ceToCharsStartingWith; 294 295 uint32_t minHan; 296 uint32_t maxHan; 297 298 uint32_t jamoLimits[4]; 299 }; 300 301 #endif // #if !UCONFIG_NO_COLLATION 302 #endif // #ifndef COLL_DATA_H 303