1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1996-2012, International Business Machines * 6 * Corporation and others. All Rights Reserved. * 7 ****************************************************************************** 8 */ 9 10 /** 11 * \file 12 * \brief Originally, added as C++ API for Collation data used to compute minLengthInChars 13 * \internal 14 */ 15 16 /* 17 * Note: This module was included in ICU 4.0.1 as @internal technology preview for supporting 18 * Boyer-Moore string search API. For now, only SSearchTest depends on this module. 19 * I temporarily moved the module from i18n directory to intltest, because we have no plan to 20 * publish this as public API. (2012-12-18 yoshito) 21 */ 22 23 #ifndef COLL_DATA_H 24 #define COLL_DATA_H 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_COLLATION 29 30 #include "unicode/ucol.h" 31 #include "unicode/unistr.h" 32 33 /** 34 * The size of the internal CE buffer in a <code>CEList</code> object 35 */ 36 #define CELIST_BUFFER_SIZE 4 37 38 /** 39 * \def INSTRUMENT_CELIST 40 * Define this to enable the <code>CEList</code> objects to collect 41 * statistics. 42 */ 43 44 /** 45 * The size of the initial list in a <code>StringList</code> object. 46 */ 47 #define STRING_LIST_BUFFER_SIZE 16 48 49 U_NAMESPACE_USE 50 51 /** 52 * This object holds a list of CEs generated from a particular 53 * <code>UnicodeString</code> 54 * 55 */ 56 class CEList 57 { 58 public: 59 /** 60 * Construct a <code>CEList</code> object. 61 * 62 * @param coll - the Collator used to collect the CEs. 63 * @param string - the string for which to collect the CEs. 64 * @param status - will be set if any errors occur. 65 * 66 * Note: if on return, status is set to an error code, 67 * the only safe thing to do with this object is to call 68 * the destructor. 69 */ 70 CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status); 71 72 /** 73 * The destructor. 74 */ 75 ~CEList(); 76 77 /** 78 * Return the number of CEs in the list. 79 * 80 * @return the number of CEs in the list. 81 */ 82 int32_t size() const; 83 84 /** 85 * Get a particular CE from the list. 86 * 87 * @param index - the index of the CE to return 88 * 89 * @return the CE, or <code>0</code> if <code>index</code> is out of range 90 */ 91 uint32_t get(int32_t index) const; 92 93 /** 94 * Check if the CEs in another <code>CEList</code> match the 95 * suffix of this list starting at a give offset. 96 * 97 * @param offset - the offset of the suffix 98 * @param other - the other <code>CEList</code> 99 * 100 * @return <code>true</code> if the CEs match, <code>false</code> otherwise. 101 */ 102 UBool matchesAt(int32_t offset, const CEList *other) const; 103 104 /** 105 * The index operator. 106 * 107 * @param index - the index 108 * 109 * @return a reference to the given CE in the list 110 */ 111 uint32_t &operator[](int32_t index) const; 112 113 private: 114 void add(uint32_t ce, UErrorCode &status); 115 116 uint32_t ceBuffer[CELIST_BUFFER_SIZE]; 117 uint32_t *ces; 118 int32_t listMax; 119 int32_t listSize; 120 }; 121 122 /** 123 * StringList 124 * 125 * This object holds a list of <code>UnicodeString</code> objects. 126 */ 127 class StringList 128 { 129 public: 130 /** 131 * Construct an empty <code>StringList</code> 132 * 133 * @param status - will be set if any errors occur. 134 * 135 * Note: if on return, status is set to an error code, 136 * the only safe thing to do with this object is to call 137 * the destructor. 138 */ 139 StringList(UErrorCode &status); 140 141 /** 142 * The destructor. 143 */ 144 ~StringList(); 145 146 /** 147 * Add a string to the list. 148 * 149 * @param string - the string to add 150 * @param status - will be set if any errors occur. 151 */ 152 void add(const UnicodeString *string, UErrorCode &status); 153 154 /** 155 * Add an array of Unicode code points to the list. 156 * 157 * @param chars - the address of the array of code points 158 * @param count - the number of code points in the array 159 * @param status - will be set if any errors occur. 160 */ 161 void add(const UChar *chars, int32_t count, UErrorCode &status); 162 163 /** 164 * Get a particular string from the list. 165 * 166 * @param index - the index of the string 167 * 168 * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code> 169 * if <code>index</code> is out of bounds. 170 */ 171 const UnicodeString *get(int32_t index) const; 172 173 /** 174 * Get the number of strings in the list. 175 * 176 * @return the number of strings in the list. 177 */ 178 int32_t size() const; 179 180 private: 181 UnicodeString *strings; 182 int32_t listMax; 183 int32_t listSize; 184 }; 185 186 187 /* 188 * Forward references to internal classes. 189 */ 190 class StringToCEsMap; 191 class CEToStringsMap; 192 193 /** 194 * CollData 195 * 196 * This class holds the Collator-specific data needed to 197 * compute the length of the shortest string that can 198 * generate a particular list of CEs. 199 * 200 * <code>CollData</code> objects are quite expensive to compute. Because 201 * of this, they are cached. When you call <code>CollData::open</code> it 202 * returns a reference counted cached object. When you call <code>CollData::close</code> 203 * the reference count on the object is decremented but the object is not deleted. 204 * 205 * If you do not need to reuse any unreferenced objects in the cache, you can call 206 * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code> 207 * objects, you can call <code>CollData::freeCollDataCache</code> 208 */ 209 class CollData 210 { 211 public: 212 /** 213 * Construct a <code>CollData</code> object. 214 * 215 * @param collator - the collator 216 * @param status - will be set if any errors occur. 217 */ 218 CollData(UCollator *collator, UErrorCode &status); 219 220 /** 221 * The destructor. 222 */ 223 ~CollData(); 224 225 /** 226 * Get the <code>UCollator</code> object used to create this object. 227 * The object returned may not be the exact object that was used to 228 * create this object, but it will have the same behavior. 229 */ 230 UCollator *getCollator() const; 231 232 /** 233 * Get a list of all the strings which generate a list 234 * of CEs starting with a given CE. 235 * 236 * @param ce - the CE 237 * 238 * return a <code>StringList</code> object containing all 239 * the strings, or <code>NULL</code> if there are 240 * no such strings. 241 */ 242 const StringList *getStringList(int32_t ce) const; 243 244 /** 245 * Get a list of the CEs generated by a particular string. 246 * 247 * @param string - the string 248 * 249 * @return a <code>CEList</code> object containing the CEs. You 250 * must call <code>freeCEList</code> when you are finished 251 * using the <code>CEList</code>/ 252 */ 253 const CEList *getCEList(const UnicodeString *string) const; 254 255 /** 256 * Release a <code>CEList</code> returned by <code>getCEList</code>. 257 * 258 * @param list - the <code>CEList</code> to free. 259 */ 260 void freeCEList(const CEList *list); 261 262 /** 263 * Return the length of the shortest string that will generate 264 * the given list of CEs. 265 * 266 * @param ces - the CEs 267 * @param offset - the offset of the first CE in the list to use. 268 * 269 * @return the length of the shortest string. 270 */ 271 int32_t minLengthInChars(const CEList *ces, int32_t offset) const; 272 273 274 /** 275 * Return the length of the shortest string that will generate 276 * the given list of CEs. 277 * 278 * Note: the algorithm used to do this computation is recursive. To 279 * limit the amount of recursion, a "history" list is used to record 280 * the best answer starting at a particular offset in the list of CEs. 281 * If the same offset is visited again during the recursion, the answer 282 * in the history list is used. 283 * 284 * @param ces - the CEs 285 * @param offset - the offset of the first CE in the list to use. 286 * @param history - the history list. Must be at least as long as 287 * the number of cEs in the <code>CEList</code> 288 * 289 * @return the length of the shortest string. 290 */ 291 int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const; 292 293 private: 294 UCollator *coll; 295 CEToStringsMap *ceToCharsStartingWith; 296 297 uint32_t minHan; 298 uint32_t maxHan; 299 300 uint32_t jamoLimits[4]; 301 }; 302 303 #endif // #if !UCONFIG_NO_COLLATION 304 #endif // #ifndef COLL_DATA_H 305