1 // © 2020 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // uniquecharstr.h 5 // created: 2020sep01 Frank Yung-Fong Tang 6 7 #ifndef __UNIQUECHARSTR_H__ 8 #define __UNIQUECHARSTR_H__ 9 10 #include "charstr.h" 11 #include "uassert.h" 12 #include "uhash.h" 13 14 U_NAMESPACE_BEGIN 15 16 /** 17 * Stores NUL-terminated strings with duplicate elimination. 18 * Checks for unique UTF-16 string pointers and converts to invariant characters. 19 * 20 * Intended to be stack-allocated. Add strings, get a unique number for each, 21 * freeze the object, get a char * pointer for each string, 22 * call orphanCharStrings() to capture the string storage, and let this object go out of scope. 23 */ 24 class UniqueCharStrings { 25 public: UniqueCharStrings(UErrorCode & errorCode)26 UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) { 27 // Note: We hash on string contents but store stable char16_t * pointers. 28 // If the strings are stored in resource bundles which should be built with 29 // duplicate elimination, then we should be able to hash on just the pointer values. 30 uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode); 31 if (U_FAILURE(errorCode)) { return; } 32 strings = new CharString(); 33 if (strings == nullptr) { 34 errorCode = U_MEMORY_ALLOCATION_ERROR; 35 } 36 } ~UniqueCharStrings()37 ~UniqueCharStrings() { 38 uhash_close(&map); 39 delete strings; 40 } 41 42 /** Returns/orphans the CharString that contains all strings. */ orphanCharStrings()43 CharString *orphanCharStrings() { 44 CharString *result = strings; 45 strings = nullptr; 46 return result; 47 } 48 49 /** 50 * Adds a string and returns a unique number for it. 51 * The string's buffer contents must not change, nor move around in memory, 52 * while this UniqueCharStrings is in use. 53 * The string contents must be NUL-terminated exactly at s.length(). 54 * 55 * Best used with read-only-alias UnicodeString objects that point to 56 * stable storage, such as strings returned by resource bundle functions. 57 */ add(const UnicodeString & s,UErrorCode & errorCode)58 int32_t add(const UnicodeString &s, UErrorCode &errorCode) { 59 if (U_FAILURE(errorCode)) { return 0; } 60 if (isFrozen) { 61 errorCode = U_NO_WRITE_PERMISSION; 62 return 0; 63 } 64 // The string points into the resource bundle. 65 const char16_t *p = s.getBuffer(); 66 int32_t oldIndex = uhash_geti(&map, p); 67 if (oldIndex != 0) { // found duplicate 68 return oldIndex; 69 } 70 // Explicit NUL terminator for the previous string. 71 // The strings object is also terminated with one implicit NUL. 72 strings->append(0, errorCode); 73 int32_t newIndex = strings->length(); 74 strings->appendInvariantChars(s, errorCode); 75 uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode); 76 return newIndex; 77 } 78 freeze()79 void freeze() { isFrozen = true; } 80 81 /** 82 * Returns a string pointer for its unique number, if this object is frozen. 83 * Otherwise nullptr. 84 */ get(int32_t i)85 const char *get(int32_t i) const { 86 U_ASSERT(isFrozen); 87 return isFrozen && i > 0 ? strings->data() + i : nullptr; 88 } 89 90 private: 91 UHashtable map; 92 CharString *strings; 93 bool isFrozen = false; 94 }; 95 96 U_NAMESPACE_END 97 98 #endif // __UNIQUECHARSTR_H__ 99