1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2014 International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: rbbidata.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * RBBI data formats Includes 16 * 17 * Structs that describes the format of the Binary RBBI data, 18 * as it is stored in ICU's data file. 19 * 20 * RBBIDataWrapper - Instances of this class sit between the 21 * raw data structs and the RulesBasedBreakIterator objects 22 * that are created by applications. The wrapper class 23 * provides reference counting for the underlying data, 24 * and direct pointers to data that would not otherwise 25 * be accessible without ugly pointer arithmetic. The 26 * wrapper does not attempt to provide any higher level 27 * abstractions for the data itself. 28 * 29 * There will be only one instance of RBBIDataWrapper for any 30 * set of RBBI run time data being shared by instances 31 * (clones) of RulesBasedBreakIterator. 32 */ 33 34 #ifndef __RBBIDATA_H__ 35 #define __RBBIDATA_H__ 36 37 #include "unicode/utypes.h" 38 #include "unicode/udata.h" 39 #include "udataswp.h" 40 41 /** 42 * Swap RBBI data. See udataswp.h. 43 * @internal 44 */ 45 U_CAPI int32_t U_EXPORT2 46 ubrk_swap(const UDataSwapper *ds, 47 const void *inData, int32_t length, void *outData, 48 UErrorCode *pErrorCode); 49 50 #ifdef __cplusplus 51 52 #include "unicode/uobject.h" 53 #include "unicode/unistr.h" 54 #include "unicode/uversion.h" 55 #include "umutex.h" 56 #include "utrie2.h" 57 58 U_NAMESPACE_BEGIN 59 60 // The current RBBI data format version. 61 static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {4, 0, 0, 0}; 62 63 /* 64 * The following structs map exactly onto the raw data from ICU common data file. 65 */ 66 struct RBBIDataHeader { 67 uint32_t fMagic; /* == 0xbla0 */ 68 UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */ 69 /* if there is one associated with this data. */ 70 /* (version originates in rbbi, is copied to UDataInfo) */ 71 uint32_t fLength; /* Total length in bytes of this RBBI Data, */ 72 /* including all sections, not just the header. */ 73 uint32_t fCatCount; /* Number of character categories. */ 74 75 /* */ 76 /* Offsets and sizes of each of the subsections within the RBBI data. */ 77 /* All offsets are bytes from the start of the RBBIDataHeader. */ 78 /* All sizes are in bytes. */ 79 /* */ 80 uint32_t fFTable; /* forward state transition table. */ 81 uint32_t fFTableLen; 82 uint32_t fRTable; /* Offset to the reverse state transition table. */ 83 uint32_t fRTableLen; 84 uint32_t fSFTable; /* safe point forward transition table */ 85 uint32_t fSFTableLen; 86 uint32_t fSRTable; /* safe point reverse transition table */ 87 uint32_t fSRTableLen; 88 uint32_t fTrie; /* Offset to Trie data for character categories */ 89 uint32_t fTrieLen; 90 uint32_t fRuleSource; /* Offset to the source for for the break */ 91 uint32_t fRuleSourceLen; /* rules. Stored UChar *. */ 92 uint32_t fStatusTable; /* Offset to the table of rule status values */ 93 uint32_t fStatusTableLen; 94 95 uint32_t fReserved[6]; /* Reserved for expansion */ 96 97 }; 98 99 100 101 struct RBBIStateTableRow { 102 int16_t fAccepting; /* Non-zero if this row is for an accepting state. */ 103 /* Value 0: not an accepting state. */ 104 /* -1: Unconditional Accepting state. */ 105 /* positive: Look-ahead match has completed. */ 106 /* Actual boundary position happened earlier */ 107 /* Value here == fLookAhead in earlier */ 108 /* state, at actual boundary pos. */ 109 int16_t fLookAhead; /* Non-zero if this row is for a state that */ 110 /* corresponds to a '/' in the rule source. */ 111 /* Value is the same as the fAccepting */ 112 /* value for the rule (which will appear */ 113 /* in a different state. */ 114 int16_t fTagIdx; /* Non-zero if this row covers a {tagged} position */ 115 /* from a rule. Value is the index in the */ 116 /* StatusTable of the set of matching */ 117 /* tags (rule status values) */ 118 int16_t fReserved; 119 uint16_t fNextState[2]; /* Next State, indexed by char category. */ 120 /* This array does not have two elements */ 121 /* Array Size is actually fData->fHeader->fCatCount */ 122 /* CAUTION: see RBBITableBuilder::getTableSize() */ 123 /* before changing anything here. */ 124 }; 125 126 127 struct RBBIStateTable { 128 uint32_t fNumStates; /* Number of states. */ 129 uint32_t fRowLen; /* Length of a state table row, in bytes. */ 130 uint32_t fFlags; /* Option Flags for this state table */ 131 uint32_t fReserved; /* reserved */ 132 char fTableData[4]; /* First RBBIStateTableRow begins here. */ 133 /* (making it char[] simplifies ugly address */ 134 /* arithmetic for indexing variable length rows.) */ 135 }; 136 137 typedef enum { 138 RBBI_LOOKAHEAD_HARD_BREAK = 1, 139 RBBI_BOF_REQUIRED = 2 140 } RBBIStateTableFlags; 141 142 143 /* */ 144 /* The reference counting wrapper class */ 145 /* */ 146 class RBBIDataWrapper : public UMemory { 147 public: 148 enum EDontAdopt { 149 kDontAdopt 150 }; 151 RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status); 152 RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); 153 RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); 154 ~RBBIDataWrapper(); 155 156 static UBool isDataVersionAcceptable(const UVersionInfo version); 157 158 void init0(); 159 void init(const RBBIDataHeader *data, UErrorCode &status); 160 RBBIDataWrapper *addReference(); 161 void removeReference(); 162 UBool operator ==(const RBBIDataWrapper &other) const; 163 int32_t hashCode(); 164 const UnicodeString &getRuleSourceString() const; 165 #ifdef RBBI_DEBUG 166 void printData(); 167 void printTable(const char *heading, const RBBIStateTable *table); 168 #else 169 #define printData() 170 #define printTable(heading, table) 171 #endif 172 173 /* */ 174 /* Pointers to items within the data */ 175 /* */ 176 const RBBIDataHeader *fHeader; 177 const RBBIStateTable *fForwardTable; 178 const RBBIStateTable *fReverseTable; 179 const RBBIStateTable *fSafeFwdTable; 180 const RBBIStateTable *fSafeRevTable; 181 const UChar *fRuleSource; 182 const int32_t *fRuleStatusTable; 183 184 /* number of int32_t values in the rule status table. Used to sanity check indexing */ 185 int32_t fStatusMaxIdx; 186 187 UTrie2 *fTrie; 188 189 private: 190 u_atomic_int32_t fRefCount; 191 UDataMemory *fUDataMem; 192 UnicodeString fRuleString; 193 UBool fDontFreeData; 194 195 RBBIDataWrapper(const RBBIDataWrapper &other); /* forbid copying of this class */ 196 RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /* forbid copying of this class */ 197 }; 198 199 200 201 U_NAMESPACE_END 202 203 #endif /* C++ */ 204 205 #endif 206