• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2014 International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  rbbidata.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   RBBI data formats  Includes
16 *
17 *                          Structs that describe the format of the Binary RBBI data,
18 *                          as it is stored in ICU's data file.
19 *
20 *      RBBIDataWrapper  -  Instances of this class sit between the
21 *                          raw data structs and the RulesBasedBreakIterator objects
22 *                          that are created by applications.  The wrapper class
23 *                          provides reference counting for the underlying data,
24 *                          and direct pointers to data that would not otherwise
25 *                          be accessible without ugly pointer arithmetic.  The
26 *                          wrapper does not attempt to provide any higher level
27 *                          abstractions for the data itself.
28 *
29 *                          There will be only one instance of RBBIDataWrapper for any
30 *                          set of RBBI run time data being shared by instances
31 *                          (clones) of RulesBasedBreakIterator.
32 */
33 
34 #ifndef __RBBIDATA_H__
35 #define __RBBIDATA_H__
36 
37 #include "unicode/utypes.h"
38 #include "unicode/udata.h"
39 #include "udataswp.h"
40 
41 /**
42  * Swap the RBBI data at inData into outData using the swapper ds. See udataswp.h for the meaning of length and of the return value (not visible in this header).
43  * @internal
44  */
45 U_CAPI int32_t U_EXPORT2
46 ubrk_swap(const UDataSwapper *ds,
47           const void *inData, int32_t length, void *outData,
48           UErrorCode *pErrorCode);
49 
50 #ifdef __cplusplus
51 
52 #include "unicode/uobject.h"
53 #include "unicode/unistr.h"
54 #include "unicode/uversion.h"
55 #include "umutex.h"
56 #include "utrie2.h"
57 
58 U_NAMESPACE_BEGIN
59 
60 // The current RBBI data format version, as four uint8_t components. The format version of a given piece of data is carried in RBBIDataHeader::fFormatVersion.
61 static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {4, 0, 0, 0};
62 
63 /*
64  *   The following structs map exactly onto the raw data from the ICU common data file.
65  */
66 struct RBBIDataHeader {
67     uint32_t         fMagic;           /*  == 0xb1a0                                               */
68     UVersionInfo     fFormatVersion;   /* Data Format.  Same as the value in struct UDataInfo      */
69                                        /*   if there is one associated with this data.             */
70                                        /*     (version originates in rbbi, is copied to UDataInfo) */
71     uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
72                                        /*      including all sections, not just the header.        */
73     uint32_t         fCatCount;        /*  Number of character categories.                         */
74 
75     /*                                                                        */
76     /*  Offsets and sizes of each of the subsections within the RBBI data.    */
77     /*  All offsets are bytes from the start of the RBBIDataHeader.           */
78     /*  All sizes (the ...Len fields) are in bytes.                           */
79     /*                                                                        */
80     uint32_t         fFTable;         /*  Offset to the forward state transition table. */
81     uint32_t         fFTableLen;
82     uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
83     uint32_t         fRTableLen;
84     uint32_t         fSFTable;        /*  Offset to the safe point forward transition table. */
85     uint32_t         fSFTableLen;
86     uint32_t         fSRTable;        /*  Offset to the safe point reverse transition table. */
87     uint32_t         fSRTableLen;
88     uint32_t         fTrie;           /*  Offset to Trie data for character categories */
89     uint32_t         fTrieLen;
90     uint32_t         fRuleSource;     /*  Offset to the source for the break */
91     uint32_t         fRuleSourceLen;  /*    rules.  Stored as UChar *. */
92     uint32_t         fStatusTable;    /* Offset to the table of rule status values */
93     uint32_t         fStatusTableLen;
94 
95     uint32_t         fReserved[6];    /*  Reserved for expansion */
96 
97 };
98 
99 
100 
101 struct  RBBIStateTableRow {         /*  One row (one state) of an RBBIStateTable.         */
102     int16_t          fAccepting;    /*  Non-zero if this row is for an accepting state.   */
103                                     /*  Value 0: not an accepting state.                  */
104                                     /*       -1: Unconditional Accepting state.           */
105                                     /*    positive:  Look-ahead match has completed.      */
106                                     /*           Actual boundary position happened earlier */
107                                     /*           Value here == fLookAhead in earlier      */
108                                     /*              state, at actual boundary pos.        */
109     int16_t          fLookAhead;    /*  Non-zero if this row is for a state that          */
110                                     /*    corresponds to a '/' in the rule source.        */
111                                     /*    Value is the same as the fAccepting             */
112                                     /*      value for the rule (which will appear         */
113                                     /*      in a different state).                        */
114     int16_t          fTagIdx;       /*  Non-zero if this row covers a {tagged} position   */
115                                     /*     from a rule.  Value is the index in the        */
116                                     /*     StatusTable of the set of matching             */
117                                     /*     tags (rule status values)                      */
118     int16_t          fReserved;     /*  Reserved.                                         */
119     uint16_t         fNextState[2]; /*  Next State, indexed by char category.             */
120                                     /*  This array does not really have two elements.     */
121                                     /*    Array size is actually fData->fHeader->fCatCount */
122                                     /*    CAUTION:  see RBBITableBuilder::getTableSize()  */
123                                     /*              before changing anything here.        */
124 };
125 
126 
127 struct RBBIStateTable {             /*  Fixed header followed by the variable-length rows. */
128     uint32_t         fNumStates;    /*  Number of states.                                 */
129     uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
130     uint32_t         fFlags;        /*  Option Flags for this state table (presumably RBBIStateTableFlags values - TODO confirm) */
131     uint32_t         fReserved;     /*  reserved                                          */
132     char             fTableData[4]; /*  First RBBIStateTableRow begins here.              */
133                                     /*    (making it char[] simplifies ugly address       */
134                                     /*     arithmetic for indexing variable length rows.) */
135 };
136 
137 typedef enum {   /* NOTE(review): presumably the option bits stored in RBBIStateTable::fFlags - confirm against the users of fFlags */
138     RBBI_LOOKAHEAD_HARD_BREAK = 1,
139     RBBI_BOF_REQUIRED = 2
140 } RBBIStateTableFlags;
141 
142 
143 /*                                                                                    */
144 /*   The reference counting wrapper class: holds direct pointers into one set of RBBI */
145 /*   data and a reference count for it. Copy construction and assignment are private. */
146 class RBBIDataWrapper : public UMemory {
147 public:
148     enum EDontAdopt {   /* Tag type that selects the second constructor overload below. */
149         kDontAdopt
150     };
151     RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
152     RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);   /* NOTE(review): presumably leaves ownership of data with the caller (cf. fDontFreeData) - confirm */
153     RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
154     ~RBBIDataWrapper();
155 
156     static UBool          isDataVersionAcceptable(const UVersionInfo version);
157 
158     void                  init0();
159     void                  init(const RBBIDataHeader *data, UErrorCode &status);
160     RBBIDataWrapper      *addReference();
161     void                  removeReference();
162     UBool                 operator ==(const RBBIDataWrapper &other) const;
163     int32_t               hashCode();
164     const UnicodeString  &getRuleSourceString() const;
165 #ifdef RBBI_DEBUG
166     void                  printData();
167     void                  printTable(const char *heading, const RBBIStateTable *table);
168 #else   /* Without RBBI_DEBUG both names become empty macros, so calls expand to nothing. Macros ignore class scope: they apply to all code that follows in the translation unit. */
169     #define printData()
170     #define printTable(heading, table)
171 #endif
172 
173     /*                                                                       */
174     /*   Pointers to items within the data (public, accessed directly)       */
175     /*                                                                       */
176     const RBBIDataHeader     *fHeader;
177     const RBBIStateTable     *fForwardTable;
178     const RBBIStateTable     *fReverseTable;
179     const RBBIStateTable     *fSafeFwdTable;
180     const RBBIStateTable     *fSafeRevTable;
181     const UChar              *fRuleSource;
182     const int32_t            *fRuleStatusTable;
183 
184     /* Number of int32_t values in the rule status table.  Used to sanity check indexing. */
185     int32_t             fStatusMaxIdx;
186 
187     UTrie2             *fTrie;
188 
189 private:
190     u_atomic_int32_t    fRefCount;      /* Reference count; see addReference() / removeReference(). */
191     UDataMemory        *fUDataMem;
192     UnicodeString       fRuleString;    /* NOTE(review): presumably what getRuleSourceString() returns - confirm */
193     UBool               fDontFreeData;
194 
195     RBBIDataWrapper(const RBBIDataWrapper &other); /*  forbid copying of this class (private copy constructor) */
196     RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /*  forbid copying of this class (private assignment) */
197 };
198 
199 
200 
201 U_NAMESPACE_END
202 
203 #endif /* C++ */
204 
205 #endif
206