• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 //  rbbisetb.h
3 /*
4 **********************************************************************
5 *   Copyright (c) 2001-2005, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 
10 #ifndef RBBISETB_H
11 #define RBBISETB_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "rbbirb.h"
16 #include "uvector.h"
17 
18 struct  UNewTrie;
19 
20 U_NAMESPACE_BEGIN
21 
22 //
23 //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
24 //                   from the Unicode Sets appearing in the source  RBBI rules, and
25 //                   creates the TRIE table used to map from Unicode to the
26 //                   character categories.
27 //
28 
29 
30 //
31 //  RangeDescriptor
32 //
33 //     Each of the non-overlapping character ranges gets one of these descriptors.
34 //     All of them are strung together in a linked list, which is kept in order
35 //     (by character)
36 //
37 class RangeDescriptor : public UMemory {
38 public:
39     UChar32            fStartChar;      // Start of range, unicode 32 bit value.
40     UChar32            fEndChar;        // End of range, unicode 32 bit value.
41     int32_t            fNum;            // runtime-mapped input value for this range.
42     UVector           *fIncludesSets;   // vector of the original
43                                         //   Unicode sets that include this range.
44                                         //    (Contains ptrs to uset nodes)
45     RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list.
46 
47     RangeDescriptor(UErrorCode &status);
48     RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
49     ~RangeDescriptor();
50     void split(UChar32 where, UErrorCode &status);   // Split this range in two at "where", with
51                                         //   where appearing in the second (higher) part.
52     void setDictionaryFlag();           // Check whether this range appears as part of
53                                         //   the Unicode set named "dictionary"
54 
55 private:
56     RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
57     RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
58 };
59 
60 
61 //
62 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
63 //
64 //      Starting with the rules parse tree from the scanner,
65 //
66 //                   -  Enumerate the set of UnicodeSets that are referenced
67 //                      by the RBBI rules.
68 //                   -  compute a derived set of non-overlapping UnicodeSets
69 //                      that will correspond to columns in the state table for
70 //                      the RBBI execution engine.
71 //                   -  construct the trie table that maps input characters
72 //                      to set numbers in the non-overlapping set of sets.
73 //
74 
75 
76 class RBBISetBuilder : public UMemory {
77 public:
78     RBBISetBuilder(RBBIRuleBuilder *rb);
79     ~RBBISetBuilder();
80 
81     void     build();                        // NOTE(review): presumably performs the set processing
82     void     addValToSets(UVector *sets,      uint32_t val);
83     void     addValToSet (RBBINode *usetNode, uint32_t val);
84     int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
85                                              //    runtime state machine, which are the same as
86                                              //    columns in the DFA state table
87     int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
88     void     serializeTrie(uint8_t *where);  // write out the serialized Trie.
89     UChar32  getFirstChar(int32_t  val) const;
90     UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
91                                              //   character were encountered.
92 #ifdef RBBI_DEBUG
93     void     printSets();
94     void     printRanges();
95     void     printRangeGroups();
96 #else
97     #define printSets()
98     #define printRanges()
99     #define printRangeGroups()
100 #endif
101 
102 private:
103     void           numberSets();
104 
105     RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
106     UErrorCode            *fStatus;         // NOTE(review): presumably the rule builder's error status - confirm.
107 
108     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
109 
110     UNewTrie              *fTrie;           // The mapping TRIE that is the end result of processing
111     uint32_t              fTrieSize;        //  the Unicode Sets.
112 
113     // Groups correspond to character categories -
114     //       groups of ranges that are in the same original UnicodeSets.
115     //       fGroupCount is the index of the last used group.
116     //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
117     //       State table column 0 is not used.  Column 1 is for end-of-input.
118     //       Column 2 is for group 0.  Funny counting.
119     int32_t               fGroupCount;
120 
121     UBool                 fSawBOF;          // NOTE(review): presumably the value reported by sawBOF() - confirm.
122 
123     RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
124     RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
125 };
126 
127 
128 
129 U_NAMESPACE_END
130 #endif
131