1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: unisetspan.h
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2007mar01
14 * created by: Markus W. Scherer
15 */
16
17 #ifndef __UNISETSPAN_H__
18 #define __UNISETSPAN_H__
19
20 #include "unicode/utypes.h"
21 #include "unicode/uniset.h"
22
23 U_NAMESPACE_BEGIN
24
25 /*
26 * Implement span() etc. for a set with strings.
27 * Avoid recursion because of its exponential complexity.
28 * Instead, try multiple paths at once and track them with an IndexList.
29 */
30 class UnicodeSetStringSpan : public UMemory {
31 public:
32 /*
33 * Which span() variant will be used?
34 * The object is either built for one variant and used once,
35 * or built for all and may be used many times.
36 */
37 enum {
38 FWD = 0x20,
39 BACK = 0x10,
40 UTF16 = 8,
41 UTF8 = 4,
42 CONTAINED = 2,
43 NOT_CONTAINED = 1,
44
45 ALL = 0x3f,
46
47 FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED,
48 FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED,
49 FWD_UTF8_CONTAINED = FWD | UTF8 | CONTAINED,
50 FWD_UTF8_NOT_CONTAINED = FWD | UTF8 | NOT_CONTAINED,
51 BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED,
52 BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
53 BACK_UTF8_CONTAINED = BACK | UTF8 | CONTAINED,
54 BACK_UTF8_NOT_CONTAINED = BACK | UTF8 | NOT_CONTAINED
55 };
56
57 UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
58
59 // Copy constructor. Assumes which==ALL for a frozen set.
60 UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
61
62 ~UnicodeSetStringSpan();
63
64 /*
65 * Do the strings need to be checked in span() etc.?
66 * @return TRUE if strings need to be checked (call span() here),
67 * FALSE if not (use a BMPSet for best performance).
68 */
69 inline UBool needsStringSpanUTF16();
70 inline UBool needsStringSpanUTF8();
71
72 // For fast UnicodeSet::contains(c).
73 inline UBool contains(UChar32 c) const;
74
75 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
76
77 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
78
79 int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
80
81 int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
82
83 private:
84 // Special spanLength byte values.
85 enum {
86 // The spanLength is >=0xfe.
87 LONG_SPAN=0xfe,
88 // All code points in the string are contained in the parent set.
89 ALL_CP_CONTAINED=0xff
90 };
91
92 // Add a starting or ending string character to the spanNotSet
93 // so that a character span ends before any string.
94 void addToSpanNotSet(UChar32 c);
95
96 int32_t spanNot(const UChar *s, int32_t length) const;
97 int32_t spanNotBack(const UChar *s, int32_t length) const;
98 int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
99 int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
100
101 // Set for span(). Same as parent but without strings.
102 UnicodeSet spanSet;
103
104 // Set for span(not contained).
105 // Same as spanSet, plus characters that start or end strings.
106 UnicodeSet *pSpanNotSet;
107
108 // The strings of the parent set.
109 const UVector &strings;
110
111 // Pointer to the UTF-8 string lengths.
112 // Also pointer to further allocated storage for meta data and
113 // UTF-8 string contents as necessary.
114 int32_t *utf8Lengths;
115
116 // Pointer to the part of the (utf8Lengths) memory block that stores
117 // the lengths of span(), spanBack() etc. for each string.
118 uint8_t *spanLengths;
119
120 // Pointer to the part of the (utf8Lengths) memory block that stores
121 // the UTF-8 versions of the parent set's strings.
122 uint8_t *utf8;
123
124 // Number of bytes for all UTF-8 versions of strings together.
125 int32_t utf8Length;
126
127 // Maximum lengths of relevant strings.
128 int32_t maxLength16;
129 int32_t maxLength8;
130
131 // Set up for all variants of span()?
132 UBool all;
133
134 // Memory for small numbers and lengths of strings.
135 // For example, for 8 strings:
136 // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
137 // = 112 bytes = int32_t[28].
138 int32_t staticLengths[32];
139 };
140
needsStringSpanUTF16()141 UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
142 return (UBool)(maxLength16!=0);
143 }
144
needsStringSpanUTF8()145 UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
146 return (UBool)(maxLength8!=0);
147 }
148
contains(UChar32 c)149 UBool UnicodeSetStringSpan::contains(UChar32 c) const {
150 return spanSet.contains(c);
151 }
152
153 U_NAMESPACE_END
154
155 #endif
156