1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: filterednormalizer2.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009dec10
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/unorm.h"
25 #include "cpputils.h"
26
27 U_NAMESPACE_BEGIN
28
~FilteredNormalizer2()29 FilteredNormalizer2::~FilteredNormalizer2() {}
30
31 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const32 FilteredNormalizer2::normalize(const UnicodeString &src,
33 UnicodeString &dest,
34 UErrorCode &errorCode) const {
35 uprv_checkCanGetBuffer(src, errorCode);
36 if(U_FAILURE(errorCode)) {
37 dest.setToBogus();
38 return dest;
39 }
40 if(&dest==&src) {
41 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
42 return dest;
43 }
44 dest.remove();
45 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
46 }
47
48 // Internal: No argument checking, and appends to dest.
49 // Pass as input spanCondition the one that is likely to yield a non-zero
50 // span length at the start of src.
51 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
52 // USET_SPAN_SIMPLE should be passed in for the start of src
53 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
54 // an in-filter prefix.
55 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,USetSpanCondition spanCondition,UErrorCode & errorCode) const56 FilteredNormalizer2::normalize(const UnicodeString &src,
57 UnicodeString &dest,
58 USetSpanCondition spanCondition,
59 UErrorCode &errorCode) const {
60 UnicodeString tempDest; // Don't throw away destination buffer between iterations.
61 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
62 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
63 int32_t spanLength=spanLimit-prevSpanLimit;
64 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
65 if(spanLength!=0) {
66 dest.append(src, prevSpanLimit, spanLength);
67 }
68 spanCondition=USET_SPAN_SIMPLE;
69 } else {
70 if(spanLength!=0) {
71 // Not norm2.normalizeSecondAndAppend() because we do not want
72 // to modify the non-filter part of dest.
73 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
74 tempDest, errorCode));
75 if(U_FAILURE(errorCode)) {
76 break;
77 }
78 }
79 spanCondition=USET_SPAN_NOT_CONTAINED;
80 }
81 prevSpanLimit=spanLimit;
82 }
83 return dest;
84 }
85
86 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const87 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
88 const UnicodeString &second,
89 UErrorCode &errorCode) const {
90 return normalizeSecondAndAppend(first, second, TRUE, errorCode);
91 }
92
93 UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const94 FilteredNormalizer2::append(UnicodeString &first,
95 const UnicodeString &second,
96 UErrorCode &errorCode) const {
97 return normalizeSecondAndAppend(first, second, FALSE, errorCode);
98 }
99
100 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UBool doNormalize,UErrorCode & errorCode) const101 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
102 const UnicodeString &second,
103 UBool doNormalize,
104 UErrorCode &errorCode) const {
105 uprv_checkCanGetBuffer(first, errorCode);
106 uprv_checkCanGetBuffer(second, errorCode);
107 if(U_FAILURE(errorCode)) {
108 return first;
109 }
110 if(&first==&second) {
111 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
112 return first;
113 }
114 if(first.isEmpty()) {
115 if(doNormalize) {
116 return normalize(second, first, errorCode);
117 } else {
118 return first=second;
119 }
120 }
121 // merge the in-filter suffix of the first string with the in-filter prefix of the second
122 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
123 if(prefixLimit!=0) {
124 UnicodeString prefix(second.tempSubString(0, prefixLimit));
125 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
126 if(suffixStart==0) {
127 if(doNormalize) {
128 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
129 } else {
130 norm2.append(first, prefix, errorCode);
131 }
132 } else {
133 UnicodeString middle(first, suffixStart, INT32_MAX);
134 if(doNormalize) {
135 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
136 } else {
137 norm2.append(middle, prefix, errorCode);
138 }
139 first.replace(suffixStart, INT32_MAX, middle);
140 }
141 }
142 if(prefixLimit<second.length()) {
143 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
144 if(doNormalize) {
145 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
146 } else {
147 first.append(rest);
148 }
149 }
150 return first;
151 }
152
153 UBool
getDecomposition(UChar32 c,UnicodeString & decomposition) const154 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
155 return set.contains(c) && norm2.getDecomposition(c, decomposition);
156 }
157
158 UBool
getRawDecomposition(UChar32 c,UnicodeString & decomposition) const159 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
160 return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
161 }
162
163 UChar32
composePair(UChar32 a,UChar32 b) const164 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
165 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
166 }
167
168 uint8_t
getCombiningClass(UChar32 c) const169 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
170 return set.contains(c) ? norm2.getCombiningClass(c) : 0;
171 }
172
173 UBool
isNormalized(const UnicodeString & s,UErrorCode & errorCode) const174 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
175 uprv_checkCanGetBuffer(s, errorCode);
176 if(U_FAILURE(errorCode)) {
177 return FALSE;
178 }
179 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
180 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
181 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
182 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
183 spanCondition=USET_SPAN_SIMPLE;
184 } else {
185 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
186 U_FAILURE(errorCode)
187 ) {
188 return FALSE;
189 }
190 spanCondition=USET_SPAN_NOT_CONTAINED;
191 }
192 prevSpanLimit=spanLimit;
193 }
194 return TRUE;
195 }
196
197 UNormalizationCheckResult
quickCheck(const UnicodeString & s,UErrorCode & errorCode) const198 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
199 uprv_checkCanGetBuffer(s, errorCode);
200 if(U_FAILURE(errorCode)) {
201 return UNORM_MAYBE;
202 }
203 UNormalizationCheckResult result=UNORM_YES;
204 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
205 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
206 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
207 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
208 spanCondition=USET_SPAN_SIMPLE;
209 } else {
210 UNormalizationCheckResult qcResult=
211 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
212 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
213 return qcResult;
214 } else if(qcResult==UNORM_MAYBE) {
215 result=qcResult;
216 }
217 spanCondition=USET_SPAN_NOT_CONTAINED;
218 }
219 prevSpanLimit=spanLimit;
220 }
221 return result;
222 }
223
224 int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode & errorCode) const225 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
226 uprv_checkCanGetBuffer(s, errorCode);
227 if(U_FAILURE(errorCode)) {
228 return 0;
229 }
230 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
231 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
232 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
233 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
234 spanCondition=USET_SPAN_SIMPLE;
235 } else {
236 int32_t yesLimit=
237 prevSpanLimit+
238 norm2.spanQuickCheckYes(
239 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
240 if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
241 return yesLimit;
242 }
243 spanCondition=USET_SPAN_NOT_CONTAINED;
244 }
245 prevSpanLimit=spanLimit;
246 }
247 return s.length();
248 }
249
250 UBool
hasBoundaryBefore(UChar32 c) const251 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
252 return !set.contains(c) || norm2.hasBoundaryBefore(c);
253 }
254
255 UBool
hasBoundaryAfter(UChar32 c) const256 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
257 return !set.contains(c) || norm2.hasBoundaryAfter(c);
258 }
259
260 UBool
isInert(UChar32 c) const261 FilteredNormalizer2::isInert(UChar32 c) const {
262 return !set.contains(c) || norm2.isInert(c);
263 }
264
265 U_NAMESPACE_END
266
267 // C API ------------------------------------------------------------------- ***
268
269 U_NAMESPACE_USE
270
271 U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 * norm2,const USet * filterSet,UErrorCode * pErrorCode)272 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
273 if(U_FAILURE(*pErrorCode)) {
274 return NULL;
275 }
276 if(filterSet==NULL) {
277 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
278 return NULL;
279 }
280 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
281 *UnicodeSet::fromUSet(filterSet));
282 if(fn2==NULL) {
283 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
284 }
285 return (UNormalizer2 *)fn2;
286 }
287
288 #endif // !UCONFIG_NO_NORMALIZATION
289