1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: filterednormalizer2.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009dec10
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/unorm.h"
25 #include "cpputils.h"
26
27 U_NAMESPACE_BEGIN
28
29 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const30 FilteredNormalizer2::normalize(const UnicodeString &src,
31 UnicodeString &dest,
32 UErrorCode &errorCode) const {
33 uprv_checkCanGetBuffer(src, errorCode);
34 if(U_FAILURE(errorCode)) {
35 dest.setToBogus();
36 return dest;
37 }
38 if(&dest==&src) {
39 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
40 return dest;
41 }
42 dest.remove();
43 return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
44 }
45
46 // Internal: No argument checking, and appends to dest.
47 // Pass as input spanCondition the one that is likely to yield a non-zero
48 // span length at the start of src.
49 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
50 // USET_SPAN_SIMPLE should be passed in for the start of src
51 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
52 // an in-filter prefix.
53 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,USetSpanCondition spanCondition,UErrorCode & errorCode) const54 FilteredNormalizer2::normalize(const UnicodeString &src,
55 UnicodeString &dest,
56 USetSpanCondition spanCondition,
57 UErrorCode &errorCode) const {
58 UnicodeString tempDest; // Don't throw away destination buffer between iterations.
59 for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
60 int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
61 int32_t spanLength=spanLimit-prevSpanLimit;
62 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
63 if(spanLength!=0) {
64 dest.append(src, prevSpanLimit, spanLength);
65 }
66 spanCondition=USET_SPAN_SIMPLE;
67 } else {
68 if(spanLength!=0) {
69 // Not norm2.normalizeSecondAndAppend() because we do not want
70 // to modify the non-filter part of dest.
71 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
72 tempDest, errorCode));
73 if(U_FAILURE(errorCode)) {
74 break;
75 }
76 }
77 spanCondition=USET_SPAN_NOT_CONTAINED;
78 }
79 prevSpanLimit=spanLimit;
80 }
81 return dest;
82 }
83
84 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const85 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
86 const UnicodeString &second,
87 UErrorCode &errorCode) const {
88 return normalizeSecondAndAppend(first, second, TRUE, errorCode);
89 }
90
91 UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const92 FilteredNormalizer2::append(UnicodeString &first,
93 const UnicodeString &second,
94 UErrorCode &errorCode) const {
95 return normalizeSecondAndAppend(first, second, FALSE, errorCode);
96 }
97
98 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UBool doNormalize,UErrorCode & errorCode) const99 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
100 const UnicodeString &second,
101 UBool doNormalize,
102 UErrorCode &errorCode) const {
103 uprv_checkCanGetBuffer(first, errorCode);
104 uprv_checkCanGetBuffer(second, errorCode);
105 if(U_FAILURE(errorCode)) {
106 return first;
107 }
108 if(&first==&second) {
109 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
110 return first;
111 }
112 if(first.isEmpty()) {
113 if(doNormalize) {
114 return normalize(second, first, errorCode);
115 } else {
116 return first=second;
117 }
118 }
119 // merge the in-filter suffix of the first string with the in-filter prefix of the second
120 int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
121 if(prefixLimit!=0) {
122 UnicodeString prefix(second.tempSubString(0, prefixLimit));
123 int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
124 if(suffixStart==0) {
125 if(doNormalize) {
126 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
127 } else {
128 norm2.append(first, prefix, errorCode);
129 }
130 } else {
131 UnicodeString middle(first, suffixStart, INT32_MAX);
132 if(doNormalize) {
133 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
134 } else {
135 norm2.append(middle, prefix, errorCode);
136 }
137 first.replace(suffixStart, INT32_MAX, middle);
138 }
139 }
140 if(prefixLimit<second.length()) {
141 UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
142 if(doNormalize) {
143 normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
144 } else {
145 first.append(rest);
146 }
147 }
148 return first;
149 }
150
151 UBool
getDecomposition(UChar32 c,UnicodeString & decomposition) const152 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
153 return set.contains(c) && norm2.getDecomposition(c, decomposition);
154 }
155
156 UBool
isNormalized(const UnicodeString & s,UErrorCode & errorCode) const157 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
158 uprv_checkCanGetBuffer(s, errorCode);
159 if(U_FAILURE(errorCode)) {
160 return FALSE;
161 }
162 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
163 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
164 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
165 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
166 spanCondition=USET_SPAN_SIMPLE;
167 } else {
168 if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
169 U_FAILURE(errorCode)
170 ) {
171 return FALSE;
172 }
173 spanCondition=USET_SPAN_NOT_CONTAINED;
174 }
175 prevSpanLimit=spanLimit;
176 }
177 return TRUE;
178 }
179
180 UNormalizationCheckResult
quickCheck(const UnicodeString & s,UErrorCode & errorCode) const181 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
182 uprv_checkCanGetBuffer(s, errorCode);
183 if(U_FAILURE(errorCode)) {
184 return UNORM_MAYBE;
185 }
186 UNormalizationCheckResult result=UNORM_YES;
187 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
188 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
189 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
190 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
191 spanCondition=USET_SPAN_SIMPLE;
192 } else {
193 UNormalizationCheckResult qcResult=
194 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
195 if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
196 return qcResult;
197 } else if(qcResult==UNORM_MAYBE) {
198 result=qcResult;
199 }
200 spanCondition=USET_SPAN_NOT_CONTAINED;
201 }
202 prevSpanLimit=spanLimit;
203 }
204 return result;
205 }
206
207 int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode & errorCode) const208 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
209 uprv_checkCanGetBuffer(s, errorCode);
210 if(U_FAILURE(errorCode)) {
211 return 0;
212 }
213 USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
214 for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
215 int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
216 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
217 spanCondition=USET_SPAN_SIMPLE;
218 } else {
219 int32_t yesLimit=
220 prevSpanLimit+
221 norm2.spanQuickCheckYes(
222 s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
223 if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
224 return yesLimit;
225 }
226 spanCondition=USET_SPAN_NOT_CONTAINED;
227 }
228 prevSpanLimit=spanLimit;
229 }
230 return s.length();
231 }
232
233 UBool
hasBoundaryBefore(UChar32 c) const234 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
235 return !set.contains(c) || norm2.hasBoundaryBefore(c);
236 }
237
238 UBool
hasBoundaryAfter(UChar32 c) const239 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
240 return !set.contains(c) || norm2.hasBoundaryAfter(c);
241 }
242
243 UBool
isInert(UChar32 c) const244 FilteredNormalizer2::isInert(UChar32 c) const {
245 return !set.contains(c) || norm2.isInert(c);
246 }
247
248 U_NAMESPACE_END
249
250 // C API ------------------------------------------------------------------- ***
251
252 U_NAMESPACE_USE
253
254 U_DRAFT UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 * norm2,const USet * filterSet,UErrorCode * pErrorCode)255 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
256 if(U_FAILURE(*pErrorCode)) {
257 return NULL;
258 }
259 if(filterSet==NULL) {
260 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
261 return NULL;
262 }
263 Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
264 *UnicodeSet::fromUSet(filterSet));
265 if(fn2==NULL) {
266 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
267 }
268 return (UNormalizer2 *)fn2;
269 }
270
271 #endif // !UCONFIG_NO_NORMALIZATION
272