• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  filterednormalizer2.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009dec10
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_NORMALIZATION
20 
21 #include "unicode/normalizer2.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/unorm.h"
25 #include "cpputils.h"
26 
27 U_NAMESPACE_BEGIN
28 
29 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const30 FilteredNormalizer2::normalize(const UnicodeString &src,
31                                UnicodeString &dest,
32                                UErrorCode &errorCode) const {
33     uprv_checkCanGetBuffer(src, errorCode);
34     if(U_FAILURE(errorCode)) {
35         dest.setToBogus();
36         return dest;
37     }
38     if(&dest==&src) {
39         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
40         return dest;
41     }
42     dest.remove();
43     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
44 }
45 
46 // Internal: No argument checking, and appends to dest.
47 // Pass as input spanCondition the one that is likely to yield a non-zero
48 // span length at the start of src.
49 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
50 // USET_SPAN_SIMPLE should be passed in for the start of src
51 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
52 // an in-filter prefix.
53 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,USetSpanCondition spanCondition,UErrorCode & errorCode) const54 FilteredNormalizer2::normalize(const UnicodeString &src,
55                                UnicodeString &dest,
56                                USetSpanCondition spanCondition,
57                                UErrorCode &errorCode) const {
58     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
59     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
60         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
61         int32_t spanLength=spanLimit-prevSpanLimit;
62         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
63             if(spanLength!=0) {
64                 dest.append(src, prevSpanLimit, spanLength);
65             }
66             spanCondition=USET_SPAN_SIMPLE;
67         } else {
68             if(spanLength!=0) {
69                 // Not norm2.normalizeSecondAndAppend() because we do not want
70                 // to modify the non-filter part of dest.
71                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
72                                             tempDest, errorCode));
73                 if(U_FAILURE(errorCode)) {
74                     break;
75                 }
76             }
77             spanCondition=USET_SPAN_NOT_CONTAINED;
78         }
79         prevSpanLimit=spanLimit;
80     }
81     return dest;
82 }
83 
84 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const85 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
86                                               const UnicodeString &second,
87                                               UErrorCode &errorCode) const {
88     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
89 }
90 
91 UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const92 FilteredNormalizer2::append(UnicodeString &first,
93                             const UnicodeString &second,
94                             UErrorCode &errorCode) const {
95     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
96 }
97 
98 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UBool doNormalize,UErrorCode & errorCode) const99 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
100                                               const UnicodeString &second,
101                                               UBool doNormalize,
102                                               UErrorCode &errorCode) const {
103     uprv_checkCanGetBuffer(first, errorCode);
104     uprv_checkCanGetBuffer(second, errorCode);
105     if(U_FAILURE(errorCode)) {
106         return first;
107     }
108     if(&first==&second) {
109         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
110         return first;
111     }
112     if(first.isEmpty()) {
113         if(doNormalize) {
114             return normalize(second, first, errorCode);
115         } else {
116             return first=second;
117         }
118     }
119     // merge the in-filter suffix of the first string with the in-filter prefix of the second
120     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
121     if(prefixLimit!=0) {
122         UnicodeString prefix(second.tempSubString(0, prefixLimit));
123         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
124         if(suffixStart==0) {
125             if(doNormalize) {
126                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
127             } else {
128                 norm2.append(first, prefix, errorCode);
129             }
130         } else {
131             UnicodeString middle(first, suffixStart, INT32_MAX);
132             if(doNormalize) {
133                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
134             } else {
135                 norm2.append(middle, prefix, errorCode);
136             }
137             first.replace(suffixStart, INT32_MAX, middle);
138         }
139     }
140     if(prefixLimit<second.length()) {
141         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
142         if(doNormalize) {
143             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
144         } else {
145             first.append(rest);
146         }
147     }
148     return first;
149 }
150 
151 UBool
getDecomposition(UChar32 c,UnicodeString & decomposition) const152 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
153     return set.contains(c) && norm2.getDecomposition(c, decomposition);
154 }
155 
156 UBool
isNormalized(const UnicodeString & s,UErrorCode & errorCode) const157 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
158     uprv_checkCanGetBuffer(s, errorCode);
159     if(U_FAILURE(errorCode)) {
160         return FALSE;
161     }
162     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
163     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
164         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
165         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
166             spanCondition=USET_SPAN_SIMPLE;
167         } else {
168             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
169                 U_FAILURE(errorCode)
170             ) {
171                 return FALSE;
172             }
173             spanCondition=USET_SPAN_NOT_CONTAINED;
174         }
175         prevSpanLimit=spanLimit;
176     }
177     return TRUE;
178 }
179 
180 UNormalizationCheckResult
quickCheck(const UnicodeString & s,UErrorCode & errorCode) const181 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
182     uprv_checkCanGetBuffer(s, errorCode);
183     if(U_FAILURE(errorCode)) {
184         return UNORM_MAYBE;
185     }
186     UNormalizationCheckResult result=UNORM_YES;
187     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
188     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
189         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
190         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
191             spanCondition=USET_SPAN_SIMPLE;
192         } else {
193             UNormalizationCheckResult qcResult=
194                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
195             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
196                 return qcResult;
197             } else if(qcResult==UNORM_MAYBE) {
198                 result=qcResult;
199             }
200             spanCondition=USET_SPAN_NOT_CONTAINED;
201         }
202         prevSpanLimit=spanLimit;
203     }
204     return result;
205 }
206 
207 int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode & errorCode) const208 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
209     uprv_checkCanGetBuffer(s, errorCode);
210     if(U_FAILURE(errorCode)) {
211         return 0;
212     }
213     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
214     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
215         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
216         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
217             spanCondition=USET_SPAN_SIMPLE;
218         } else {
219             int32_t yesLimit=
220                 prevSpanLimit+
221                 norm2.spanQuickCheckYes(
222                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
223             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
224                 return yesLimit;
225             }
226             spanCondition=USET_SPAN_NOT_CONTAINED;
227         }
228         prevSpanLimit=spanLimit;
229     }
230     return s.length();
231 }
232 
233 UBool
hasBoundaryBefore(UChar32 c) const234 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
235     return !set.contains(c) || norm2.hasBoundaryBefore(c);
236 }
237 
238 UBool
hasBoundaryAfter(UChar32 c) const239 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
240     return !set.contains(c) || norm2.hasBoundaryAfter(c);
241 }
242 
243 UBool
isInert(UChar32 c) const244 FilteredNormalizer2::isInert(UChar32 c) const {
245     return !set.contains(c) || norm2.isInert(c);
246 }
247 
248 U_NAMESPACE_END
249 
250 // C API ------------------------------------------------------------------- ***
251 
252 U_NAMESPACE_USE
253 
254 U_DRAFT UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 * norm2,const USet * filterSet,UErrorCode * pErrorCode)255 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
256     if(U_FAILURE(*pErrorCode)) {
257         return NULL;
258     }
259     if(filterSet==NULL) {
260         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
261         return NULL;
262     }
263     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
264                                              *UnicodeSet::fromUSet(filterSet));
265     if(fn2==NULL) {
266         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
267     }
268     return (UNormalizer2 *)fn2;
269 }
270 
271 #endif  // !UCONFIG_NO_NORMALIZATION
272