1 /*
2 ******************************************************************************
3 * Copyright (c) 1996-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 * File unorm.cpp
7 *
8 * Created by: Vladimir Weinstein 12052000
9 *
10 * Modification history :
11 *
12 * Date Name Description
13 * 02/01/01 synwee Added normalization quickcheck enum and method.
14 * 02/12/01 synwee Commented out quickcheck util api has been approved
15 * Added private method for doing FCD checks
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
17 * string for codepoints < 0x300 for the normalization
18 * mode NFC.
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20 * instead of just wrappers around normlzr.cpp,
21 * load unorm.dat, support Unicode 3.1 with
22 * supplementary code points, etc.
23 * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
24 */
25
26 #include "unicode/utypes.h"
27
28 #if !UCONFIG_NO_NORMALIZATION
29
30 #include "unicode/udata.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/unorm.h"
34 #include "unicode/unorm2.h"
35 #include "normalizer2impl.h"
36 #include "unormimp.h"
37 #include "uprops.h"
38 #include "ustr_imp.h"
39
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole replacement list is parenthesized so the macro expands to a
 * single primary expression no matter what operators surround the use site.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
41
42 U_NAMESPACE_USE
43
44 /* quick check functions ---------------------------------------------------- */
45
46 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)47 unorm_quickCheck(const UChar *src,
48 int32_t srcLength,
49 UNormalizationMode mode,
50 UErrorCode *pErrorCode) {
51 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53 }
54
55 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)56 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57 UNormalizationMode mode, int32_t options,
58 UErrorCode *pErrorCode) {
59 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60 if(options&UNORM_UNICODE_3_2) {
61 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62 return unorm2_quickCheck(
63 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64 src, srcLength, pErrorCode);
65 } else {
66 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67 }
68 }
69
70 U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)71 unorm_isNormalized(const UChar *src, int32_t srcLength,
72 UNormalizationMode mode,
73 UErrorCode *pErrorCode) {
74 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76 }
77
78 U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)79 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80 UNormalizationMode mode, int32_t options,
81 UErrorCode *pErrorCode) {
82 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83 if(options&UNORM_UNICODE_3_2) {
84 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85 return unorm2_isNormalized(
86 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87 src, srcLength, pErrorCode);
88 } else {
89 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90 }
91 }
92
93 /* normalize() API ---------------------------------------------------------- */
94
95 /** Public API for normalizing. */
96 U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)97 unorm_normalize(const UChar *src, int32_t srcLength,
98 UNormalizationMode mode, int32_t options,
99 UChar *dest, int32_t destCapacity,
100 UErrorCode *pErrorCode) {
101 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102 if(options&UNORM_UNICODE_3_2) {
103 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104 return unorm2_normalize(
105 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106 src, srcLength, dest, destCapacity, pErrorCode);
107 } else {
108 return unorm2_normalize((const UNormalizer2 *)n2,
109 src, srcLength, dest, destCapacity, pErrorCode);
110 }
111 }
112
113
114 /* iteration functions ------------------------------------------------------ */
115
116 static int32_t
unorm_iterate(UCharIterator * src,UBool forward,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)117 unorm_iterate(UCharIterator *src, UBool forward,
118 UChar *dest, int32_t destCapacity,
119 UNormalizationMode mode, int32_t options,
120 UBool doNormalize, UBool *pNeededToNormalize,
121 UErrorCode *pErrorCode) {
122 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
123 const UnicodeSet *uni32;
124 if(options&UNORM_UNICODE_3_2) {
125 uni32=uniset_getUnicode32Instance(*pErrorCode);
126 } else {
127 uni32=NULL; // unused
128 }
129
130 if(U_FAILURE(*pErrorCode)) {
131 return 0;
132 }
133
134 FilteredNormalizer2 fn2(*n2, *uni32);
135 if(options&UNORM_UNICODE_3_2) {
136 n2=&fn2;
137 }
138
139 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
140 src==NULL
141 ) {
142 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
143 return 0;
144 }
145
146 if(pNeededToNormalize!=NULL) {
147 *pNeededToNormalize=FALSE;
148 }
149 if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
150 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
151 }
152
153 UnicodeString buffer;
154 UChar32 c;
155 if(forward) {
156 /* get one character and ignore its properties */
157 buffer.append(uiter_next32(src));
158 /* get all following characters until we see a boundary */
159 while((c=uiter_next32(src))>=0) {
160 if(n2->hasBoundaryBefore(c)) {
161 /* back out the latest movement to stop at the boundary */
162 src->move(src, -U16_LENGTH(c), UITER_CURRENT);
163 break;
164 } else {
165 buffer.append(c);
166 }
167 }
168 } else {
169 while((c=uiter_previous32(src))>=0) {
170 /* always write this character to the front of the buffer */
171 buffer.insert(0, c);
172 /* stop if this just-copied character is a boundary */
173 if(n2->hasBoundaryBefore(c)) {
174 break;
175 }
176 }
177 }
178
179 UnicodeString destString(dest, 0, destCapacity);
180 if(buffer.length()>0 && doNormalize) {
181 n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
182 if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
183 *pNeededToNormalize= destString!=buffer;
184 }
185 return destString.length();
186 } else {
187 /* just copy the source characters */
188 return buffer.extract(dest, destCapacity, *pErrorCode);
189 }
190 }
191
192 U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)193 unorm_previous(UCharIterator *src,
194 UChar *dest, int32_t destCapacity,
195 UNormalizationMode mode, int32_t options,
196 UBool doNormalize, UBool *pNeededToNormalize,
197 UErrorCode *pErrorCode) {
198 return unorm_iterate(src, FALSE,
199 dest, destCapacity,
200 mode, options,
201 doNormalize, pNeededToNormalize,
202 pErrorCode);
203 }
204
205 U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)206 unorm_next(UCharIterator *src,
207 UChar *dest, int32_t destCapacity,
208 UNormalizationMode mode, int32_t options,
209 UBool doNormalize, UBool *pNeededToNormalize,
210 UErrorCode *pErrorCode) {
211 return unorm_iterate(src, TRUE,
212 dest, destCapacity,
213 mode, options,
214 doNormalize, pNeededToNormalize,
215 pErrorCode);
216 }
217
218 /* Concatenation of normalized strings -------------------------------------- */
219
220 U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)221 unorm_concatenate(const UChar *left, int32_t leftLength,
222 const UChar *right, int32_t rightLength,
223 UChar *dest, int32_t destCapacity,
224 UNormalizationMode mode, int32_t options,
225 UErrorCode *pErrorCode) {
226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
227 const UnicodeSet *uni32;
228 if(options&UNORM_UNICODE_3_2) {
229 uni32=uniset_getUnicode32Instance(*pErrorCode);
230 } else {
231 uni32=NULL; // unused
232 }
233
234 if(U_FAILURE(*pErrorCode)) {
235 return 0;
236 }
237
238 FilteredNormalizer2 fn2(*n2, *uni32);
239 if(options&UNORM_UNICODE_3_2) {
240 n2=&fn2;
241 }
242
243 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
244 left==NULL || leftLength<-1 ||
245 right==NULL || rightLength<-1
246 ) {
247 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
248 return 0;
249 }
250
251 /* check for overlapping right and destination */
252 if( dest!=NULL &&
253 ((right>=dest && right<(dest+destCapacity)) ||
254 (rightLength>0 && dest>=right && dest<(right+rightLength)))
255 ) {
256 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
257 return 0;
258 }
259
260 /* allow left==dest */
261 UnicodeString destString;
262 if(left==dest) {
263 destString.setTo(dest, leftLength, destCapacity);
264 } else {
265 destString.setTo(dest, 0, destCapacity);
266 destString.append(left, leftLength);
267 }
268 return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
269 extract(dest, destCapacity, *pErrorCode);
270 }
271
272 #endif /* #if !UCONFIG_NO_NORMALIZATION */
273