1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (c) 1996-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
8 * File unorm.cpp
9 *
10 * Created by: Vladimir Weinstein 12052000
11 *
12 * Modification history :
13 *
14 * Date Name Description
15 * 02/01/01 synwee Added normalization quickcheck enum and method.
16 * 02/12/01 synwee Commented out quickcheck util api has been approved
17 * Added private method for doing FCD checks
18 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
19 * string for codepoints < 0x300 for the normalization
20 * mode NFC.
21 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
22 * instead of just wrappers around normlzr.cpp,
23 * load unorm.dat, support Unicode 3.1 with
24 * supplementary code points, etc.
25 * 2009-nov..2010-jan Markus Scherer total rewrite, new Normalizer2 API & code
26 */
27
28 #include "unicode/utypes.h"
29
30 #if !UCONFIG_NO_NORMALIZATION
31
32 #include "unicode/udata.h"
33 #include "unicode/ustring.h"
34 #include "unicode/uiter.h"
35 #include "unicode/unorm.h"
36 #include "unicode/unorm2.h"
37 #include "normalizer2impl.h"
38 #include "unormimp.h"
39 #include "uprops.h"
40 #include "ustr_imp.h"
41
42 U_NAMESPACE_USE
43
44 /* quick check functions ---------------------------------------------------- */
45
46 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)47 unorm_quickCheck(const UChar *src,
48 int32_t srcLength,
49 UNormalizationMode mode,
50 UErrorCode *pErrorCode) {
51 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53 }
54
55 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)56 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57 UNormalizationMode mode, int32_t options,
58 UErrorCode *pErrorCode) {
59 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60 if(options&UNORM_UNICODE_3_2) {
61 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62 return unorm2_quickCheck(
63 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64 src, srcLength, pErrorCode);
65 } else {
66 return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67 }
68 }
69
70 U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)71 unorm_isNormalized(const UChar *src, int32_t srcLength,
72 UNormalizationMode mode,
73 UErrorCode *pErrorCode) {
74 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76 }
77
78 U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)79 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80 UNormalizationMode mode, int32_t options,
81 UErrorCode *pErrorCode) {
82 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83 if(options&UNORM_UNICODE_3_2) {
84 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85 return unorm2_isNormalized(
86 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87 src, srcLength, pErrorCode);
88 } else {
89 return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90 }
91 }
92
93 /* normalize() API ---------------------------------------------------------- */
94
95 /** Public API for normalizing. */
96 U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)97 unorm_normalize(const UChar *src, int32_t srcLength,
98 UNormalizationMode mode, int32_t options,
99 UChar *dest, int32_t destCapacity,
100 UErrorCode *pErrorCode) {
101 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102 if(options&UNORM_UNICODE_3_2) {
103 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104 return unorm2_normalize(
105 reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106 src, srcLength, dest, destCapacity, pErrorCode);
107 } else {
108 return unorm2_normalize((const UNormalizer2 *)n2,
109 src, srcLength, dest, destCapacity, pErrorCode);
110 }
111 }
112
113
114 /* iteration functions ------------------------------------------------------ */
115
116 static int32_t
_iterate(UCharIterator * src,UBool forward,UChar * dest,int32_t destCapacity,const Normalizer2 * n2,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)117 _iterate(UCharIterator *src, UBool forward,
118 UChar *dest, int32_t destCapacity,
119 const Normalizer2 *n2,
120 UBool doNormalize, UBool *pNeededToNormalize,
121 UErrorCode *pErrorCode) {
122 if(U_FAILURE(*pErrorCode)) {
123 return 0;
124 }
125 if(destCapacity<0 || (dest==NULL && destCapacity>0) || src==NULL) {
126 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
127 return 0;
128 }
129
130 if(pNeededToNormalize!=NULL) {
131 *pNeededToNormalize=FALSE;
132 }
133 if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
134 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
135 }
136
137 UnicodeString buffer;
138 UChar32 c;
139 if(forward) {
140 /* get one character and ignore its properties */
141 buffer.append(uiter_next32(src));
142 /* get all following characters until we see a boundary */
143 while((c=uiter_next32(src))>=0) {
144 if(n2->hasBoundaryBefore(c)) {
145 /* back out the latest movement to stop at the boundary */
146 src->move(src, -U16_LENGTH(c), UITER_CURRENT);
147 break;
148 } else {
149 buffer.append(c);
150 }
151 }
152 } else {
153 while((c=uiter_previous32(src))>=0) {
154 /* always write this character to the front of the buffer */
155 buffer.insert(0, c);
156 /* stop if this just-copied character is a boundary */
157 if(n2->hasBoundaryBefore(c)) {
158 break;
159 }
160 }
161 }
162
163 UnicodeString destString(dest, 0, destCapacity);
164 if(buffer.length()>0 && doNormalize) {
165 n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
166 if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
167 *pNeededToNormalize= destString!=buffer;
168 }
169 return destString.length();
170 } else {
171 /* just copy the source characters */
172 return buffer.extract(dest, destCapacity, *pErrorCode);
173 }
174 }
175
176 static int32_t
unorm_iterate(UCharIterator * src,UBool forward,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)177 unorm_iterate(UCharIterator *src, UBool forward,
178 UChar *dest, int32_t destCapacity,
179 UNormalizationMode mode, int32_t options,
180 UBool doNormalize, UBool *pNeededToNormalize,
181 UErrorCode *pErrorCode) {
182 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
183 if(options&UNORM_UNICODE_3_2) {
184 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
185 if(U_FAILURE(*pErrorCode)) {
186 return 0;
187 }
188 FilteredNormalizer2 fn2(*n2, *uni32);
189 return _iterate(src, forward, dest, destCapacity,
190 &fn2, doNormalize, pNeededToNormalize, pErrorCode);
191 }
192 return _iterate(src, forward, dest, destCapacity,
193 n2, doNormalize, pNeededToNormalize, pErrorCode);
194 }
195
196 U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)197 unorm_previous(UCharIterator *src,
198 UChar *dest, int32_t destCapacity,
199 UNormalizationMode mode, int32_t options,
200 UBool doNormalize, UBool *pNeededToNormalize,
201 UErrorCode *pErrorCode) {
202 return unorm_iterate(src, FALSE,
203 dest, destCapacity,
204 mode, options,
205 doNormalize, pNeededToNormalize,
206 pErrorCode);
207 }
208
209 U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)210 unorm_next(UCharIterator *src,
211 UChar *dest, int32_t destCapacity,
212 UNormalizationMode mode, int32_t options,
213 UBool doNormalize, UBool *pNeededToNormalize,
214 UErrorCode *pErrorCode) {
215 return unorm_iterate(src, TRUE,
216 dest, destCapacity,
217 mode, options,
218 doNormalize, pNeededToNormalize,
219 pErrorCode);
220 }
221
222 /* Concatenation of normalized strings -------------------------------------- */
223
224 static int32_t
_concatenate(const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength,UChar * dest,int32_t destCapacity,const Normalizer2 * n2,UErrorCode * pErrorCode)225 _concatenate(const UChar *left, int32_t leftLength,
226 const UChar *right, int32_t rightLength,
227 UChar *dest, int32_t destCapacity,
228 const Normalizer2 *n2,
229 UErrorCode *pErrorCode) {
230 if(U_FAILURE(*pErrorCode)) {
231 return 0;
232 }
233 if(destCapacity<0 || (dest==NULL && destCapacity>0) ||
234 left==NULL || leftLength<-1 || right==NULL || rightLength<-1) {
235 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
236 return 0;
237 }
238
239 /* check for overlapping right and destination */
240 if( dest!=NULL &&
241 ((right>=dest && right<(dest+destCapacity)) ||
242 (rightLength>0 && dest>=right && dest<(right+rightLength)))
243 ) {
244 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
245 return 0;
246 }
247
248 /* allow left==dest */
249 UnicodeString destString;
250 if(left==dest) {
251 destString.setTo(dest, leftLength, destCapacity);
252 } else {
253 destString.setTo(dest, 0, destCapacity);
254 destString.append(left, leftLength);
255 }
256 return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
257 extract(dest, destCapacity, *pErrorCode);
258 }
259
260 U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)261 unorm_concatenate(const UChar *left, int32_t leftLength,
262 const UChar *right, int32_t rightLength,
263 UChar *dest, int32_t destCapacity,
264 UNormalizationMode mode, int32_t options,
265 UErrorCode *pErrorCode) {
266 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
267 if(options&UNORM_UNICODE_3_2) {
268 const UnicodeSet *uni32 = uniset_getUnicode32Instance(*pErrorCode);
269 if(U_FAILURE(*pErrorCode)) {
270 return 0;
271 }
272 FilteredNormalizer2 fn2(*n2, *uni32);
273 return _concatenate(left, leftLength, right, rightLength,
274 dest, destCapacity, &fn2, pErrorCode);
275 }
276 return _concatenate(left, leftLength, right, rightLength,
277 dest, destCapacity, n2, pErrorCode);
278 }
279
280 #endif /* #if !UCONFIG_NO_NORMALIZATION */
281