• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 * Copyright (c) 1996-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 * File unorm.cpp
7 *
8 * Created by: Vladimir Weinstein 12052000
9 *
10 * Modification history :
11 *
12 * Date        Name        Description
13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
14 * 02/12/01    synwee      Commented out quickcheck util api has been approved
15 *                         Added private method for doing FCD checks
16 * 02/23/01    synwee      Modified quickcheck and checkFCE to run through
17 *                         string for codepoints < 0x300 for the normalization
18 *                         mode NFC.
19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
20 *                         instead of just wrappers around normlzr.cpp,
21 *                         load unorm.dat, support Unicode 3.1 with
22 *                         supplementary code points, etc.
23 * 2009-nov..2010-jan  Markus Scherer  total rewrite, new Normalizer2 API & code
24 */
25 
26 #include "unicode/utypes.h"
27 
28 #if !UCONFIG_NO_NORMALIZATION
29 
30 #include "unicode/udata.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/unorm.h"
34 #include "unicode/unorm2.h"
35 #include "normalizer2impl.h"
36 #include "unormimp.h"
37 #include "uprops.h"
38 #include "ustr_imp.h"
39 
/* Element count of a statically sized array, converted to int32_t. */
#define LENGTHOF(array) static_cast<int32_t>(sizeof(array)/sizeof((array)[0]))
41 
42 U_NAMESPACE_USE
43 
44 /* quick check functions ---------------------------------------------------- */
45 
46 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)47 unorm_quickCheck(const UChar *src,
48                  int32_t srcLength,
49                  UNormalizationMode mode,
50                  UErrorCode *pErrorCode) {
51     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52     return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53 }
54 
55 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)56 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57                             UNormalizationMode mode, int32_t options,
58                             UErrorCode *pErrorCode) {
59     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60     if(options&UNORM_UNICODE_3_2) {
61         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62         return unorm2_quickCheck(
63             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64             src, srcLength, pErrorCode);
65     } else {
66         return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67     }
68 }
69 
70 U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)71 unorm_isNormalized(const UChar *src, int32_t srcLength,
72                    UNormalizationMode mode,
73                    UErrorCode *pErrorCode) {
74     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75     return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76 }
77 
78 U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)79 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80                               UNormalizationMode mode, int32_t options,
81                               UErrorCode *pErrorCode) {
82     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83     if(options&UNORM_UNICODE_3_2) {
84         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85         return unorm2_isNormalized(
86             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87             src, srcLength, pErrorCode);
88     } else {
89         return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90     }
91 }
92 
93 /* normalize() API ---------------------------------------------------------- */
94 
95 /** Public API for normalizing. */
96 U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)97 unorm_normalize(const UChar *src, int32_t srcLength,
98                 UNormalizationMode mode, int32_t options,
99                 UChar *dest, int32_t destCapacity,
100                 UErrorCode *pErrorCode) {
101     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102     if(options&UNORM_UNICODE_3_2) {
103         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104         return unorm2_normalize(
105             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106             src, srcLength, dest, destCapacity, pErrorCode);
107     } else {
108         return unorm2_normalize((const UNormalizer2 *)n2,
109             src, srcLength, dest, destCapacity, pErrorCode);
110     }
111 }
112 
113 
114 /* iteration functions ------------------------------------------------------ */
115 
116 static int32_t
unorm_iterate(UCharIterator * src,UBool forward,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)117 unorm_iterate(UCharIterator *src, UBool forward,
118               UChar *dest, int32_t destCapacity,
119               UNormalizationMode mode, int32_t options,
120               UBool doNormalize, UBool *pNeededToNormalize,
121               UErrorCode *pErrorCode) {
122     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
123     const UnicodeSet *uni32;
124     if(options&UNORM_UNICODE_3_2) {
125         uni32=uniset_getUnicode32Instance(*pErrorCode);
126     } else {
127         uni32=NULL;  // unused
128     }
129     FilteredNormalizer2 fn2(*n2, *uni32);
130     if(options&UNORM_UNICODE_3_2) {
131         n2=&fn2;
132     }
133     if(U_FAILURE(*pErrorCode)) {
134         return 0;
135     }
136     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
137         src==NULL
138     ) {
139         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
140         return 0;
141     }
142 
143     if(pNeededToNormalize!=NULL) {
144         *pNeededToNormalize=FALSE;
145     }
146     if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
147         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
148     }
149 
150     UnicodeString buffer;
151     UChar32 c;
152     if(forward) {
153         /* get one character and ignore its properties */
154         buffer.append(uiter_next32(src));
155         /* get all following characters until we see a boundary */
156         while((c=uiter_next32(src))>=0) {
157             if(n2->hasBoundaryBefore(c)) {
158                 /* back out the latest movement to stop at the boundary */
159                 src->move(src, -U16_LENGTH(c), UITER_CURRENT);
160                 break;
161             } else {
162                 buffer.append(c);
163             }
164         }
165     } else {
166         while((c=uiter_previous32(src))>=0) {
167             /* always write this character to the front of the buffer */
168             buffer.insert(0, c);
169             /* stop if this just-copied character is a boundary */
170             if(n2->hasBoundaryBefore(c)) {
171                 break;
172             }
173         }
174     }
175 
176     UnicodeString destString(dest, 0, destCapacity);
177     if(buffer.length()>0 && doNormalize) {
178         n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
179         if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
180             *pNeededToNormalize= destString!=buffer;
181         }
182         return destString.length();
183     } else {
184         /* just copy the source characters */
185         return buffer.extract(dest, destCapacity, *pErrorCode);
186     }
187 }
188 
189 U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)190 unorm_previous(UCharIterator *src,
191                UChar *dest, int32_t destCapacity,
192                UNormalizationMode mode, int32_t options,
193                UBool doNormalize, UBool *pNeededToNormalize,
194                UErrorCode *pErrorCode) {
195     return unorm_iterate(src, FALSE,
196                          dest, destCapacity,
197                          mode, options,
198                          doNormalize, pNeededToNormalize,
199                          pErrorCode);
200 }
201 
202 U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)203 unorm_next(UCharIterator *src,
204            UChar *dest, int32_t destCapacity,
205            UNormalizationMode mode, int32_t options,
206            UBool doNormalize, UBool *pNeededToNormalize,
207            UErrorCode *pErrorCode) {
208     return unorm_iterate(src, TRUE,
209                          dest, destCapacity,
210                          mode, options,
211                          doNormalize, pNeededToNormalize,
212                          pErrorCode);
213 }
214 
215 /* Concatenation of normalized strings -------------------------------------- */
216 
217 U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)218 unorm_concatenate(const UChar *left, int32_t leftLength,
219                   const UChar *right, int32_t rightLength,
220                   UChar *dest, int32_t destCapacity,
221                   UNormalizationMode mode, int32_t options,
222                   UErrorCode *pErrorCode) {
223     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
224     const UnicodeSet *uni32;
225     if(options&UNORM_UNICODE_3_2) {
226         uni32=uniset_getUnicode32Instance(*pErrorCode);
227     } else {
228         uni32=NULL;  // unused
229     }
230     FilteredNormalizer2 fn2(*n2, *uni32);
231     if(options&UNORM_UNICODE_3_2) {
232         n2=&fn2;
233     }
234     if(U_FAILURE(*pErrorCode)) {
235         return 0;
236     }
237     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
238         left==NULL || leftLength<-1 ||
239         right==NULL || rightLength<-1
240     ) {
241         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
242         return 0;
243     }
244 
245     /* check for overlapping right and destination */
246     if( dest!=NULL &&
247         ((right>=dest && right<(dest+destCapacity)) ||
248          (rightLength>0 && dest>=right && dest<(right+rightLength)))
249     ) {
250         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
251         return 0;
252     }
253 
254     /* allow left==dest */
255     UnicodeString destString;
256     if(left==dest) {
257         destString.setTo(dest, leftLength, destCapacity);
258     } else {
259         destString.setTo(dest, 0, destCapacity);
260         destString.append(left, leftLength);
261     }
262     return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
263            extract(dest, destCapacity, *pErrorCode);
264 }
265 
266 #endif /* #if !UCONFIG_NO_NORMALIZATION */
267