• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 * Copyright (c) 1996-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 * File unorm.cpp
7 *
8 * Created by: Vladimir Weinstein 12052000
9 *
10 * Modification history :
11 *
12 * Date        Name        Description
13 * 02/01/01    synwee      Added normalization quickcheck enum and method.
14 * 02/12/01    synwee      Commented out quickcheck util api has been approved
15 *                         Added private method for doing FCD checks
16 * 02/23/01    synwee      Modified quickcheck and checkFCD to run through
17 *                         string for codepoints < 0x300 for the normalization
18 *                         mode NFC.
19 * 05/25/01+   Markus Scherer total rewrite, implement all normalization here
20 *                         instead of just wrappers around normlzr.cpp,
21 *                         load unorm.dat, support Unicode 3.1 with
22 *                         supplementary code points, etc.
23 * 2009-nov..2010-jan  Markus Scherer  total rewrite, new Normalizer2 API & code
24 */
25 
26 #include "unicode/utypes.h"
27 
28 #if !UCONFIG_NO_NORMALIZATION
29 
30 #include "unicode/udata.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/unorm.h"
34 #include "unicode/unorm2.h"
35 #include "normalizer2impl.h"
36 #include "unormimp.h"
37 #include "uprops.h"
38 #include "ustr_imp.h"
39 
40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
41 
42 U_NAMESPACE_USE
43 
44 /* quick check functions ---------------------------------------------------- */
45 
46 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)47 unorm_quickCheck(const UChar *src,
48                  int32_t srcLength,
49                  UNormalizationMode mode,
50                  UErrorCode *pErrorCode) {
51     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
52     return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
53 }
54 
55 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)56 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
57                             UNormalizationMode mode, int32_t options,
58                             UErrorCode *pErrorCode) {
59     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
60     if(options&UNORM_UNICODE_3_2) {
61         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
62         return unorm2_quickCheck(
63             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
64             src, srcLength, pErrorCode);
65     } else {
66         return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
67     }
68 }
69 
70 U_CAPI UBool U_EXPORT2
unorm_isNormalized(const UChar * src,int32_t srcLength,UNormalizationMode mode,UErrorCode * pErrorCode)71 unorm_isNormalized(const UChar *src, int32_t srcLength,
72                    UNormalizationMode mode,
73                    UErrorCode *pErrorCode) {
74     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
75     return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
76 }
77 
78 U_CAPI UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)79 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
80                               UNormalizationMode mode, int32_t options,
81                               UErrorCode *pErrorCode) {
82     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
83     if(options&UNORM_UNICODE_3_2) {
84         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
85         return unorm2_isNormalized(
86             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
87             src, srcLength, pErrorCode);
88     } else {
89         return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
90     }
91 }
92 
93 /* normalize() API ---------------------------------------------------------- */
94 
95 /** Public API for normalizing. */
96 U_CAPI int32_t U_EXPORT2
unorm_normalize(const UChar * src,int32_t srcLength,UNormalizationMode mode,int32_t options,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)97 unorm_normalize(const UChar *src, int32_t srcLength,
98                 UNormalizationMode mode, int32_t options,
99                 UChar *dest, int32_t destCapacity,
100                 UErrorCode *pErrorCode) {
101     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
102     if(options&UNORM_UNICODE_3_2) {
103         FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
104         return unorm2_normalize(
105             reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
106             src, srcLength, dest, destCapacity, pErrorCode);
107     } else {
108         return unorm2_normalize((const UNormalizer2 *)n2,
109             src, srcLength, dest, destCapacity, pErrorCode);
110     }
111 }
112 
113 
114 /* iteration functions ------------------------------------------------------ */
115 
116 static int32_t
unorm_iterate(UCharIterator * src,UBool forward,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)117 unorm_iterate(UCharIterator *src, UBool forward,
118               UChar *dest, int32_t destCapacity,
119               UNormalizationMode mode, int32_t options,
120               UBool doNormalize, UBool *pNeededToNormalize,
121               UErrorCode *pErrorCode) {
122     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
123     const UnicodeSet *uni32;
124     if(options&UNORM_UNICODE_3_2) {
125         uni32=uniset_getUnicode32Instance(*pErrorCode);
126     } else {
127         uni32=NULL;  // unused
128     }
129 
130     if(U_FAILURE(*pErrorCode)) {
131         return 0;
132     }
133 
134     FilteredNormalizer2 fn2(*n2, *uni32);
135     if(options&UNORM_UNICODE_3_2) {
136         n2=&fn2;
137     }
138 
139     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
140         src==NULL
141     ) {
142         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
143         return 0;
144     }
145 
146     if(pNeededToNormalize!=NULL) {
147         *pNeededToNormalize=FALSE;
148     }
149     if(!(forward ? src->hasNext(src) : src->hasPrevious(src))) {
150         return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
151     }
152 
153     UnicodeString buffer;
154     UChar32 c;
155     if(forward) {
156         /* get one character and ignore its properties */
157         buffer.append(uiter_next32(src));
158         /* get all following characters until we see a boundary */
159         while((c=uiter_next32(src))>=0) {
160             if(n2->hasBoundaryBefore(c)) {
161                 /* back out the latest movement to stop at the boundary */
162                 src->move(src, -U16_LENGTH(c), UITER_CURRENT);
163                 break;
164             } else {
165                 buffer.append(c);
166             }
167         }
168     } else {
169         while((c=uiter_previous32(src))>=0) {
170             /* always write this character to the front of the buffer */
171             buffer.insert(0, c);
172             /* stop if this just-copied character is a boundary */
173             if(n2->hasBoundaryBefore(c)) {
174                 break;
175             }
176         }
177     }
178 
179     UnicodeString destString(dest, 0, destCapacity);
180     if(buffer.length()>0 && doNormalize) {
181         n2->normalize(buffer, destString, *pErrorCode).extract(dest, destCapacity, *pErrorCode);
182         if(pNeededToNormalize!=NULL && U_SUCCESS(*pErrorCode)) {
183             *pNeededToNormalize= destString!=buffer;
184         }
185         return destString.length();
186     } else {
187         /* just copy the source characters */
188         return buffer.extract(dest, destCapacity, *pErrorCode);
189     }
190 }
191 
192 U_CAPI int32_t U_EXPORT2
unorm_previous(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)193 unorm_previous(UCharIterator *src,
194                UChar *dest, int32_t destCapacity,
195                UNormalizationMode mode, int32_t options,
196                UBool doNormalize, UBool *pNeededToNormalize,
197                UErrorCode *pErrorCode) {
198     return unorm_iterate(src, FALSE,
199                          dest, destCapacity,
200                          mode, options,
201                          doNormalize, pNeededToNormalize,
202                          pErrorCode);
203 }
204 
205 U_CAPI int32_t U_EXPORT2
unorm_next(UCharIterator * src,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UBool doNormalize,UBool * pNeededToNormalize,UErrorCode * pErrorCode)206 unorm_next(UCharIterator *src,
207            UChar *dest, int32_t destCapacity,
208            UNormalizationMode mode, int32_t options,
209            UBool doNormalize, UBool *pNeededToNormalize,
210            UErrorCode *pErrorCode) {
211     return unorm_iterate(src, TRUE,
212                          dest, destCapacity,
213                          mode, options,
214                          doNormalize, pNeededToNormalize,
215                          pErrorCode);
216 }
217 
218 /* Concatenation of normalized strings -------------------------------------- */
219 
220 U_CAPI int32_t U_EXPORT2
unorm_concatenate(const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength,UChar * dest,int32_t destCapacity,UNormalizationMode mode,int32_t options,UErrorCode * pErrorCode)221 unorm_concatenate(const UChar *left, int32_t leftLength,
222                   const UChar *right, int32_t rightLength,
223                   UChar *dest, int32_t destCapacity,
224                   UNormalizationMode mode, int32_t options,
225                   UErrorCode *pErrorCode) {
226     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
227     const UnicodeSet *uni32;
228     if(options&UNORM_UNICODE_3_2) {
229         uni32=uniset_getUnicode32Instance(*pErrorCode);
230     } else {
231         uni32=NULL;  // unused
232     }
233 
234     if(U_FAILURE(*pErrorCode)) {
235         return 0;
236     }
237 
238     FilteredNormalizer2 fn2(*n2, *uni32);
239     if(options&UNORM_UNICODE_3_2) {
240         n2=&fn2;
241     }
242 
243     if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
244         left==NULL || leftLength<-1 ||
245         right==NULL || rightLength<-1
246     ) {
247         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
248         return 0;
249     }
250 
251     /* check for overlapping right and destination */
252     if( dest!=NULL &&
253         ((right>=dest && right<(dest+destCapacity)) ||
254          (rightLength>0 && dest>=right && dest<(right+rightLength)))
255     ) {
256         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
257         return 0;
258     }
259 
260     /* allow left==dest */
261     UnicodeString destString;
262     if(left==dest) {
263         destString.setTo(dest, leftLength, destCapacity);
264     } else {
265         destString.setTo(dest, 0, destCapacity);
266         destString.append(left, leftLength);
267     }
268     return n2->append(destString, UnicodeString(rightLength<0, right, rightLength), *pErrorCode).
269            extract(dest, destCapacity, *pErrorCode);
270 }
271 
272 #endif /* #if !UCONFIG_NO_NORMALIZATION */
273