• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *************************************************************************
3  * COPYRIGHT:
4  * Copyright (c) 1996-2011, International Business Machines Corporation and
5  * others. All Rights Reserved.
6  *************************************************************************
7  */
8 
9 #include "unicode/utypes.h"
10 
11 #if !UCONFIG_NO_NORMALIZATION
12 
13 #include "unicode/uniset.h"
14 #include "unicode/unistr.h"
15 #include "unicode/chariter.h"
16 #include "unicode/schriter.h"
17 #include "unicode/uchriter.h"
18 #include "unicode/normlzr.h"
19 #include "cmemory.h"
20 #include "normalizer2impl.h"
21 #include "uprops.h"  // for uniset_getUnicode32Instance()
22 
23 U_NAMESPACE_BEGIN
24 
// ICU RTTI boilerplate for Normalizer (the macro itself is defined outside this file).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
26 
27 //-------------------------------------------------------------------------
28 // Constructors and other boilerplate
29 //-------------------------------------------------------------------------
30 
31 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
32     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
33     text(new StringCharacterIterator(str)),
34     currentIndex(0), nextIndex(0),
35     buffer(), bufferPos(0)
36 {
37     init();
38 }
39 
Normalizer(const UChar * str,int32_t length,UNormalizationMode mode)40 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
41     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
42     text(new UCharCharacterIterator(str, length)),
43     currentIndex(0), nextIndex(0),
44     buffer(), bufferPos(0)
45 {
46     init();
47 }
48 
Normalizer(const CharacterIterator & iter,UNormalizationMode mode)49 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
50     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
51     text(iter.clone()),
52     currentIndex(0), nextIndex(0),
53     buffer(), bufferPos(0)
54 {
55     init();
56 }
57 
/**
 * Copy constructor. Mode, options, indices and the output buffer are copied;
 * the text iterator is cloned so the two objects do not share it. The
 * Normalizer2 pointers are not copied but re-derived by init().
 */
Normalizer::Normalizer(const Normalizer &copy)
    : UObject(copy),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(copy.fUMode),
      fOptions(copy.fOptions),
      text(copy.text->clone()),
      currentIndex(copy.currentIndex),
      nextIndex(copy.nextIndex),
      buffer(copy.buffer),
      bufferPos(copy.bufferPos)
{
    // Each instance owns its own filtered normalizer, so rebuild it here.
    init();
}
66 
67 static const UChar _NUL=0;
68 
69 void
init()70 Normalizer::init() {
71     UErrorCode errorCode=U_ZERO_ERROR;
72     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
73     if(fOptions&UNORM_UNICODE_3_2) {
74         delete fFilteredNorm2;
75         fNorm2=fFilteredNorm2=
76             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
77     }
78     if(U_FAILURE(errorCode)) {
79         errorCode=U_ZERO_ERROR;
80         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
81     }
82 }
83 
~Normalizer()84 Normalizer::~Normalizer()
85 {
86     delete fFilteredNorm2;
87     delete text;
88 }
89 
90 Normalizer*
clone() const91 Normalizer::clone() const
92 {
93     return new Normalizer(*this);
94 }
95 
96 /**
97  * Generates a hash code for this iterator.
98  */
hashCode() const99 int32_t Normalizer::hashCode() const
100 {
101     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
102 }
103 
operator ==(const Normalizer & that) const104 UBool Normalizer::operator==(const Normalizer& that) const
105 {
106     return
107         this==&that ||
108         (fUMode==that.fUMode &&
109         fOptions==that.fOptions &&
110         *text==*that.text &&
111         buffer==that.buffer &&
112         bufferPos==that.bufferPos &&
113         nextIndex==that.nextIndex);
114 }
115 
116 //-------------------------------------------------------------------------
117 // Static utility methods
118 //-------------------------------------------------------------------------
119 
120 void U_EXPORT2
normalize(const UnicodeString & source,UNormalizationMode mode,int32_t options,UnicodeString & result,UErrorCode & status)121 Normalizer::normalize(const UnicodeString& source,
122                       UNormalizationMode mode, int32_t options,
123                       UnicodeString& result,
124                       UErrorCode &status) {
125     if(source.isBogus() || U_FAILURE(status)) {
126         result.setToBogus();
127         if(U_SUCCESS(status)) {
128             status=U_ILLEGAL_ARGUMENT_ERROR;
129         }
130     } else {
131         UnicodeString localDest;
132         UnicodeString *dest;
133 
134         if(&source!=&result) {
135             dest=&result;
136         } else {
137             // the source and result strings are the same object, use a temporary one
138             dest=&localDest;
139         }
140         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
141         if(U_SUCCESS(status)) {
142             if(options&UNORM_UNICODE_3_2) {
143                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
144                     normalize(source, *dest, status);
145             } else {
146                 n2->normalize(source, *dest, status);
147             }
148         }
149         if(dest==&localDest && U_SUCCESS(status)) {
150             result=*dest;
151         }
152     }
153 }
154 
155 void U_EXPORT2
compose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)156 Normalizer::compose(const UnicodeString& source,
157                     UBool compat, int32_t options,
158                     UnicodeString& result,
159                     UErrorCode &status) {
160     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
161 }
162 
163 void U_EXPORT2
decompose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)164 Normalizer::decompose(const UnicodeString& source,
165                       UBool compat, int32_t options,
166                       UnicodeString& result,
167                       UErrorCode &status) {
168     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
169 }
170 
171 UNormalizationCheckResult
quickCheck(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)172 Normalizer::quickCheck(const UnicodeString& source,
173                        UNormalizationMode mode, int32_t options,
174                        UErrorCode &status) {
175     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
176     if(U_SUCCESS(status)) {
177         if(options&UNORM_UNICODE_3_2) {
178             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
179                 quickCheck(source, status);
180         } else {
181             return n2->quickCheck(source, status);
182         }
183     } else {
184         return UNORM_MAYBE;
185     }
186 }
187 
188 UBool
isNormalized(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)189 Normalizer::isNormalized(const UnicodeString& source,
190                          UNormalizationMode mode, int32_t options,
191                          UErrorCode &status) {
192     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
193     if(U_SUCCESS(status)) {
194         if(options&UNORM_UNICODE_3_2) {
195             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
196                 isNormalized(source, status);
197         } else {
198             return n2->isNormalized(source, status);
199         }
200     } else {
201         return FALSE;
202     }
203 }
204 
205 UnicodeString & U_EXPORT2
concatenate(const UnicodeString & left,const UnicodeString & right,UnicodeString & result,UNormalizationMode mode,int32_t options,UErrorCode & errorCode)206 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
207                         UnicodeString &result,
208                         UNormalizationMode mode, int32_t options,
209                         UErrorCode &errorCode) {
210     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
211         result.setToBogus();
212         if(U_SUCCESS(errorCode)) {
213             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
214         }
215     } else {
216         UnicodeString localDest;
217         UnicodeString *dest;
218 
219         if(&right!=&result) {
220             dest=&result;
221         } else {
222             // the right and result strings are the same object, use a temporary one
223             dest=&localDest;
224         }
225         *dest=left;
226         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
227         if(U_SUCCESS(errorCode)) {
228             if(options&UNORM_UNICODE_3_2) {
229                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
230                     append(*dest, right, errorCode);
231             } else {
232                 n2->append(*dest, right, errorCode);
233             }
234         }
235         if(dest==&localDest && U_SUCCESS(errorCode)) {
236             result=*dest;
237         }
238     }
239     return result;
240 }
241 
242 //-------------------------------------------------------------------------
243 // Iteration API
244 //-------------------------------------------------------------------------
245 
246 /**
247  * Return the current character in the normalized text.
248  */
current()249 UChar32 Normalizer::current() {
250     if(bufferPos<buffer.length() || nextNormalize()) {
251         return buffer.char32At(bufferPos);
252     } else {
253         return DONE;
254     }
255 }
256 
257 /**
258  * Return the next character in the normalized text and advance
259  * the iteration position by one.  If the end
260  * of the text has already been reached, {@link #DONE} is returned.
261  */
next()262 UChar32 Normalizer::next() {
263     if(bufferPos<buffer.length() ||  nextNormalize()) {
264         UChar32 c=buffer.char32At(bufferPos);
265         bufferPos+=UTF_CHAR_LENGTH(c);
266         return c;
267     } else {
268         return DONE;
269     }
270 }
271 
272 /**
273  * Return the previous character in the normalized text and decrement
274  * the iteration position by one.  If the beginning
275  * of the text has already been reached, {@link #DONE} is returned.
276  */
previous()277 UChar32 Normalizer::previous() {
278     if(bufferPos>0 || previousNormalize()) {
279         UChar32 c=buffer.char32At(bufferPos-1);
280         bufferPos-=UTF_CHAR_LENGTH(c);
281         return c;
282     } else {
283         return DONE;
284     }
285 }
286 
reset()287 void Normalizer::reset() {
288     currentIndex=nextIndex=text->setToStart();
289     clearBuffer();
290 }
291 
292 void
setIndexOnly(int32_t index)293 Normalizer::setIndexOnly(int32_t index) {
294     text->setIndex(index);  // pins index
295     currentIndex=nextIndex=text->getIndex();
296     clearBuffer();
297 }
298 
299 /**
300  * Return the first character in the normalized text.  This resets
301  * the <tt>Normalizer's</tt> position to the beginning of the text.
302  */
first()303 UChar32 Normalizer::first() {
304     reset();
305     return next();
306 }
307 
308 /**
309  * Return the last character in the normalized text.  This resets
310  * the <tt>Normalizer's</tt> position to be just before the
311  * the input text corresponding to that normalized character.
312  */
last()313 UChar32 Normalizer::last() {
314     currentIndex=nextIndex=text->setToEnd();
315     clearBuffer();
316     return previous();
317 }
318 
319 /**
320  * Retrieve the current iteration position in the input text that is
321  * being normalized.  This method is useful in applications such as
322  * searching, where you need to be able to determine the position in
323  * the input text that corresponds to a given normalized output character.
324  * <p>
325  * <b>Note:</b> This method sets the position in the <em>input</em>, while
326  * {@link #next} and {@link #previous} iterate through characters in the
327  * <em>output</em>.  This means that there is not necessarily a one-to-one
328  * correspondence between characters returned by <tt>next</tt> and
329  * <tt>previous</tt> and the indices passed to and returned from
330  * <tt>setIndex</tt> and {@link #getIndex}.
331  *
332  */
getIndex() const333 int32_t Normalizer::getIndex() const {
334     if(bufferPos<buffer.length()) {
335         return currentIndex;
336     } else {
337         return nextIndex;
338     }
339 }
340 
341 /**
342  * Retrieve the index of the start of the input text.  This is the begin index
343  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
344  * over which this <tt>Normalizer</tt> is iterating
345  */
startIndex() const346 int32_t Normalizer::startIndex() const {
347     return text->startIndex();
348 }
349 
350 /**
351  * Retrieve the index of the end of the input text.  This is the end index
352  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
353  * over which this <tt>Normalizer</tt> is iterating
354  */
endIndex() const355 int32_t Normalizer::endIndex() const {
356     return text->endIndex();
357 }
358 
359 //-------------------------------------------------------------------------
360 // Property access methods
361 //-------------------------------------------------------------------------
362 
363 void
setMode(UNormalizationMode newMode)364 Normalizer::setMode(UNormalizationMode newMode)
365 {
366     fUMode = newMode;
367     init();
368 }
369 
370 UNormalizationMode
getUMode() const371 Normalizer::getUMode() const
372 {
373     return fUMode;
374 }
375 
376 void
setOption(int32_t option,UBool value)377 Normalizer::setOption(int32_t option,
378                       UBool value)
379 {
380     if (value) {
381         fOptions |= option;
382     } else {
383         fOptions &= (~option);
384     }
385     init();
386 }
387 
388 UBool
getOption(int32_t option) const389 Normalizer::getOption(int32_t option) const
390 {
391     return (fOptions & option) != 0;
392 }
393 
394 /**
395  * Set the input text over which this <tt>Normalizer</tt> will iterate.
396  * The iteration position is set to the beginning of the input text.
397  */
398 void
setText(const UnicodeString & newText,UErrorCode & status)399 Normalizer::setText(const UnicodeString& newText,
400                     UErrorCode &status)
401 {
402     if (U_FAILURE(status)) {
403         return;
404     }
405     CharacterIterator *newIter = new StringCharacterIterator(newText);
406     if (newIter == NULL) {
407         status = U_MEMORY_ALLOCATION_ERROR;
408         return;
409     }
410     delete text;
411     text = newIter;
412     reset();
413 }
414 
415 /**
416  * Set the input text over which this <tt>Normalizer</tt> will iterate.
417  * The iteration position is set to the beginning of the string.
418  */
419 void
setText(const CharacterIterator & newText,UErrorCode & status)420 Normalizer::setText(const CharacterIterator& newText,
421                     UErrorCode &status)
422 {
423     if (U_FAILURE(status)) {
424         return;
425     }
426     CharacterIterator *newIter = newText.clone();
427     if (newIter == NULL) {
428         status = U_MEMORY_ALLOCATION_ERROR;
429         return;
430     }
431     delete text;
432     text = newIter;
433     reset();
434 }
435 
436 void
setText(const UChar * newText,int32_t length,UErrorCode & status)437 Normalizer::setText(const UChar* newText,
438                     int32_t length,
439                     UErrorCode &status)
440 {
441     if (U_FAILURE(status)) {
442         return;
443     }
444     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
445     if (newIter == NULL) {
446         status = U_MEMORY_ALLOCATION_ERROR;
447         return;
448     }
449     delete text;
450     text = newIter;
451     reset();
452 }
453 
454 /**
455  * Copies the text under iteration into the UnicodeString referred to by "result".
456  * @param result Receives a copy of the text under iteration.
457  */
458 void
getText(UnicodeString & result)459 Normalizer::getText(UnicodeString&  result)
460 {
461     text->getText(result);
462 }
463 
464 //-------------------------------------------------------------------------
465 // Private utility methods
466 //-------------------------------------------------------------------------
467 
clearBuffer()468 void Normalizer::clearBuffer() {
469     buffer.remove();
470     bufferPos=0;
471 }
472 
473 UBool
nextNormalize()474 Normalizer::nextNormalize() {
475     clearBuffer();
476     currentIndex=nextIndex;
477     text->setIndex(nextIndex);
478     if(!text->hasNext()) {
479         return FALSE;
480     }
481     // Skip at least one character so we make progress.
482     UnicodeString segment(text->next32PostInc());
483     while(text->hasNext()) {
484         UChar32 c;
485         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
486             text->move32(-1, CharacterIterator::kCurrent);
487             break;
488         }
489         segment.append(c);
490     }
491     nextIndex=text->getIndex();
492     UErrorCode errorCode=U_ZERO_ERROR;
493     fNorm2->normalize(segment, buffer, errorCode);
494     return U_SUCCESS(errorCode) && !buffer.isEmpty();
495 }
496 
497 UBool
previousNormalize()498 Normalizer::previousNormalize() {
499     clearBuffer();
500     nextIndex=currentIndex;
501     text->setIndex(currentIndex);
502     if(!text->hasPrevious()) {
503         return FALSE;
504     }
505     UnicodeString segment;
506     while(text->hasPrevious()) {
507         UChar32 c=text->previous32();
508         segment.insert(0, c);
509         if(fNorm2->hasBoundaryBefore(c)) {
510             break;
511         }
512     }
513     currentIndex=text->getIndex();
514     UErrorCode errorCode=U_ZERO_ERROR;
515     fNorm2->normalize(segment, buffer, errorCode);
516     bufferPos=buffer.length();
517     return U_SUCCESS(errorCode) && !buffer.isEmpty();
518 }
519 
520 U_NAMESPACE_END
521 
522 #endif /* #if !UCONFIG_NO_NORMALIZATION */
523