1 /*
2 *************************************************************************
3 * COPYRIGHT:
4 * Copyright (c) 1996-2011, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 *************************************************************************
7 */
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_NORMALIZATION
12
13 #include "unicode/uniset.h"
14 #include "unicode/unistr.h"
15 #include "unicode/chariter.h"
16 #include "unicode/schriter.h"
17 #include "unicode/uchriter.h"
18 #include "unicode/normlzr.h"
19 #include "cmemory.h"
20 #include "normalizer2impl.h"
21 #include "uprops.h" // for uniset_getUnicode32Instance()
22
23 U_NAMESPACE_BEGIN
24
// ICU boilerplate macro invoked once for the Normalizer class.
// NOTE(review): presumably expands to the class-ID (ICU-style RTTI) member
// definitions for Normalizer; the macro is defined outside this file -- confirm.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
26
27 //-------------------------------------------------------------------------
28 // Constructors and other boilerplate
29 //-------------------------------------------------------------------------
30
31 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
32 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
33 text(new StringCharacterIterator(str)),
34 currentIndex(0), nextIndex(0),
35 buffer(), bufferPos(0)
36 {
37 init();
38 }
39
Normalizer(const UChar * str,int32_t length,UNormalizationMode mode)40 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
41 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
42 text(new UCharCharacterIterator(str, length)),
43 currentIndex(0), nextIndex(0),
44 buffer(), bufferPos(0)
45 {
46 init();
47 }
48
Normalizer(const CharacterIterator & iter,UNormalizationMode mode)49 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
50 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
51 text(iter.clone()),
52 currentIndex(0), nextIndex(0),
53 buffer(), bufferPos(0)
54 {
55 init();
56 }
57
/**
 * Copy constructor.
 *
 * Copies the mode, options, both source indexes, the normalized buffer and the
 * position inside it, and clones the source-text iterator so the two objects
 * do not share it. The Normalizer2 pointers are not copied: they start as NULL
 * and init() selects them again from the copied mode and options, so each
 * object owns its own filtered normalizer (if any).
 *
 * @param copy The Normalizer to duplicate.
 */
Normalizer::Normalizer(const Normalizer &copy) :
    UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
    text(copy.text->clone()),
    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
    buffer(copy.buffer), bufferPos(copy.bufferPos)
{
    init();
}
66
// File-local UChar constant with value 0.
// NOTE(review): not referenced anywhere in the visible part of this file.
static const UChar _NUL=0;
68
69 void
init()70 Normalizer::init() {
71 UErrorCode errorCode=U_ZERO_ERROR;
72 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
73 if(fOptions&UNORM_UNICODE_3_2) {
74 delete fFilteredNorm2;
75 fNorm2=fFilteredNorm2=
76 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
77 }
78 if(U_FAILURE(errorCode)) {
79 errorCode=U_ZERO_ERROR;
80 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
81 }
82 }
83
~Normalizer()84 Normalizer::~Normalizer()
85 {
86 delete fFilteredNorm2;
87 delete text;
88 }
89
90 Normalizer*
clone() const91 Normalizer::clone() const
92 {
93 return new Normalizer(*this);
94 }
95
96 /**
97 * Generates a hash code for this iterator.
98 */
hashCode() const99 int32_t Normalizer::hashCode() const
100 {
101 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
102 }
103
operator ==(const Normalizer & that) const104 UBool Normalizer::operator==(const Normalizer& that) const
105 {
106 return
107 this==&that ||
108 (fUMode==that.fUMode &&
109 fOptions==that.fOptions &&
110 *text==*that.text &&
111 buffer==that.buffer &&
112 bufferPos==that.bufferPos &&
113 nextIndex==that.nextIndex);
114 }
115
116 //-------------------------------------------------------------------------
117 // Static utility methods
118 //-------------------------------------------------------------------------
119
120 void U_EXPORT2
normalize(const UnicodeString & source,UNormalizationMode mode,int32_t options,UnicodeString & result,UErrorCode & status)121 Normalizer::normalize(const UnicodeString& source,
122 UNormalizationMode mode, int32_t options,
123 UnicodeString& result,
124 UErrorCode &status) {
125 if(source.isBogus() || U_FAILURE(status)) {
126 result.setToBogus();
127 if(U_SUCCESS(status)) {
128 status=U_ILLEGAL_ARGUMENT_ERROR;
129 }
130 } else {
131 UnicodeString localDest;
132 UnicodeString *dest;
133
134 if(&source!=&result) {
135 dest=&result;
136 } else {
137 // the source and result strings are the same object, use a temporary one
138 dest=&localDest;
139 }
140 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
141 if(U_SUCCESS(status)) {
142 if(options&UNORM_UNICODE_3_2) {
143 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
144 normalize(source, *dest, status);
145 } else {
146 n2->normalize(source, *dest, status);
147 }
148 }
149 if(dest==&localDest && U_SUCCESS(status)) {
150 result=*dest;
151 }
152 }
153 }
154
155 void U_EXPORT2
compose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)156 Normalizer::compose(const UnicodeString& source,
157 UBool compat, int32_t options,
158 UnicodeString& result,
159 UErrorCode &status) {
160 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
161 }
162
163 void U_EXPORT2
decompose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)164 Normalizer::decompose(const UnicodeString& source,
165 UBool compat, int32_t options,
166 UnicodeString& result,
167 UErrorCode &status) {
168 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
169 }
170
171 UNormalizationCheckResult
quickCheck(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)172 Normalizer::quickCheck(const UnicodeString& source,
173 UNormalizationMode mode, int32_t options,
174 UErrorCode &status) {
175 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
176 if(U_SUCCESS(status)) {
177 if(options&UNORM_UNICODE_3_2) {
178 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
179 quickCheck(source, status);
180 } else {
181 return n2->quickCheck(source, status);
182 }
183 } else {
184 return UNORM_MAYBE;
185 }
186 }
187
188 UBool
isNormalized(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)189 Normalizer::isNormalized(const UnicodeString& source,
190 UNormalizationMode mode, int32_t options,
191 UErrorCode &status) {
192 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
193 if(U_SUCCESS(status)) {
194 if(options&UNORM_UNICODE_3_2) {
195 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
196 isNormalized(source, status);
197 } else {
198 return n2->isNormalized(source, status);
199 }
200 } else {
201 return FALSE;
202 }
203 }
204
205 UnicodeString & U_EXPORT2
concatenate(const UnicodeString & left,const UnicodeString & right,UnicodeString & result,UNormalizationMode mode,int32_t options,UErrorCode & errorCode)206 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
207 UnicodeString &result,
208 UNormalizationMode mode, int32_t options,
209 UErrorCode &errorCode) {
210 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
211 result.setToBogus();
212 if(U_SUCCESS(errorCode)) {
213 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
214 }
215 } else {
216 UnicodeString localDest;
217 UnicodeString *dest;
218
219 if(&right!=&result) {
220 dest=&result;
221 } else {
222 // the right and result strings are the same object, use a temporary one
223 dest=&localDest;
224 }
225 *dest=left;
226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
227 if(U_SUCCESS(errorCode)) {
228 if(options&UNORM_UNICODE_3_2) {
229 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
230 append(*dest, right, errorCode);
231 } else {
232 n2->append(*dest, right, errorCode);
233 }
234 }
235 if(dest==&localDest && U_SUCCESS(errorCode)) {
236 result=*dest;
237 }
238 }
239 return result;
240 }
241
242 //-------------------------------------------------------------------------
243 // Iteration API
244 //-------------------------------------------------------------------------
245
246 /**
247 * Return the current character in the normalized text.
248 */
current()249 UChar32 Normalizer::current() {
250 if(bufferPos<buffer.length() || nextNormalize()) {
251 return buffer.char32At(bufferPos);
252 } else {
253 return DONE;
254 }
255 }
256
257 /**
258 * Return the next character in the normalized text and advance
259 * the iteration position by one. If the end
260 * of the text has already been reached, {@link #DONE} is returned.
261 */
next()262 UChar32 Normalizer::next() {
263 if(bufferPos<buffer.length() || nextNormalize()) {
264 UChar32 c=buffer.char32At(bufferPos);
265 bufferPos+=UTF_CHAR_LENGTH(c);
266 return c;
267 } else {
268 return DONE;
269 }
270 }
271
272 /**
273 * Return the previous character in the normalized text and decrement
274 * the iteration position by one. If the beginning
275 * of the text has already been reached, {@link #DONE} is returned.
276 */
previous()277 UChar32 Normalizer::previous() {
278 if(bufferPos>0 || previousNormalize()) {
279 UChar32 c=buffer.char32At(bufferPos-1);
280 bufferPos-=UTF_CHAR_LENGTH(c);
281 return c;
282 } else {
283 return DONE;
284 }
285 }
286
reset()287 void Normalizer::reset() {
288 currentIndex=nextIndex=text->setToStart();
289 clearBuffer();
290 }
291
292 void
setIndexOnly(int32_t index)293 Normalizer::setIndexOnly(int32_t index) {
294 text->setIndex(index); // pins index
295 currentIndex=nextIndex=text->getIndex();
296 clearBuffer();
297 }
298
299 /**
300 * Return the first character in the normalized text. This resets
301 * the <tt>Normalizer's</tt> position to the beginning of the text.
302 */
first()303 UChar32 Normalizer::first() {
304 reset();
305 return next();
306 }
307
308 /**
309 * Return the last character in the normalized text. This resets
310 * the <tt>Normalizer's</tt> position to be just before the
311 * the input text corresponding to that normalized character.
312 */
last()313 UChar32 Normalizer::last() {
314 currentIndex=nextIndex=text->setToEnd();
315 clearBuffer();
316 return previous();
317 }
318
319 /**
320 * Retrieve the current iteration position in the input text that is
321 * being normalized. This method is useful in applications such as
322 * searching, where you need to be able to determine the position in
323 * the input text that corresponds to a given normalized output character.
324 * <p>
325 * <b>Note:</b> This method sets the position in the <em>input</em>, while
326 * {@link #next} and {@link #previous} iterate through characters in the
327 * <em>output</em>. This means that there is not necessarily a one-to-one
328 * correspondence between characters returned by <tt>next</tt> and
329 * <tt>previous</tt> and the indices passed to and returned from
330 * <tt>setIndex</tt> and {@link #getIndex}.
331 *
332 */
getIndex() const333 int32_t Normalizer::getIndex() const {
334 if(bufferPos<buffer.length()) {
335 return currentIndex;
336 } else {
337 return nextIndex;
338 }
339 }
340
341 /**
342 * Retrieve the index of the start of the input text. This is the begin index
343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
344 * over which this <tt>Normalizer</tt> is iterating
345 */
startIndex() const346 int32_t Normalizer::startIndex() const {
347 return text->startIndex();
348 }
349
350 /**
351 * Retrieve the index of the end of the input text. This is the end index
352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
353 * over which this <tt>Normalizer</tt> is iterating
354 */
endIndex() const355 int32_t Normalizer::endIndex() const {
356 return text->endIndex();
357 }
358
359 //-------------------------------------------------------------------------
360 // Property access methods
361 //-------------------------------------------------------------------------
362
363 void
setMode(UNormalizationMode newMode)364 Normalizer::setMode(UNormalizationMode newMode)
365 {
366 fUMode = newMode;
367 init();
368 }
369
370 UNormalizationMode
getUMode() const371 Normalizer::getUMode() const
372 {
373 return fUMode;
374 }
375
376 void
setOption(int32_t option,UBool value)377 Normalizer::setOption(int32_t option,
378 UBool value)
379 {
380 if (value) {
381 fOptions |= option;
382 } else {
383 fOptions &= (~option);
384 }
385 init();
386 }
387
388 UBool
getOption(int32_t option) const389 Normalizer::getOption(int32_t option) const
390 {
391 return (fOptions & option) != 0;
392 }
393
394 /**
395 * Set the input text over which this <tt>Normalizer</tt> will iterate.
396 * The iteration position is set to the beginning of the input text.
397 */
398 void
setText(const UnicodeString & newText,UErrorCode & status)399 Normalizer::setText(const UnicodeString& newText,
400 UErrorCode &status)
401 {
402 if (U_FAILURE(status)) {
403 return;
404 }
405 CharacterIterator *newIter = new StringCharacterIterator(newText);
406 if (newIter == NULL) {
407 status = U_MEMORY_ALLOCATION_ERROR;
408 return;
409 }
410 delete text;
411 text = newIter;
412 reset();
413 }
414
415 /**
416 * Set the input text over which this <tt>Normalizer</tt> will iterate.
417 * The iteration position is set to the beginning of the string.
418 */
419 void
setText(const CharacterIterator & newText,UErrorCode & status)420 Normalizer::setText(const CharacterIterator& newText,
421 UErrorCode &status)
422 {
423 if (U_FAILURE(status)) {
424 return;
425 }
426 CharacterIterator *newIter = newText.clone();
427 if (newIter == NULL) {
428 status = U_MEMORY_ALLOCATION_ERROR;
429 return;
430 }
431 delete text;
432 text = newIter;
433 reset();
434 }
435
436 void
setText(const UChar * newText,int32_t length,UErrorCode & status)437 Normalizer::setText(const UChar* newText,
438 int32_t length,
439 UErrorCode &status)
440 {
441 if (U_FAILURE(status)) {
442 return;
443 }
444 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
445 if (newIter == NULL) {
446 status = U_MEMORY_ALLOCATION_ERROR;
447 return;
448 }
449 delete text;
450 text = newIter;
451 reset();
452 }
453
454 /**
455 * Copies the text under iteration into the UnicodeString referred to by "result".
456 * @param result Receives a copy of the text under iteration.
457 */
458 void
getText(UnicodeString & result)459 Normalizer::getText(UnicodeString& result)
460 {
461 text->getText(result);
462 }
463
464 //-------------------------------------------------------------------------
465 // Private utility methods
466 //-------------------------------------------------------------------------
467
clearBuffer()468 void Normalizer::clearBuffer() {
469 buffer.remove();
470 bufferPos=0;
471 }
472
473 UBool
nextNormalize()474 Normalizer::nextNormalize() {
475 clearBuffer();
476 currentIndex=nextIndex;
477 text->setIndex(nextIndex);
478 if(!text->hasNext()) {
479 return FALSE;
480 }
481 // Skip at least one character so we make progress.
482 UnicodeString segment(text->next32PostInc());
483 while(text->hasNext()) {
484 UChar32 c;
485 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
486 text->move32(-1, CharacterIterator::kCurrent);
487 break;
488 }
489 segment.append(c);
490 }
491 nextIndex=text->getIndex();
492 UErrorCode errorCode=U_ZERO_ERROR;
493 fNorm2->normalize(segment, buffer, errorCode);
494 return U_SUCCESS(errorCode) && !buffer.isEmpty();
495 }
496
497 UBool
previousNormalize()498 Normalizer::previousNormalize() {
499 clearBuffer();
500 nextIndex=currentIndex;
501 text->setIndex(currentIndex);
502 if(!text->hasPrevious()) {
503 return FALSE;
504 }
505 UnicodeString segment;
506 while(text->hasPrevious()) {
507 UChar32 c=text->previous32();
508 segment.insert(0, c);
509 if(fNorm2->hasBoundaryBefore(c)) {
510 break;
511 }
512 }
513 currentIndex=text->getIndex();
514 UErrorCode errorCode=U_ZERO_ERROR;
515 fNorm2->normalize(segment, buffer, errorCode);
516 bufferPos=buffer.length();
517 return U_SUCCESS(errorCode) && !buffer.isEmpty();
518 }
519
520 U_NAMESPACE_END
521
522 #endif /* #if !UCONFIG_NO_NORMALIZATION */
523