1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *************************************************************************
5 * COPYRIGHT:
6 * Copyright (c) 1996-2012, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *************************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_NORMALIZATION
14
15 #include "unicode/uniset.h"
16 #include "unicode/unistr.h"
17 #include "unicode/chariter.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/utf16.h"
22 #include "cmemory.h"
23 #include "normalizer2impl.h"
24 #include "uprops.h" // for uniset_getUnicode32Instance()
25
26 #if defined(move32)
27 // System can define move32 intrinsics, but the char iters define move32 method
28 // using same undef trick in headers, so undef here to re-enable the method.
29 #undef move32
30 #endif
31
32 U_NAMESPACE_BEGIN
33
// Expands the UObject RTTI implementation macro for Normalizer.
// NOTE(review): presumably this supplies the class-ID accessors declared in
// the class header -- confirm against the macro definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
35
36 //-------------------------------------------------------------------------
37 // Constructors and other boilerplate
38 //-------------------------------------------------------------------------
39
40 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
41 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
42 text(new StringCharacterIterator(str)),
43 currentIndex(0), nextIndex(0),
44 buffer(), bufferPos(0)
45 {
46 init();
47 }
48
Normalizer(ConstChar16Ptr str,int32_t length,UNormalizationMode mode)49 Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
50 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
51 text(new UCharCharacterIterator(str, length)),
52 currentIndex(0), nextIndex(0),
53 buffer(), bufferPos(0)
54 {
55 init();
56 }
57
Normalizer(const CharacterIterator & iter,UNormalizationMode mode)58 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
59 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
60 text(iter.clone()),
61 currentIndex(0), nextIndex(0),
62 buffer(), bufferPos(0)
63 {
64 init();
65 }
66
/**
 * Copy constructor.
 *
 * Copies the mode, options, both input indexes, the normalized buffer and the
 * buffer position, and clones the source's text iterator so that the two
 * objects iterate independently. The Normalizer2 pointers are not copied;
 * init() derives them again from the copied mode and options.
 *
 * @param copy  the Normalizer to duplicate
 */
Normalizer::Normalizer(const Normalizer &copy) :
    UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
    text(copy.text->clone()),
    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
    buffer(copy.buffer), bufferPos(copy.bufferPos)
{
    init();
}
75
76 void
init()77 Normalizer::init() {
78 UErrorCode errorCode=U_ZERO_ERROR;
79 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
80 if(fOptions&UNORM_UNICODE_3_2) {
81 delete fFilteredNorm2;
82 fNorm2=fFilteredNorm2=
83 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
84 }
85 if(U_FAILURE(errorCode)) {
86 errorCode=U_ZERO_ERROR;
87 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
88 }
89 }
90
~Normalizer()91 Normalizer::~Normalizer()
92 {
93 delete fFilteredNorm2;
94 delete text;
95 }
96
97 Normalizer*
clone() const98 Normalizer::clone() const
99 {
100 return new Normalizer(*this);
101 }
102
103 /**
104 * Generates a hash code for this iterator.
105 */
hashCode() const106 int32_t Normalizer::hashCode() const
107 {
108 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
109 }
110
operator ==(const Normalizer & that) const111 UBool Normalizer::operator==(const Normalizer& that) const
112 {
113 return
114 this==&that ||
115 (fUMode==that.fUMode &&
116 fOptions==that.fOptions &&
117 *text==*that.text &&
118 buffer==that.buffer &&
119 bufferPos==that.bufferPos &&
120 nextIndex==that.nextIndex);
121 }
122
123 //-------------------------------------------------------------------------
124 // Static utility methods
125 //-------------------------------------------------------------------------
126
127 void U_EXPORT2
normalize(const UnicodeString & source,UNormalizationMode mode,int32_t options,UnicodeString & result,UErrorCode & status)128 Normalizer::normalize(const UnicodeString& source,
129 UNormalizationMode mode, int32_t options,
130 UnicodeString& result,
131 UErrorCode &status) {
132 if(source.isBogus() || U_FAILURE(status)) {
133 result.setToBogus();
134 if(U_SUCCESS(status)) {
135 status=U_ILLEGAL_ARGUMENT_ERROR;
136 }
137 } else {
138 UnicodeString localDest;
139 UnicodeString *dest;
140
141 if(&source!=&result) {
142 dest=&result;
143 } else {
144 // the source and result strings are the same object, use a temporary one
145 dest=&localDest;
146 }
147 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
148 if(U_SUCCESS(status)) {
149 if(options&UNORM_UNICODE_3_2) {
150 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
151 normalize(source, *dest, status);
152 } else {
153 n2->normalize(source, *dest, status);
154 }
155 }
156 if(dest==&localDest && U_SUCCESS(status)) {
157 result=*dest;
158 }
159 }
160 }
161
162 void U_EXPORT2
compose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)163 Normalizer::compose(const UnicodeString& source,
164 UBool compat, int32_t options,
165 UnicodeString& result,
166 UErrorCode &status) {
167 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
168 }
169
170 void U_EXPORT2
decompose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)171 Normalizer::decompose(const UnicodeString& source,
172 UBool compat, int32_t options,
173 UnicodeString& result,
174 UErrorCode &status) {
175 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
176 }
177
178 UNormalizationCheckResult
quickCheck(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)179 Normalizer::quickCheck(const UnicodeString& source,
180 UNormalizationMode mode, int32_t options,
181 UErrorCode &status) {
182 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
183 if(U_SUCCESS(status)) {
184 if(options&UNORM_UNICODE_3_2) {
185 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
186 quickCheck(source, status);
187 } else {
188 return n2->quickCheck(source, status);
189 }
190 } else {
191 return UNORM_MAYBE;
192 }
193 }
194
195 UBool
isNormalized(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)196 Normalizer::isNormalized(const UnicodeString& source,
197 UNormalizationMode mode, int32_t options,
198 UErrorCode &status) {
199 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
200 if(U_SUCCESS(status)) {
201 if(options&UNORM_UNICODE_3_2) {
202 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
203 isNormalized(source, status);
204 } else {
205 return n2->isNormalized(source, status);
206 }
207 } else {
208 return FALSE;
209 }
210 }
211
212 UnicodeString & U_EXPORT2
concatenate(const UnicodeString & left,const UnicodeString & right,UnicodeString & result,UNormalizationMode mode,int32_t options,UErrorCode & errorCode)213 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
214 UnicodeString &result,
215 UNormalizationMode mode, int32_t options,
216 UErrorCode &errorCode) {
217 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
218 result.setToBogus();
219 if(U_SUCCESS(errorCode)) {
220 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
221 }
222 } else {
223 UnicodeString localDest;
224 UnicodeString *dest;
225
226 if(&right!=&result) {
227 dest=&result;
228 } else {
229 // the right and result strings are the same object, use a temporary one
230 dest=&localDest;
231 }
232 *dest=left;
233 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
234 if(U_SUCCESS(errorCode)) {
235 if(options&UNORM_UNICODE_3_2) {
236 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
237 append(*dest, right, errorCode);
238 } else {
239 n2->append(*dest, right, errorCode);
240 }
241 }
242 if(dest==&localDest && U_SUCCESS(errorCode)) {
243 result=*dest;
244 }
245 }
246 return result;
247 }
248
249 //-------------------------------------------------------------------------
250 // Iteration API
251 //-------------------------------------------------------------------------
252
253 /**
254 * Return the current character in the normalized text.
255 */
current()256 UChar32 Normalizer::current() {
257 if(bufferPos<buffer.length() || nextNormalize()) {
258 return buffer.char32At(bufferPos);
259 } else {
260 return DONE;
261 }
262 }
263
264 /**
265 * Return the next character in the normalized text and advance
266 * the iteration position by one. If the end
267 * of the text has already been reached, {@link #DONE} is returned.
268 */
next()269 UChar32 Normalizer::next() {
270 if(bufferPos<buffer.length() || nextNormalize()) {
271 UChar32 c=buffer.char32At(bufferPos);
272 bufferPos+=U16_LENGTH(c);
273 return c;
274 } else {
275 return DONE;
276 }
277 }
278
279 /**
280 * Return the previous character in the normalized text and decrement
281 * the iteration position by one. If the beginning
282 * of the text has already been reached, {@link #DONE} is returned.
283 */
previous()284 UChar32 Normalizer::previous() {
285 if(bufferPos>0 || previousNormalize()) {
286 UChar32 c=buffer.char32At(bufferPos-1);
287 bufferPos-=U16_LENGTH(c);
288 return c;
289 } else {
290 return DONE;
291 }
292 }
293
reset()294 void Normalizer::reset() {
295 currentIndex=nextIndex=text->setToStart();
296 clearBuffer();
297 }
298
299 void
setIndexOnly(int32_t index)300 Normalizer::setIndexOnly(int32_t index) {
301 text->setIndex(index); // pins index
302 currentIndex=nextIndex=text->getIndex();
303 clearBuffer();
304 }
305
306 /**
307 * Return the first character in the normalized text. This resets
308 * the <tt>Normalizer's</tt> position to the beginning of the text.
309 */
first()310 UChar32 Normalizer::first() {
311 reset();
312 return next();
313 }
314
315 /**
316 * Return the last character in the normalized text. This resets
317 * the <tt>Normalizer's</tt> position to be just before the
318 * the input text corresponding to that normalized character.
319 */
last()320 UChar32 Normalizer::last() {
321 currentIndex=nextIndex=text->setToEnd();
322 clearBuffer();
323 return previous();
324 }
325
326 /**
327 * Retrieve the current iteration position in the input text that is
328 * being normalized. This method is useful in applications such as
329 * searching, where you need to be able to determine the position in
330 * the input text that corresponds to a given normalized output character.
331 * <p>
332 * <b>Note:</b> This method sets the position in the <em>input</em>, while
333 * {@link #next} and {@link #previous} iterate through characters in the
334 * <em>output</em>. This means that there is not necessarily a one-to-one
335 * correspondence between characters returned by <tt>next</tt> and
336 * <tt>previous</tt> and the indices passed to and returned from
337 * <tt>setIndex</tt> and {@link #getIndex}.
338 *
339 */
getIndex() const340 int32_t Normalizer::getIndex() const {
341 if(bufferPos<buffer.length()) {
342 return currentIndex;
343 } else {
344 return nextIndex;
345 }
346 }
347
348 /**
349 * Retrieve the index of the start of the input text. This is the begin index
350 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
351 * over which this <tt>Normalizer</tt> is iterating
352 */
startIndex() const353 int32_t Normalizer::startIndex() const {
354 return text->startIndex();
355 }
356
357 /**
358 * Retrieve the index of the end of the input text. This is the end index
359 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
360 * over which this <tt>Normalizer</tt> is iterating
361 */
endIndex() const362 int32_t Normalizer::endIndex() const {
363 return text->endIndex();
364 }
365
366 //-------------------------------------------------------------------------
367 // Property access methods
368 //-------------------------------------------------------------------------
369
370 void
setMode(UNormalizationMode newMode)371 Normalizer::setMode(UNormalizationMode newMode)
372 {
373 fUMode = newMode;
374 init();
375 }
376
377 UNormalizationMode
getUMode() const378 Normalizer::getUMode() const
379 {
380 return fUMode;
381 }
382
383 void
setOption(int32_t option,UBool value)384 Normalizer::setOption(int32_t option,
385 UBool value)
386 {
387 if (value) {
388 fOptions |= option;
389 } else {
390 fOptions &= (~option);
391 }
392 init();
393 }
394
395 UBool
getOption(int32_t option) const396 Normalizer::getOption(int32_t option) const
397 {
398 return (fOptions & option) != 0;
399 }
400
401 /**
402 * Set the input text over which this <tt>Normalizer</tt> will iterate.
403 * The iteration position is set to the beginning of the input text.
404 */
405 void
setText(const UnicodeString & newText,UErrorCode & status)406 Normalizer::setText(const UnicodeString& newText,
407 UErrorCode &status)
408 {
409 if (U_FAILURE(status)) {
410 return;
411 }
412 CharacterIterator *newIter = new StringCharacterIterator(newText);
413 if (newIter == NULL) {
414 status = U_MEMORY_ALLOCATION_ERROR;
415 return;
416 }
417 delete text;
418 text = newIter;
419 reset();
420 }
421
422 /**
423 * Set the input text over which this <tt>Normalizer</tt> will iterate.
424 * The iteration position is set to the beginning of the string.
425 */
426 void
setText(const CharacterIterator & newText,UErrorCode & status)427 Normalizer::setText(const CharacterIterator& newText,
428 UErrorCode &status)
429 {
430 if (U_FAILURE(status)) {
431 return;
432 }
433 CharacterIterator *newIter = newText.clone();
434 if (newIter == NULL) {
435 status = U_MEMORY_ALLOCATION_ERROR;
436 return;
437 }
438 delete text;
439 text = newIter;
440 reset();
441 }
442
443 void
setText(ConstChar16Ptr newText,int32_t length,UErrorCode & status)444 Normalizer::setText(ConstChar16Ptr newText,
445 int32_t length,
446 UErrorCode &status)
447 {
448 if (U_FAILURE(status)) {
449 return;
450 }
451 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
452 if (newIter == NULL) {
453 status = U_MEMORY_ALLOCATION_ERROR;
454 return;
455 }
456 delete text;
457 text = newIter;
458 reset();
459 }
460
461 /**
462 * Copies the text under iteration into the UnicodeString referred to by "result".
463 * @param result Receives a copy of the text under iteration.
464 */
465 void
getText(UnicodeString & result)466 Normalizer::getText(UnicodeString& result)
467 {
468 text->getText(result);
469 }
470
471 //-------------------------------------------------------------------------
472 // Private utility methods
473 //-------------------------------------------------------------------------
474
clearBuffer()475 void Normalizer::clearBuffer() {
476 buffer.remove();
477 bufferPos=0;
478 }
479
480 UBool
nextNormalize()481 Normalizer::nextNormalize() {
482 clearBuffer();
483 currentIndex=nextIndex;
484 text->setIndex(nextIndex);
485 if(!text->hasNext()) {
486 return FALSE;
487 }
488 // Skip at least one character so we make progress.
489 UnicodeString segment(text->next32PostInc());
490 while(text->hasNext()) {
491 UChar32 c;
492 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
493 text->move32(-1, CharacterIterator::kCurrent);
494 break;
495 }
496 segment.append(c);
497 }
498 nextIndex=text->getIndex();
499 UErrorCode errorCode=U_ZERO_ERROR;
500 fNorm2->normalize(segment, buffer, errorCode);
501 return U_SUCCESS(errorCode) && !buffer.isEmpty();
502 }
503
504 UBool
previousNormalize()505 Normalizer::previousNormalize() {
506 clearBuffer();
507 nextIndex=currentIndex;
508 text->setIndex(currentIndex);
509 if(!text->hasPrevious()) {
510 return FALSE;
511 }
512 UnicodeString segment;
513 while(text->hasPrevious()) {
514 UChar32 c=text->previous32();
515 segment.insert(0, c);
516 if(fNorm2->hasBoundaryBefore(c)) {
517 break;
518 }
519 }
520 currentIndex=text->getIndex();
521 UErrorCode errorCode=U_ZERO_ERROR;
522 fNorm2->normalize(segment, buffer, errorCode);
523 bufferPos=buffer.length();
524 return U_SUCCESS(errorCode) && !buffer.isEmpty();
525 }
526
527 U_NAMESPACE_END
528
529 #endif /* #if !UCONFIG_NO_NORMALIZATION */
530