1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *************************************************************************
5 * COPYRIGHT:
6 * Copyright (c) 1996-2012, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *************************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_NORMALIZATION
14
15 #include "unicode/uniset.h"
16 #include "unicode/unistr.h"
17 #include "unicode/chariter.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/utf16.h"
22 #include "cmemory.h"
23 #include "normalizer2impl.h"
24 #include "uprops.h" // for uniset_getUnicode32Instance()
25
26 U_NAMESPACE_BEGIN
27
// Emits the RTTI implementation boilerplate for Normalizer. The macro is
// defined outside this file; NOTE(review): presumably it provides the class-ID
// accessors declared in the header -- confirm against the macro definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
29
30 //-------------------------------------------------------------------------
31 // Constructors and other boilerplate
32 //-------------------------------------------------------------------------
33
34 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
35 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
36 text(new StringCharacterIterator(str)),
37 currentIndex(0), nextIndex(0),
38 buffer(), bufferPos(0)
39 {
40 init();
41 }
42
Normalizer(const UChar * str,int32_t length,UNormalizationMode mode)43 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
44 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
45 text(new UCharCharacterIterator(str, length)),
46 currentIndex(0), nextIndex(0),
47 buffer(), bufferPos(0)
48 {
49 init();
50 }
51
Normalizer(const CharacterIterator & iter,UNormalizationMode mode)52 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
53 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
54 text(iter.clone()),
55 currentIndex(0), nextIndex(0),
56 buffer(), bufferPos(0)
57 {
58 init();
59 }
60
/**
 * Copy constructor.
 * Copies the mode, options, indices, output buffer and buffer position, and
 * clones the source's text iterator. The Normalizer2 pointers are not copied:
 * they start as NULL and init() sets them up again from the copied mode and
 * options, so each Normalizer owns its own filtered instance.
 *
 * @param copy the Normalizer to duplicate
 */
Normalizer::Normalizer(const Normalizer &copy)
    : UObject(copy),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(copy.fUMode),
      fOptions(copy.fOptions),
      text(copy.text->clone()),
      currentIndex(copy.currentIndex),
      nextIndex(copy.nextIndex),
      buffer(copy.buffer),
      bufferPos(copy.bufferPos)
{
    init();
}
69
70 void
init()71 Normalizer::init() {
72 UErrorCode errorCode=U_ZERO_ERROR;
73 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
74 if(fOptions&UNORM_UNICODE_3_2) {
75 delete fFilteredNorm2;
76 fNorm2=fFilteredNorm2=
77 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
78 }
79 if(U_FAILURE(errorCode)) {
80 errorCode=U_ZERO_ERROR;
81 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
82 }
83 }
84
~Normalizer()85 Normalizer::~Normalizer()
86 {
87 delete fFilteredNorm2;
88 delete text;
89 }
90
91 Normalizer*
clone() const92 Normalizer::clone() const
93 {
94 return new Normalizer(*this);
95 }
96
97 /**
98 * Generates a hash code for this iterator.
99 */
hashCode() const100 int32_t Normalizer::hashCode() const
101 {
102 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
103 }
104
operator ==(const Normalizer & that) const105 UBool Normalizer::operator==(const Normalizer& that) const
106 {
107 return
108 this==&that ||
109 (fUMode==that.fUMode &&
110 fOptions==that.fOptions &&
111 *text==*that.text &&
112 buffer==that.buffer &&
113 bufferPos==that.bufferPos &&
114 nextIndex==that.nextIndex);
115 }
116
117 //-------------------------------------------------------------------------
118 // Static utility methods
119 //-------------------------------------------------------------------------
120
121 void U_EXPORT2
normalize(const UnicodeString & source,UNormalizationMode mode,int32_t options,UnicodeString & result,UErrorCode & status)122 Normalizer::normalize(const UnicodeString& source,
123 UNormalizationMode mode, int32_t options,
124 UnicodeString& result,
125 UErrorCode &status) {
126 if(source.isBogus() || U_FAILURE(status)) {
127 result.setToBogus();
128 if(U_SUCCESS(status)) {
129 status=U_ILLEGAL_ARGUMENT_ERROR;
130 }
131 } else {
132 UnicodeString localDest;
133 UnicodeString *dest;
134
135 if(&source!=&result) {
136 dest=&result;
137 } else {
138 // the source and result strings are the same object, use a temporary one
139 dest=&localDest;
140 }
141 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
142 if(U_SUCCESS(status)) {
143 if(options&UNORM_UNICODE_3_2) {
144 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
145 normalize(source, *dest, status);
146 } else {
147 n2->normalize(source, *dest, status);
148 }
149 }
150 if(dest==&localDest && U_SUCCESS(status)) {
151 result=*dest;
152 }
153 }
154 }
155
156 void U_EXPORT2
compose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)157 Normalizer::compose(const UnicodeString& source,
158 UBool compat, int32_t options,
159 UnicodeString& result,
160 UErrorCode &status) {
161 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
162 }
163
164 void U_EXPORT2
decompose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)165 Normalizer::decompose(const UnicodeString& source,
166 UBool compat, int32_t options,
167 UnicodeString& result,
168 UErrorCode &status) {
169 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
170 }
171
172 UNormalizationCheckResult
quickCheck(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)173 Normalizer::quickCheck(const UnicodeString& source,
174 UNormalizationMode mode, int32_t options,
175 UErrorCode &status) {
176 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
177 if(U_SUCCESS(status)) {
178 if(options&UNORM_UNICODE_3_2) {
179 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
180 quickCheck(source, status);
181 } else {
182 return n2->quickCheck(source, status);
183 }
184 } else {
185 return UNORM_MAYBE;
186 }
187 }
188
189 UBool
isNormalized(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)190 Normalizer::isNormalized(const UnicodeString& source,
191 UNormalizationMode mode, int32_t options,
192 UErrorCode &status) {
193 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
194 if(U_SUCCESS(status)) {
195 if(options&UNORM_UNICODE_3_2) {
196 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
197 isNormalized(source, status);
198 } else {
199 return n2->isNormalized(source, status);
200 }
201 } else {
202 return FALSE;
203 }
204 }
205
206 UnicodeString & U_EXPORT2
concatenate(const UnicodeString & left,const UnicodeString & right,UnicodeString & result,UNormalizationMode mode,int32_t options,UErrorCode & errorCode)207 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
208 UnicodeString &result,
209 UNormalizationMode mode, int32_t options,
210 UErrorCode &errorCode) {
211 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
212 result.setToBogus();
213 if(U_SUCCESS(errorCode)) {
214 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
215 }
216 } else {
217 UnicodeString localDest;
218 UnicodeString *dest;
219
220 if(&right!=&result) {
221 dest=&result;
222 } else {
223 // the right and result strings are the same object, use a temporary one
224 dest=&localDest;
225 }
226 *dest=left;
227 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
228 if(U_SUCCESS(errorCode)) {
229 if(options&UNORM_UNICODE_3_2) {
230 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
231 append(*dest, right, errorCode);
232 } else {
233 n2->append(*dest, right, errorCode);
234 }
235 }
236 if(dest==&localDest && U_SUCCESS(errorCode)) {
237 result=*dest;
238 }
239 }
240 return result;
241 }
242
243 //-------------------------------------------------------------------------
244 // Iteration API
245 //-------------------------------------------------------------------------
246
247 /**
248 * Return the current character in the normalized text.
249 */
current()250 UChar32 Normalizer::current() {
251 if(bufferPos<buffer.length() || nextNormalize()) {
252 return buffer.char32At(bufferPos);
253 } else {
254 return DONE;
255 }
256 }
257
258 /**
259 * Return the next character in the normalized text and advance
260 * the iteration position by one. If the end
261 * of the text has already been reached, {@link #DONE} is returned.
262 */
next()263 UChar32 Normalizer::next() {
264 if(bufferPos<buffer.length() || nextNormalize()) {
265 UChar32 c=buffer.char32At(bufferPos);
266 bufferPos+=U16_LENGTH(c);
267 return c;
268 } else {
269 return DONE;
270 }
271 }
272
273 /**
274 * Return the previous character in the normalized text and decrement
275 * the iteration position by one. If the beginning
276 * of the text has already been reached, {@link #DONE} is returned.
277 */
previous()278 UChar32 Normalizer::previous() {
279 if(bufferPos>0 || previousNormalize()) {
280 UChar32 c=buffer.char32At(bufferPos-1);
281 bufferPos-=U16_LENGTH(c);
282 return c;
283 } else {
284 return DONE;
285 }
286 }
287
reset()288 void Normalizer::reset() {
289 currentIndex=nextIndex=text->setToStart();
290 clearBuffer();
291 }
292
293 void
setIndexOnly(int32_t index)294 Normalizer::setIndexOnly(int32_t index) {
295 text->setIndex(index); // pins index
296 currentIndex=nextIndex=text->getIndex();
297 clearBuffer();
298 }
299
300 /**
301 * Return the first character in the normalized text. This resets
302 * the <tt>Normalizer's</tt> position to the beginning of the text.
303 */
first()304 UChar32 Normalizer::first() {
305 reset();
306 return next();
307 }
308
309 /**
310 * Return the last character in the normalized text. This resets
311 * the <tt>Normalizer's</tt> position to be just before the
312 * the input text corresponding to that normalized character.
313 */
last()314 UChar32 Normalizer::last() {
315 currentIndex=nextIndex=text->setToEnd();
316 clearBuffer();
317 return previous();
318 }
319
320 /**
321 * Retrieve the current iteration position in the input text that is
322 * being normalized. This method is useful in applications such as
323 * searching, where you need to be able to determine the position in
324 * the input text that corresponds to a given normalized output character.
325 * <p>
326 * <b>Note:</b> This method sets the position in the <em>input</em>, while
327 * {@link #next} and {@link #previous} iterate through characters in the
328 * <em>output</em>. This means that there is not necessarily a one-to-one
329 * correspondence between characters returned by <tt>next</tt> and
330 * <tt>previous</tt> and the indices passed to and returned from
331 * <tt>setIndex</tt> and {@link #getIndex}.
332 *
333 */
getIndex() const334 int32_t Normalizer::getIndex() const {
335 if(bufferPos<buffer.length()) {
336 return currentIndex;
337 } else {
338 return nextIndex;
339 }
340 }
341
342 /**
343 * Retrieve the index of the start of the input text. This is the begin index
344 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
345 * over which this <tt>Normalizer</tt> is iterating
346 */
startIndex() const347 int32_t Normalizer::startIndex() const {
348 return text->startIndex();
349 }
350
351 /**
352 * Retrieve the index of the end of the input text. This is the end index
353 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
354 * over which this <tt>Normalizer</tt> is iterating
355 */
endIndex() const356 int32_t Normalizer::endIndex() const {
357 return text->endIndex();
358 }
359
360 //-------------------------------------------------------------------------
361 // Property access methods
362 //-------------------------------------------------------------------------
363
364 void
setMode(UNormalizationMode newMode)365 Normalizer::setMode(UNormalizationMode newMode)
366 {
367 fUMode = newMode;
368 init();
369 }
370
371 UNormalizationMode
getUMode() const372 Normalizer::getUMode() const
373 {
374 return fUMode;
375 }
376
377 void
setOption(int32_t option,UBool value)378 Normalizer::setOption(int32_t option,
379 UBool value)
380 {
381 if (value) {
382 fOptions |= option;
383 } else {
384 fOptions &= (~option);
385 }
386 init();
387 }
388
389 UBool
getOption(int32_t option) const390 Normalizer::getOption(int32_t option) const
391 {
392 return (fOptions & option) != 0;
393 }
394
395 /**
396 * Set the input text over which this <tt>Normalizer</tt> will iterate.
397 * The iteration position is set to the beginning of the input text.
398 */
399 void
setText(const UnicodeString & newText,UErrorCode & status)400 Normalizer::setText(const UnicodeString& newText,
401 UErrorCode &status)
402 {
403 if (U_FAILURE(status)) {
404 return;
405 }
406 CharacterIterator *newIter = new StringCharacterIterator(newText);
407 if (newIter == NULL) {
408 status = U_MEMORY_ALLOCATION_ERROR;
409 return;
410 }
411 delete text;
412 text = newIter;
413 reset();
414 }
415
416 /**
417 * Set the input text over which this <tt>Normalizer</tt> will iterate.
418 * The iteration position is set to the beginning of the string.
419 */
420 void
setText(const CharacterIterator & newText,UErrorCode & status)421 Normalizer::setText(const CharacterIterator& newText,
422 UErrorCode &status)
423 {
424 if (U_FAILURE(status)) {
425 return;
426 }
427 CharacterIterator *newIter = newText.clone();
428 if (newIter == NULL) {
429 status = U_MEMORY_ALLOCATION_ERROR;
430 return;
431 }
432 delete text;
433 text = newIter;
434 reset();
435 }
436
437 void
setText(const UChar * newText,int32_t length,UErrorCode & status)438 Normalizer::setText(const UChar* newText,
439 int32_t length,
440 UErrorCode &status)
441 {
442 if (U_FAILURE(status)) {
443 return;
444 }
445 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
446 if (newIter == NULL) {
447 status = U_MEMORY_ALLOCATION_ERROR;
448 return;
449 }
450 delete text;
451 text = newIter;
452 reset();
453 }
454
455 /**
456 * Copies the text under iteration into the UnicodeString referred to by "result".
457 * @param result Receives a copy of the text under iteration.
458 */
459 void
getText(UnicodeString & result)460 Normalizer::getText(UnicodeString& result)
461 {
462 text->getText(result);
463 }
464
465 //-------------------------------------------------------------------------
466 // Private utility methods
467 //-------------------------------------------------------------------------
468
clearBuffer()469 void Normalizer::clearBuffer() {
470 buffer.remove();
471 bufferPos=0;
472 }
473
474 UBool
nextNormalize()475 Normalizer::nextNormalize() {
476 clearBuffer();
477 currentIndex=nextIndex;
478 text->setIndex(nextIndex);
479 if(!text->hasNext()) {
480 return FALSE;
481 }
482 // Skip at least one character so we make progress.
483 UnicodeString segment(text->next32PostInc());
484 while(text->hasNext()) {
485 UChar32 c;
486 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
487 text->move32(-1, CharacterIterator::kCurrent);
488 break;
489 }
490 segment.append(c);
491 }
492 nextIndex=text->getIndex();
493 UErrorCode errorCode=U_ZERO_ERROR;
494 fNorm2->normalize(segment, buffer, errorCode);
495 return U_SUCCESS(errorCode) && !buffer.isEmpty();
496 }
497
498 UBool
previousNormalize()499 Normalizer::previousNormalize() {
500 clearBuffer();
501 nextIndex=currentIndex;
502 text->setIndex(currentIndex);
503 if(!text->hasPrevious()) {
504 return FALSE;
505 }
506 UnicodeString segment;
507 while(text->hasPrevious()) {
508 UChar32 c=text->previous32();
509 segment.insert(0, c);
510 if(fNorm2->hasBoundaryBefore(c)) {
511 break;
512 }
513 }
514 currentIndex=text->getIndex();
515 UErrorCode errorCode=U_ZERO_ERROR;
516 fNorm2->normalize(segment, buffer, errorCode);
517 bufferPos=buffer.length();
518 return U_SUCCESS(errorCode) && !buffer.isEmpty();
519 }
520
521 U_NAMESPACE_END
522
523 #endif /* #if !UCONFIG_NO_NORMALIZATION */
524