• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   07/03/01    aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/uniset.h"
16 #include "unicode/uiter.h"
17 #include "nortrans.h"
18 #include "unormimp.h"
19 #include "ucln_in.h"
20 
21 U_NAMESPACE_BEGIN
22 
// Expands the ICU RTTI implementation macro for NormalizationTransliterator.
// NOTE(review): presumably this supplies the class-ID boilerplate declared in
// nortrans.h; confirm against the macro definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
24 
25 /**
26  * System registration hook.
27  */
28 void NormalizationTransliterator::registerIDs() {
29     UErrorCode errorCode = U_ZERO_ERROR;
30     if(!unorm_haveData(&errorCode)) {
31         return;
32     }
33 
34     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
35                                      _create, integerToken(UNORM_NFC));
36     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
37                                      _create, integerToken(UNORM_NFKC));
38     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
39                                      _create, integerToken(UNORM_NFD));
40     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
41                                      _create, integerToken(UNORM_NFKD));
42     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
43                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
44     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
45                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
46 }
47 
48 /**
49  * Factory methods
50  */
_create(const UnicodeString & ID,Token context)51 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
52                                                      Token context) {
53     return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
54 }
55 
56 /**
57  * Constructs a transliterator.
58  */
NormalizationTransliterator(const UnicodeString & id,UNormalizationMode mode,int32_t opt)59 NormalizationTransliterator::NormalizationTransliterator(
60                                  const UnicodeString& id,
61                                  UNormalizationMode mode, int32_t opt) :
62     Transliterator(id, 0) {
63     fMode = mode;
64     options = opt;
65 }
66 
67 /**
68  * Destructor.
69  */
~NormalizationTransliterator()70 NormalizationTransliterator::~NormalizationTransliterator() {
71 }
72 
73 /**
74  * Copy constructor.
75  */
NormalizationTransliterator(const NormalizationTransliterator & o)76 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
77 Transliterator(o) {
78     fMode = o.fMode;
79     options = o.options;
80 }
81 
82 /**
83  * Assignment operator.
84  */
85 /*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
86     Transliterator::operator=(o);
87     fMode = o.fMode;
88     options = o.options;
89     return *this;
90 }*/
91 
92 /**
93  * Transliterator API.
94  */
clone(void) const95 Transliterator* NormalizationTransliterator::clone(void) const {
96     return new NormalizationTransliterator(*this);
97 }
98 
99 /**
100  * Implements {@link Transliterator#handleTransliterate}.
101  */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const102 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103                                                       UBool isIncremental) const {
104     // start and limit of the input range
105     int32_t start = offsets.start;
106     int32_t limit = offsets.limit;
107     int32_t length, delta;
108 
109     if(start >= limit) {
110         return;
111     }
112 
113     // a C code unit iterator, implemented around the Replaceable
114     UCharIterator iter;
115     uiter_setReplaceable(&iter, &text);
116 
117     // the output string and buffer pointer
118     UnicodeString output;
119     UChar *buffer;
120     UBool neededToNormalize;
121 
122     UErrorCode errorCode;
123 
124     /*
125      * Normalize as short chunks at a time as possible even in
126      * bulk mode, so that styled text is minimally disrupted.
127      * In incremental mode, a chunk that ends with offsets.limit
128      * must not be normalized.
129      *
130      * If it was known that the input text is not styled, then
131      * a bulk mode normalization could look like this:
132      *
133 
134     UChar staticChars[256];
135     UnicodeString input;
136 
137     length = limit - start;
138     input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
139 
140     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
141     input.releaseBuffer(length);
142 
143     UErrorCode status = U_ZERO_ERROR;
144     Normalizer::normalize(input, fMode, options, output, status);
145 
146     text.handleReplaceBetween(start, limit, output);
147 
148     int32_t delta = output.length() - length;
149     offsets.contextLimit += delta;
150     offsets.limit += delta;
151     offsets.start = limit + delta;
152 
153      *
154      */
155     while(start < limit) {
156         // set the iterator limits for the remaining input range
157         // this is a moving target because of the replacements in the text object
158         iter.start = iter.index = start;
159         iter.limit = limit;
160 
161         // incrementally normalize a small chunk of the input
162         buffer = output.getBuffer(-1);
163         errorCode = U_ZERO_ERROR;
164         length = unorm_next(&iter, buffer, output.getCapacity(),
165                             fMode, 0,
166                             TRUE, &neededToNormalize,
167                             &errorCode);
168         output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
169 
170         if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
171             // use a larger output string buffer and do it again from the start
172             iter.index = start;
173             buffer = output.getBuffer(length);
174             errorCode = U_ZERO_ERROR;
175             length = unorm_next(&iter, buffer, output.getCapacity(),
176                                 fMode, 0,
177                                 TRUE, &neededToNormalize,
178                                 &errorCode);
179             output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
180         }
181 
182         if(U_FAILURE(errorCode)) {
183             break;
184         }
185 
186         limit = iter.index;
187         if(isIncremental && limit == iter.limit) {
188             // stop in incremental mode when we reach the input limit
189             // in case there are additional characters that could change the
190             // normalization result
191 
192             // UNLESS all characters in the result of the normalization of
193             // the last run are in the skippable set
194             const UChar *s=output.getBuffer();
195             int32_t i=0, outLength=output.length();
196             UChar32 c;
197 
198             while(i<outLength) {
199                 U16_NEXT(s, i, outLength, c);
200                 if(!unorm_isNFSkippable(c, fMode)) {
201                     outLength=-1; // I wish C++ had labeled loops and break outer; ...
202                     break;
203                 }
204             }
205             if (outLength<0) {
206                 break;
207             }
208         }
209 
210         if(neededToNormalize) {
211             // replace the input chunk with its normalized form
212             text.handleReplaceBetween(start, limit, output);
213 
214             // update all necessary indexes accordingly
215             delta = length - (limit - start);   // length change in the text object
216             start = limit += delta;             // the next chunk starts where this one ends, with adjustment
217             limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
218             offsets.contextLimit += delta;
219         } else {
220             // delta == 0
221             start = limit;
222             limit = offsets.limit;
223         }
224     }
225 
226     offsets.start = start;
227 }
228 
229 U_NAMESPACE_END
230 
231 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
232