1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // norms.cpp
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_NORMALIZATION
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include "unicode/errorcode.h"
15 #include "unicode/umutablecptrie.h"
16 #include "unicode/unistr.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
19 #include "norms.h"
20 #include "toolutil.h"
21 #include "uvectr32.h"
22
23 U_NAMESPACE_BEGIN
24
append(UChar32 c,uint8_t cc)25 void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
26 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
27 if(cc==0) {
28 fLastStarterIndex=fLength;
29 }
30 fArray[fLength++]=(c<<8)|cc;
31 return;
32 }
33 // Let this character bubble back to its canonical order.
34 int32_t i=fLength-1;
35 while(i>fLastStarterIndex && ccAt(i)>cc) {
36 --i;
37 }
38 ++i; // after the last starter or prevCC<=cc
39 // Move this and the following characters forward one to make space.
40 for(int32_t j=fLength; i<j; --j) {
41 fArray[j]=fArray[j-1];
42 }
43 fArray[i]=(c<<8)|cc;
44 ++fLength;
45 fDidReorder=true;
46 }
47
toString(UnicodeString & dest) const48 void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
49 dest.remove();
50 for(int32_t i=0; i<fLength; ++i) {
51 dest.append(charAt(i));
52 }
53 }
54
combine(UChar32 trail) const55 UChar32 Norm::combine(UChar32 trail) const {
56 int32_t length;
57 const CompositionPair *pairs=getCompositionPairs(length);
58 for(int32_t i=0; i<length; ++i) {
59 if(trail==pairs[i].trail) {
60 return pairs[i].composite;
61 }
62 if(trail<pairs[i].trail) {
63 break;
64 }
65 }
66 return U_SENTINEL;
67 }
68
Norms(UErrorCode & errorCode)69 Norms::Norms(UErrorCode &errorCode) {
70 normTrie = umutablecptrie_open(0, 0, &errorCode);
71 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
72 // Default "inert" Norm struct at index 0. Practically immutable.
73 norms=allocNorm();
74 norms->type=Norm::INERT;
75 }
76
~Norms()77 Norms::~Norms() {
78 umutablecptrie_close(normTrie);
79 int32_t normsLength=utm_countItems(normMem);
80 for(int32_t i=1; i<normsLength; ++i) {
81 delete norms[i].mapping;
82 delete norms[i].rawMapping;
83 delete norms[i].compositions;
84 }
85 utm_close(normMem);
86 }
87
allocNorm()88 Norm *Norms::allocNorm() {
89 Norm* p = static_cast<Norm*>(utm_alloc(normMem));
90 norms = static_cast<Norm*>(utm_getStart(normMem)); // in case it got reallocated
91 return p;
92 }
93
getNorm(UChar32 c)94 Norm *Norms::getNorm(UChar32 c) {
95 uint32_t i = umutablecptrie_get(normTrie, c);
96 if(i==0) {
97 return nullptr;
98 }
99 return norms+i;
100 }
101
getNorm(UChar32 c) const102 const Norm *Norms::getNorm(UChar32 c) const {
103 uint32_t i = umutablecptrie_get(normTrie, c);
104 if(i==0) {
105 return nullptr;
106 }
107 return norms+i;
108 }
109
getNormRef(UChar32 c) const110 const Norm &Norms::getNormRef(UChar32 c) const {
111 return norms[umutablecptrie_get(normTrie, c)];
112 }
113
createNorm(UChar32 c)114 Norm *Norms::createNorm(UChar32 c) {
115 uint32_t i=umutablecptrie_get(normTrie, c);
116 if(i!=0) {
117 return norms+i;
118 } else {
119 /* allocate Norm */
120 Norm *p=allocNorm();
121 IcuToolErrorCode errorCode("gennorm2/createNorm()");
122 umutablecptrie_set(normTrie, c, static_cast<uint32_t>(p - norms), errorCode);
123 return p;
124 }
125 }
126
reorder(UnicodeString & mapping,BuilderReorderingBuffer & buffer) const127 void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
128 int32_t length=mapping.length();
129 U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
130 const char16_t *s=mapping.getBuffer();
131 int32_t i=0;
132 UChar32 c;
133 while(i<length) {
134 U16_NEXT(s, i, length, c);
135 buffer.append(c, getCC(c));
136 }
137 if(buffer.didReorder()) {
138 buffer.toString(mapping);
139 }
140 }
141
combinesWithCCBetween(const Norm & norm,uint8_t lowCC,int32_t highCC) const142 UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
143 if((highCC-lowCC)>=2) {
144 int32_t length;
145 const CompositionPair *pairs=norm.getCompositionPairs(length);
146 for(int32_t i=0; i<length; ++i) {
147 uint8_t trailCC=getCC(pairs[i].trail);
148 if(lowCC<trailCC && trailCC<highCC) {
149 return true;
150 }
151 }
152 }
153 return false;
154 }
155
enumRanges(Enumerator & e)156 void Norms::enumRanges(Enumerator &e) {
157 UChar32 start = 0, end;
158 uint32_t i;
159 while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0,
160 nullptr, nullptr, &i)) >= 0) {
161 if (i > 0) {
162 e.rangeHandler(start, end, norms[i]);
163 }
164 start = end + 1;
165 }
166 }
167
~Enumerator()168 Norms::Enumerator::~Enumerator() {}
169
rangeHandler(UChar32 start,UChar32 end,Norm & norm)170 void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
171 if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
172 if(start!=end) {
173 fprintf(stderr,
174 "gennorm2 error: same round-trip mapping for "
175 "more than 1 code point U+%04lX..U+%04lX\n",
176 static_cast<long>(start), static_cast<long>(end));
177 exit(U_INVALID_FORMAT_ERROR);
178 }
179 if(norm.cc!=0) {
180 fprintf(stderr,
181 "gennorm2 error: "
182 "U+%04lX has a round-trip mapping and ccc!=0, "
183 "not possible in Unicode normalization\n",
184 static_cast<long>(start));
185 exit(U_INVALID_FORMAT_ERROR);
186 }
187 // setRoundTripMapping() ensured that there are exactly two code points.
188 const UnicodeString &m=*norm.mapping;
189 UChar32 lead=m.char32At(0);
190 UChar32 trail=m.char32At(m.length()-1);
191 if(norms.getCC(lead)!=0) {
192 fprintf(stderr,
193 "gennorm2 error: "
194 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
195 "not possible in Unicode normalization\n",
196 static_cast<long>(start), static_cast<long>(lead));
197 exit(U_INVALID_FORMAT_ERROR);
198 }
199 // Flag for trailing character.
200 norms.createNorm(trail)->combinesBack=true;
201 // Insert (trail, composite) pair into compositions list for the lead character.
202 IcuToolErrorCode errorCode("gennorm2/addComposition()");
203 Norm *leadNorm=norms.createNorm(lead);
204 UVector32 *compositions=leadNorm->compositions;
205 int32_t i;
206 if(compositions==nullptr) {
207 compositions=leadNorm->compositions=new UVector32(errorCode);
208 i=0; // "insert" the first pair at index 0
209 } else {
210 // Insertion sort, and check for duplicate trail characters.
211 int32_t length;
212 const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
213 for(i=0; i<length; ++i) {
214 if(trail==pairs[i].trail) {
215 fprintf(stderr,
216 "gennorm2 error: same round-trip mapping for "
217 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
218 static_cast<long>(start), static_cast<long>(lead), static_cast<long>(trail));
219 exit(U_INVALID_FORMAT_ERROR);
220 }
221 if(trail<pairs[i].trail) {
222 break;
223 }
224 }
225 }
226 compositions->insertElementAt(trail, 2*i, errorCode);
227 compositions->insertElementAt(start, 2*i+1, errorCode);
228 }
229
rangeHandler(UChar32 start,UChar32 end,Norm & norm)230 void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
231 if(!norm.hasMapping()) { return; }
232 const UnicodeString &m=*norm.mapping;
233 UnicodeString *decomposed=nullptr;
234 const char16_t *s=toUCharPtr(m.getBuffer());
235 int32_t length=m.length();
236 int32_t prev, i=0;
237 UChar32 c;
238 while(i<length) {
239 prev=i;
240 U16_NEXT(s, i, length, c);
241 if(start<=c && c<=end) {
242 fprintf(stderr,
243 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
244 static_cast<long>(c));
245 exit(U_INVALID_FORMAT_ERROR);
246 }
247 const Norm &cNorm=norms.getNormRef(c);
248 if(norm.mappingType==Norm::ROUND_TRIP && prev==0 &&
249 !norm.combinesBack && cNorm.combinesBack) {
250 // If a two-way mapping starts with an NFC_QC=Maybe character,
251 // then mark the composite as NFC_QC=Maybe as well,
252 // so that we trigger decomposition and recomposition.
253 norm.combinesBack=true;
254 didDecompose|=true;
255 }
256 if(cNorm.hasMapping()) {
257 if(norm.mappingType==Norm::ROUND_TRIP) {
258 if(prev==0) {
259 if(cNorm.mappingType!=Norm::ROUND_TRIP) {
260 fprintf(stderr,
261 "gennorm2 error: "
262 "U+%04lX's round-trip mapping's starter "
263 "U+%04lX one-way-decomposes, "
264 "not possible in Unicode normalization\n",
265 static_cast<long>(start), static_cast<long>(c));
266 exit(U_INVALID_FORMAT_ERROR);
267 }
268 uint8_t myTrailCC=norms.getCC(m.char32At(i));
269 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
270 uint8_t cTrailCC=norms.getCC(cTrailChar);
271 if(cTrailCC>myTrailCC) {
272 fprintf(stderr,
273 "gennorm2 error: "
274 "U+%04lX's round-trip mapping's starter "
275 "U+%04lX decomposes and the "
276 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
277 "not possible in Unicode normalization\n",
278 static_cast<long>(start), static_cast<long>(c),
279 static_cast<short>(cTrailCC), static_cast<short>(myTrailCC));
280 exit(U_INVALID_FORMAT_ERROR);
281 }
282 } else {
283 fprintf(stderr,
284 "gennorm2 error: "
285 "U+%04lX's round-trip mapping's non-starter "
286 "U+%04lX decomposes, "
287 "not possible in Unicode normalization\n",
288 static_cast<long>(start), static_cast<long>(c));
289 exit(U_INVALID_FORMAT_ERROR);
290 }
291 }
292 if(decomposed==nullptr) {
293 decomposed=new UnicodeString(m, 0, prev);
294 }
295 decomposed->append(*cNorm.mapping);
296 } else if(Hangul::isHangul(c)) {
297 char16_t buffer[3];
298 int32_t hangulLength=Hangul::decompose(c, buffer);
299 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
300 fprintf(stderr,
301 "gennorm2 error: "
302 "U+%04lX's round-trip mapping's non-starter "
303 "U+%04lX decomposes, "
304 "not possible in Unicode normalization\n",
305 static_cast<long>(start), static_cast<long>(c));
306 exit(U_INVALID_FORMAT_ERROR);
307 }
308 if(decomposed==nullptr) {
309 decomposed=new UnicodeString(m, 0, prev);
310 }
311 decomposed->append(buffer, hangulLength);
312 } else if(decomposed!=nullptr) {
313 decomposed->append(m, prev, i-prev);
314 }
315 }
316 if(decomposed!=nullptr) {
317 if(norm.rawMapping==nullptr) {
318 // Remember the original mapping when decomposing recursively.
319 norm.rawMapping=norm.mapping;
320 } else {
321 delete norm.mapping;
322 }
323 norm.mapping=decomposed;
324 // Not norm.setMappingCP(); because the original mapping
325 // is most likely to be encodable as a delta.
326 didDecompose|=true;
327 }
328 }
329
330 U_NAMESPACE_END
331
332 #endif // #if !UCONFIG_NO_NORMALIZATION
333