• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // norms.cpp
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_NORMALIZATION
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include "unicode/errorcode.h"
15 #include "unicode/unistr.h"
16 #include "unicode/utf16.h"
17 #include "normalizer2impl.h"
18 #include "norms.h"
19 #include "toolutil.h"
20 #include "utrie2.h"
21 #include "uvectr32.h"
22 
23 U_NAMESPACE_BEGIN
24 
append(UChar32 c,uint8_t cc)25 void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
26     if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
27         if(cc==0) {
28             fLastStarterIndex=fLength;
29         }
30         fArray[fLength++]=(c<<8)|cc;
31         return;
32     }
33     // Let this character bubble back to its canonical order.
34     int32_t i=fLength-1;
35     while(i>fLastStarterIndex && ccAt(i)>cc) {
36         --i;
37     }
38     ++i;  // after the last starter or prevCC<=cc
39     // Move this and the following characters forward one to make space.
40     for(int32_t j=fLength; i<j; --j) {
41         fArray[j]=fArray[j-1];
42     }
43     fArray[i]=(c<<8)|cc;
44     ++fLength;
45     fDidReorder=TRUE;
46 }
47 
toString(UnicodeString & dest) const48 void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
49     dest.remove();
50     for(int32_t i=0; i<fLength; ++i) {
51         dest.append(charAt(i));
52     }
53 }
54 
combine(UChar32 trail) const55 UChar32 Norm::combine(UChar32 trail) const {
56     int32_t length;
57     const CompositionPair *pairs=getCompositionPairs(length);
58     for(int32_t i=0; i<length; ++i) {
59         if(trail==pairs[i].trail) {
60             return pairs[i].composite;
61         }
62         if(trail<pairs[i].trail) {
63             break;
64         }
65     }
66     return U_SENTINEL;
67 }
68 
Norms(UErrorCode & errorCode)69 Norms::Norms(UErrorCode &errorCode) {
70     normTrie=utrie2_open(0, 0, &errorCode);
71     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
72     // Default "inert" Norm struct at index 0. Practically immutable.
73     norms=allocNorm();
74     norms->type=Norm::INERT;
75 }
76 
~Norms()77 Norms::~Norms() {
78     utrie2_close(normTrie);
79     int32_t normsLength=utm_countItems(normMem);
80     for(int32_t i=1; i<normsLength; ++i) {
81         delete norms[i].mapping;
82         delete norms[i].rawMapping;
83         delete norms[i].compositions;
84     }
85     utm_close(normMem);
86 }
87 
allocNorm()88 Norm *Norms::allocNorm() {
89     Norm *p=(Norm *)utm_alloc(normMem);
90     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
91     return p;
92 }
93 
getNorm(UChar32 c)94 Norm *Norms::getNorm(UChar32 c) {
95     uint32_t i=utrie2_get32(normTrie, c);
96     if(i==0) {
97         return nullptr;
98     }
99     return norms+i;
100 }
101 
getNorm(UChar32 c) const102 const Norm *Norms::getNorm(UChar32 c) const {
103     uint32_t i=utrie2_get32(normTrie, c);
104     if(i==0) {
105         return nullptr;
106     }
107     return norms+i;
108 }
109 
getNormRef(UChar32 c) const110 const Norm &Norms::getNormRef(UChar32 c) const {
111     return norms[utrie2_get32(normTrie, c)];
112 }
113 
createNorm(UChar32 c)114 Norm *Norms::createNorm(UChar32 c) {
115     uint32_t i=utrie2_get32(normTrie, c);
116     if(i!=0) {
117         return norms+i;
118     } else {
119         /* allocate Norm */
120         Norm *p=allocNorm();
121         IcuToolErrorCode errorCode("gennorm2/createNorm()");
122         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
123         return p;
124     }
125 }
126 
reorder(UnicodeString & mapping,BuilderReorderingBuffer & buffer) const127 void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
128     int32_t length=mapping.length();
129     U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
130     const char16_t *s=mapping.getBuffer();
131     int32_t i=0;
132     UChar32 c;
133     while(i<length) {
134         U16_NEXT(s, i, length, c);
135         buffer.append(c, getCC(c));
136     }
137     if(buffer.didReorder()) {
138         buffer.toString(mapping);
139     }
140 }
141 
combinesWithCCBetween(const Norm & norm,uint8_t lowCC,int32_t highCC) const142 UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
143     if((highCC-lowCC)>=2) {
144         int32_t length;
145         const CompositionPair *pairs=norm.getCompositionPairs(length);
146         for(int32_t i=0; i<length; ++i) {
147             uint8_t trailCC=getCC(pairs[i].trail);
148             if(lowCC<trailCC && trailCC<highCC) {
149                 return TRUE;
150             }
151         }
152     }
153     return FALSE;
154 }
155 
156 U_CDECL_BEGIN
157 
158 static UBool U_CALLCONV
enumRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)159 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
160     return ((Norms::Enumerator *)context)->rangeHandler(start, end, value);
161 }
162 
163 U_CDECL_END
164 
enumRanges(Enumerator & e)165 void Norms::enumRanges(Enumerator &e) {
166     utrie2_enum(normTrie, nullptr, enumRangeHandler, &e);
167 }
168 
~Enumerator()169 Norms::Enumerator::~Enumerator() {}
170 
rangeHandler(UChar32 start,UChar32 end,uint32_t value)171 UBool Norms::Enumerator::rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
172     if(value!=0) {
173         rangeHandler(start, end, norms.getNormRefByIndex(value));
174     }
175     return TRUE;
176 }
177 
rangeHandler(UChar32 start,UChar32 end,Norm & norm)178 void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
179     if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
180     if(start!=end) {
181         fprintf(stderr,
182                 "gennorm2 error: same round-trip mapping for "
183                 "more than 1 code point U+%04lX..U+%04lX\n",
184                 (long)start, (long)end);
185         exit(U_INVALID_FORMAT_ERROR);
186     }
187     if(norm.cc!=0) {
188         fprintf(stderr,
189                 "gennorm2 error: "
190                 "U+%04lX has a round-trip mapping and ccc!=0, "
191                 "not possible in Unicode normalization\n",
192                 (long)start);
193         exit(U_INVALID_FORMAT_ERROR);
194     }
195     // setRoundTripMapping() ensured that there are exactly two code points.
196     const UnicodeString &m=*norm.mapping;
197     UChar32 lead=m.char32At(0);
198     UChar32 trail=m.char32At(m.length()-1);
199     if(norms.getCC(lead)!=0) {
200         fprintf(stderr,
201                 "gennorm2 error: "
202                 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
203                 "not possible in Unicode normalization\n",
204                 (long)start, (long)lead);
205         exit(U_INVALID_FORMAT_ERROR);
206     }
207     // Flag for trailing character.
208     norms.createNorm(trail)->combinesBack=TRUE;
209     // Insert (trail, composite) pair into compositions list for the lead character.
210     IcuToolErrorCode errorCode("gennorm2/addComposition()");
211     Norm *leadNorm=norms.createNorm(lead);
212     UVector32 *compositions=leadNorm->compositions;
213     int32_t i;
214     if(compositions==nullptr) {
215         compositions=leadNorm->compositions=new UVector32(errorCode);
216         i=0;  // "insert" the first pair at index 0
217     } else {
218         // Insertion sort, and check for duplicate trail characters.
219         int32_t length;
220         const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
221         for(i=0; i<length; ++i) {
222             if(trail==pairs[i].trail) {
223                 fprintf(stderr,
224                         "gennorm2 error: same round-trip mapping for "
225                         "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
226                         (long)start, (long)lead, (long)trail);
227                 exit(U_INVALID_FORMAT_ERROR);
228             }
229             if(trail<pairs[i].trail) {
230                 break;
231             }
232         }
233     }
234     compositions->insertElementAt(trail, 2*i, errorCode);
235     compositions->insertElementAt(start, 2*i+1, errorCode);
236 }
237 
rangeHandler(UChar32 start,UChar32 end,Norm & norm)238 void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
239     if(!norm.hasMapping()) { return; }
240     const UnicodeString &m=*norm.mapping;
241     UnicodeString *decomposed=nullptr;
242     const UChar *s=toUCharPtr(m.getBuffer());
243     int32_t length=m.length();
244     int32_t prev, i=0;
245     UChar32 c;
246     while(i<length) {
247         prev=i;
248         U16_NEXT(s, i, length, c);
249         if(start<=c && c<=end) {
250             fprintf(stderr,
251                     "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
252                     (long)c);
253             exit(U_INVALID_FORMAT_ERROR);
254         }
255         const Norm &cNorm=norms.getNormRef(c);
256         if(cNorm.hasMapping()) {
257             if(norm.mappingType==Norm::ROUND_TRIP) {
258                 if(prev==0) {
259                     if(cNorm.mappingType!=Norm::ROUND_TRIP) {
260                         fprintf(stderr,
261                                 "gennorm2 error: "
262                                 "U+%04lX's round-trip mapping's starter "
263                                 "U+%04lX one-way-decomposes, "
264                                 "not possible in Unicode normalization\n",
265                                 (long)start, (long)c);
266                         exit(U_INVALID_FORMAT_ERROR);
267                     }
268                     uint8_t myTrailCC=norms.getCC(m.char32At(i));
269                     UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
270                     uint8_t cTrailCC=norms.getCC(cTrailChar);
271                     if(cTrailCC>myTrailCC) {
272                         fprintf(stderr,
273                                 "gennorm2 error: "
274                                 "U+%04lX's round-trip mapping's starter "
275                                 "U+%04lX decomposes and the "
276                                 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
277                                 "not possible in Unicode normalization\n",
278                                 (long)start, (long)c,
279                                 (short)cTrailCC, (short)myTrailCC);
280                         exit(U_INVALID_FORMAT_ERROR);
281                     }
282                 } else {
283                     fprintf(stderr,
284                             "gennorm2 error: "
285                             "U+%04lX's round-trip mapping's non-starter "
286                             "U+%04lX decomposes, "
287                             "not possible in Unicode normalization\n",
288                             (long)start, (long)c);
289                     exit(U_INVALID_FORMAT_ERROR);
290                 }
291             }
292             if(decomposed==nullptr) {
293                 decomposed=new UnicodeString(m, 0, prev);
294             }
295             decomposed->append(*cNorm.mapping);
296         } else if(Hangul::isHangul(c)) {
297             UChar buffer[3];
298             int32_t hangulLength=Hangul::decompose(c, buffer);
299             if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
300                 fprintf(stderr,
301                         "gennorm2 error: "
302                         "U+%04lX's round-trip mapping's non-starter "
303                         "U+%04lX decomposes, "
304                         "not possible in Unicode normalization\n",
305                         (long)start, (long)c);
306                 exit(U_INVALID_FORMAT_ERROR);
307             }
308             if(decomposed==nullptr) {
309                 decomposed=new UnicodeString(m, 0, prev);
310             }
311             decomposed->append(buffer, hangulLength);
312         } else if(decomposed!=nullptr) {
313             decomposed->append(m, prev, i-prev);
314         }
315     }
316     if(decomposed!=nullptr) {
317         if(norm.rawMapping==nullptr) {
318             // Remember the original mapping when decomposing recursively.
319             norm.rawMapping=norm.mapping;
320         } else {
321             delete norm.mapping;
322         }
323         norm.mapping=decomposed;
324         // Not  norm.setMappingCP();  because the original mapping
325         // is most likely to be encodable as a delta.
326         didDecompose|=TRUE;
327     }
328 }
329 
330 U_NAMESPACE_END
331 
332 #endif // #if !UCONFIG_NO_NORMALIZATION
333