1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // extradata.cpp
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_NORMALIZATION
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include "unicode/errorcode.h"
15 #include "unicode/unistr.h"
16 #include "unicode/utf16.h"
17 #include "extradata.h"
18 #include "normalizer2impl.h"
19 #include "norms.h"
20 #include "toolutil.h"
21 #include "utrie2.h"
22 #include "uvectr32.h"
23
24 U_NAMESPACE_BEGIN
25
ExtraData(Norms & n,UBool fast)26 ExtraData::ExtraData(Norms &n, UBool fast) :
27 Norms::Enumerator(n),
28 yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
29 yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data
30 yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data
31 optimizeFast(fast) {
32 // Hangul LV algorithmically decomposes to two Jamo.
33 // Some code may harmlessly read this firstUnit.
34 yesNoMappingsAndCompositions.setCharAt(0, 2);
35 // Hangul LVT algorithmically decomposes to three Jamo.
36 // Some code may harmlessly read this firstUnit.
37 yesNoMappingsOnly.setCharAt(0, 3);
38 }
39
writeMapping(UChar32 c,const Norm & norm,UnicodeString & dataString)40 int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
41 UnicodeString &m=*norm.mapping;
42 int32_t length=m.length();
43 // Write the mapping & raw mapping extraData.
44 int32_t firstUnit=length|(norm.trailCC<<8);
45 int32_t preMappingLength=0;
46 if(norm.rawMapping!=NULL) {
47 UnicodeString &rm=*norm.rawMapping;
48 int32_t rmLength=rm.length();
49 if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
50 fprintf(stderr,
51 "gennorm2 error: "
52 "raw mapping for U+%04lX longer than maximum of %d\n",
53 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
54 exit(U_INVALID_FORMAT_ERROR);
55 }
56 UChar rm0=rm.charAt(0);
57 if( rmLength==length-1 &&
58 // 99: overlong substring lengths get pinned to remainder lengths anyway
59 0==rm.compare(1, 99, m, 2, 99) &&
60 rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
61 ) {
62 // Compression:
63 // rawMapping=rm0+mapping.substring(2) -> store only rm0
64 //
65 // The raw mapping is the same as the final mapping after replacing
66 // the final mapping's first two code units with the raw mapping's first one.
67 // In this case, we store only that first unit, rm0.
68 // This helps with a few hundred mappings.
69 dataString.append(rm0);
70 preMappingLength=1;
71 } else {
72 // Store the raw mapping with its length.
73 dataString.append(rm);
74 dataString.append((UChar)rmLength);
75 preMappingLength=rmLength+1;
76 }
77 firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
78 }
79 int32_t cccLccc=norm.cc|(norm.leadCC<<8);
80 if(cccLccc!=0) {
81 dataString.append((UChar)cccLccc);
82 ++preMappingLength;
83 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
84 }
85 dataString.append((UChar)firstUnit);
86 dataString.append(m);
87 return preMappingLength;
88 }
89
writeNoNoMapping(UChar32 c,const Norm & norm,UnicodeString & dataString,Hashtable & previousMappings)90 int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
91 UnicodeString &dataString,
92 Hashtable &previousMappings) {
93 UnicodeString newMapping;
94 int32_t offset=writeMapping(c, norm, newMapping);
95 int32_t previousOffset=previousMappings.geti(newMapping);
96 if(previousOffset!=0) {
97 // Duplicate, point to the identical mapping that has already been stored.
98 offset=previousOffset-1;
99 } else {
100 // Append this new mapping and
101 // enter it into the hashtable, avoiding value 0 which is "not found".
102 offset=dataString.length()+offset;
103 dataString.append(newMapping);
104 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
105 previousMappings.puti(newMapping, offset+1, errorCode);
106 }
107 return offset;
108 }
109
setNoNoDelta(UChar32 c,Norm & norm) const110 UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
111 // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point.
112 // Do not map from ASCII to non-ASCII.
113 if(norm.mappingCP>=0 &&
114 !(c<=0x7f && norm.mappingCP>0x7f) &&
115 norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) {
116 int32_t delta=norm.mappingCP-c;
117 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
118 norm.type=Norm::NO_NO_DELTA;
119 norm.offset=delta;
120 return TRUE;
121 }
122 }
123 return FALSE;
124 }
125
writeCompositions(UChar32 c,const Norm & norm,UnicodeString & dataString)126 void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
127 if(norm.cc!=0) {
128 fprintf(stderr,
129 "gennorm2 error: "
130 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
131 (long)c);
132 exit(U_INVALID_FORMAT_ERROR);
133 }
134 int32_t length;
135 const CompositionPair *pairs=norm.getCompositionPairs(length);
136 for(int32_t i=0; i<length; ++i) {
137 const CompositionPair &pair=pairs[i];
138 // 22 bits for the composite character and whether it combines forward.
139 UChar32 compositeAndFwd=pair.composite<<1;
140 if(norms.getNormRef(pair.composite).compositions!=NULL) {
141 compositeAndFwd|=1; // The composite character also combines-forward.
142 }
143 // Encode most pairs in two units and some in three.
144 int32_t firstUnit, secondUnit, thirdUnit;
145 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
146 if(compositeAndFwd<=0xffff) {
147 firstUnit=pair.trail<<1;
148 secondUnit=compositeAndFwd;
149 thirdUnit=-1;
150 } else {
151 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
152 secondUnit=compositeAndFwd>>16;
153 thirdUnit=compositeAndFwd;
154 }
155 } else {
156 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
157 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
158 Normalizer2Impl::COMP_1_TRIPLE;
159 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
160 (compositeAndFwd>>16);
161 thirdUnit=compositeAndFwd;
162 }
163 // Set the high bit of the first unit if this is the last composition pair.
164 if(i==(length-1)) {
165 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
166 }
167 dataString.append((UChar)firstUnit).append((UChar)secondUnit);
168 if(thirdUnit>=0) {
169 dataString.append((UChar)thirdUnit);
170 }
171 }
172 }
173
rangeHandler(UChar32 start,UChar32 end,Norm & norm)174 void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
175 if(start!=end) {
176 fprintf(stderr,
177 "gennorm2 error: unexpected shared data for "
178 "multiple code points U+%04lX..U+%04lX\n",
179 (long)start, (long)end);
180 exit(U_INTERNAL_PROGRAM_ERROR);
181 }
182 if(norm.error!=nullptr) {
183 fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
184 exit(U_INVALID_FORMAT_ERROR);
185 }
186 writeExtraData(start, norm);
187 }
188
189 // Ticket #13342 - Disable optimizations on MSVC for this function as a workaround.
190 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
191 #pragma optimize( "", off )
192 #endif
193
writeExtraData(UChar32 c,Norm & norm)194 void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
195 switch(norm.type) {
196 case Norm::INERT:
197 break; // no extra data
198 case Norm::YES_YES_COMBINES_FWD:
199 norm.offset=yesYesCompositions.length();
200 writeCompositions(c, norm, yesYesCompositions);
201 break;
202 case Norm::YES_NO_COMBINES_FWD:
203 norm.offset=yesNoMappingsAndCompositions.length()+
204 writeMapping(c, norm, yesNoMappingsAndCompositions);
205 writeCompositions(c, norm, yesNoMappingsAndCompositions);
206 break;
207 case Norm::YES_NO_MAPPING_ONLY:
208 norm.offset=yesNoMappingsOnly.length()+
209 writeMapping(c, norm, yesNoMappingsOnly);
210 break;
211 case Norm::NO_NO_COMP_YES:
212 if(!optimizeFast && setNoNoDelta(c, norm)) {
213 break;
214 }
215 norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
216 break;
217 case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
218 if(!optimizeFast && setNoNoDelta(c, norm)) {
219 break;
220 }
221 norm.offset=writeNoNoMapping(
222 c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
223 break;
224 case Norm::NO_NO_COMP_NO_MAYBE_CC:
225 norm.offset=writeNoNoMapping(
226 c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
227 break;
228 case Norm::NO_NO_EMPTY:
229 // There can be multiple extra data entries for mappings to the empty string
230 // if they have different raw mappings.
231 norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
232 break;
233 case Norm::MAYBE_YES_COMBINES_FWD:
234 norm.offset=maybeYesCompositions.length();
235 writeCompositions(c, norm, maybeYesCompositions);
236 break;
237 case Norm::MAYBE_YES_SIMPLE:
238 break; // no extra data
239 case Norm::YES_YES_WITH_CC:
240 break; // no extra data
241 default: // Should not occur.
242 exit(U_INTERNAL_PROGRAM_ERROR);
243 }
244 }
245
246 // Ticket #13342 - Turn optimization back on.
247 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
248 #pragma optimize( "", on )
249 #endif
250
251 U_NAMESPACE_END
252
253 #endif // #if !UCONFIG_NO_NORMALIZATION
254