1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // extradata.cpp
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_NORMALIZATION
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include "unicode/errorcode.h"
15 #include "unicode/unistr.h"
16 #include "unicode/utf16.h"
17 #include "extradata.h"
18 #include "normalizer2impl.h"
19 #include "norms.h"
20 #include "toolutil.h"
21 #include "utrie2.h"
22 #include "uvectr32.h"
23
24 U_NAMESPACE_BEGIN
25
ExtraData(Norms & n,UBool fast)26 ExtraData::ExtraData(Norms &n, UBool fast) :
27 Norms::Enumerator(n),
28 yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
29 yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data
30 yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data
31 optimizeFast(fast) {
32 // Hangul LV algorithmically decomposes to two Jamo.
33 // Some code may harmlessly read this firstUnit.
34 yesNoMappingsAndCompositions.setCharAt(0, 2);
35 // Hangul LVT algorithmically decomposes to three Jamo.
36 // Some code may harmlessly read this firstUnit.
37 yesNoMappingsOnly.setCharAt(0, 3);
38 }
39
writeMapping(UChar32 c,const Norm & norm,UnicodeString & dataString)40 int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
41 UnicodeString &m=*norm.mapping;
42 int32_t length=m.length();
43 // Write the mapping & raw mapping extraData.
44 int32_t firstUnit=length|(norm.trailCC<<8);
45 int32_t preMappingLength=0;
46 if(norm.rawMapping!=NULL) {
47 UnicodeString &rm=*norm.rawMapping;
48 int32_t rmLength=rm.length();
49 if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
50 fprintf(stderr,
51 "gennorm2 error: "
52 "raw mapping for U+%04lX longer than maximum of %d\n",
53 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
54 exit(U_INVALID_FORMAT_ERROR);
55 }
56 UChar rm0=rm.charAt(0);
57 if( rmLength==length-1 &&
58 // 99: overlong substring lengths get pinned to remainder lengths anyway
59 0==rm.compare(1, 99, m, 2, 99) &&
60 rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
61 ) {
62 // Compression:
63 // rawMapping=rm0+mapping.substring(2) -> store only rm0
64 //
65 // The raw mapping is the same as the final mapping after replacing
66 // the final mapping's first two code units with the raw mapping's first one.
67 // In this case, we store only that first unit, rm0.
68 // This helps with a few hundred mappings.
69 dataString.append(rm0);
70 preMappingLength=1;
71 } else {
72 // Store the raw mapping with its length.
73 dataString.append(rm);
74 dataString.append((UChar)rmLength);
75 preMappingLength=rmLength+1;
76 }
77 firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
78 }
79 int32_t cccLccc=norm.cc|(norm.leadCC<<8);
80 if(cccLccc!=0) {
81 dataString.append((UChar)cccLccc);
82 ++preMappingLength;
83 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
84 }
85 dataString.append((UChar)firstUnit);
86 dataString.append(m);
87 return preMappingLength;
88 }
89
writeNoNoMapping(UChar32 c,const Norm & norm,UnicodeString & dataString,Hashtable & previousMappings)90 int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
91 UnicodeString &dataString,
92 Hashtable &previousMappings) {
93 UnicodeString newMapping;
94 int32_t offset=writeMapping(c, norm, newMapping);
95 UBool found=false;
96 int32_t previousOffset=previousMappings.getiAndFound(newMapping, found);
97 if(found) {
98 // Duplicate, point to the identical mapping that has already been stored.
99 offset=previousOffset;
100 } else {
101 // Append this new mapping and
102 // enter it into the hashtable, avoiding value 0 which is "not found".
103 offset=dataString.length()+offset;
104 dataString.append(newMapping);
105 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.putiAllowZero()");
106 previousMappings.putiAllowZero(newMapping, offset, errorCode);
107 }
108 return offset;
109 }
110
setNoNoDelta(UChar32 c,Norm & norm) const111 UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
112 // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point.
113 // Do not map from ASCII to non-ASCII.
114 if(norm.mappingCP>=0 &&
115 !(c<=0x7f && norm.mappingCP>0x7f) &&
116 norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) {
117 int32_t delta=norm.mappingCP-c;
118 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
119 norm.type=Norm::NO_NO_DELTA;
120 norm.offset=delta;
121 return true;
122 }
123 }
124 return false;
125 }
126
writeCompositions(UChar32 c,const Norm & norm,UnicodeString & dataString)127 void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
128 if(norm.cc!=0) {
129 fprintf(stderr,
130 "gennorm2 error: "
131 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
132 (long)c);
133 exit(U_INVALID_FORMAT_ERROR);
134 }
135 int32_t length;
136 const CompositionPair *pairs=norm.getCompositionPairs(length);
137 for(int32_t i=0; i<length; ++i) {
138 const CompositionPair &pair=pairs[i];
139 // 22 bits for the composite character and whether it combines forward.
140 UChar32 compositeAndFwd=pair.composite<<1;
141 if(norms.getNormRef(pair.composite).compositions!=NULL) {
142 compositeAndFwd|=1; // The composite character also combines-forward.
143 }
144 // Encode most pairs in two units and some in three.
145 int32_t firstUnit, secondUnit, thirdUnit;
146 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
147 if(compositeAndFwd<=0xffff) {
148 firstUnit=pair.trail<<1;
149 secondUnit=compositeAndFwd;
150 thirdUnit=-1;
151 } else {
152 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
153 secondUnit=compositeAndFwd>>16;
154 thirdUnit=compositeAndFwd;
155 }
156 } else {
157 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
158 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
159 Normalizer2Impl::COMP_1_TRIPLE;
160 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
161 (compositeAndFwd>>16);
162 thirdUnit=compositeAndFwd;
163 }
164 // Set the high bit of the first unit if this is the last composition pair.
165 if(i==(length-1)) {
166 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
167 }
168 dataString.append((UChar)firstUnit).append((UChar)secondUnit);
169 if(thirdUnit>=0) {
170 dataString.append((UChar)thirdUnit);
171 }
172 }
173 }
174
rangeHandler(UChar32 start,UChar32 end,Norm & norm)175 void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
176 if(start!=end) {
177 fprintf(stderr,
178 "gennorm2 error: unexpected shared data for "
179 "multiple code points U+%04lX..U+%04lX\n",
180 (long)start, (long)end);
181 exit(U_INTERNAL_PROGRAM_ERROR);
182 }
183 if(norm.error!=nullptr) {
184 fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
185 exit(U_INVALID_FORMAT_ERROR);
186 }
187 writeExtraData(start, norm);
188 }
189
190 // Ticket #13342 - Disable optimizations on MSVC for this function as a workaround.
191 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
192 #pragma optimize( "", off )
193 #endif
194
writeExtraData(UChar32 c,Norm & norm)195 void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
196 switch(norm.type) {
197 case Norm::INERT:
198 break; // no extra data
199 case Norm::YES_YES_COMBINES_FWD:
200 norm.offset=yesYesCompositions.length();
201 writeCompositions(c, norm, yesYesCompositions);
202 break;
203 case Norm::YES_NO_COMBINES_FWD:
204 norm.offset=yesNoMappingsAndCompositions.length()+
205 writeMapping(c, norm, yesNoMappingsAndCompositions);
206 writeCompositions(c, norm, yesNoMappingsAndCompositions);
207 break;
208 case Norm::YES_NO_MAPPING_ONLY:
209 norm.offset=yesNoMappingsOnly.length()+
210 writeMapping(c, norm, yesNoMappingsOnly);
211 break;
212 case Norm::NO_NO_COMP_YES:
213 if(!optimizeFast && setNoNoDelta(c, norm)) {
214 break;
215 }
216 norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
217 break;
218 case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
219 if(!optimizeFast && setNoNoDelta(c, norm)) {
220 break;
221 }
222 norm.offset=writeNoNoMapping(
223 c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
224 break;
225 case Norm::NO_NO_COMP_NO_MAYBE_CC:
226 norm.offset=writeNoNoMapping(
227 c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
228 break;
229 case Norm::NO_NO_EMPTY:
230 // There can be multiple extra data entries for mappings to the empty string
231 // if they have different raw mappings.
232 norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
233 break;
234 case Norm::MAYBE_YES_COMBINES_FWD:
235 norm.offset=maybeYesCompositions.length();
236 writeCompositions(c, norm, maybeYesCompositions);
237 break;
238 case Norm::MAYBE_YES_SIMPLE:
239 break; // no extra data
240 case Norm::YES_YES_WITH_CC:
241 break; // no extra data
242 default: // Should not occur.
243 exit(U_INTERNAL_PROGRAM_ERROR);
244 }
245 }
246
247 // Ticket #13342 - Turn optimization back on.
248 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
249 #pragma optimize( "", on )
250 #endif
251
252 U_NAMESPACE_END
253
254 #endif // #if !UCONFIG_NO_NORMALIZATION
255