1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2008-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: uspoof_wsconf.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009Jan05 (refactoring earlier files)
14 * created by: Andy Heninger
15 *
* Internal functions for compiling Whole Script confusable source data
17 * into its binary (runtime) form. The binary data format is described
18 * in uspoof_impl.h
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23
24 #if !UCONFIG_NO_NORMALIZATION
25
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28 #include "unicode/unorm.h"
29 #include "unicode/uregex.h"
30 #include "unicode/ustring.h"
31 #include "cmemory.h"
32 #include "scriptset.h"
33 #include "uspoof_impl.h"
34 #include "uhash.h"
35 #include "uvector.h"
36 #include "uassert.h"
37 #include "uspoof_wsconf.h"
38
39 U_NAMESPACE_USE
40
41
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F           ; Latn; Deva; A # (o)  LATIN SMALL LETTER O
//   0048..0049     ; Latn; Grek; A # [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |     |
//    |               |     |     |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |---------- Target script.  We need this.
//    |               |---------------- Src script.  Should match the script of the source
//    |                                 code points.  Beyond checking that, we don't keep it.
//    |-------------------------------- Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups:
//    Group 1       - set for a blank or comment-only line.
//    Groups 2 - 7  - set for a well formed data line.
//    Group 8       - set for any other line, which the caller treats as a syntax error.
//
// NOTE(review): the ".." between the two code points of a range is not escaped,
//               so it matches any two characters, not only two literal dots.
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //  Any line not matching the preceding
                                                       //  parts of the expression will match
                                                       //  this, and thus be flagged as an error
69
70
71 // Extract a regular expression match group into a char * string.
72 // The group must contain only invariant characters.
73 // Used for script names
74 //
extractGroup(URegularExpression * e,int32_t group,char * destBuf,int32_t destCapacity,UErrorCode & status)75 static void extractGroup(
76 URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
77
78 UChar ubuf[50];
79 ubuf[0] = 0;
80 destBuf[0] = 0;
81 int32_t len = uregex_group(e, group, ubuf, 50, &status);
82 if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
83 return;
84 }
85 UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
86 s.extract(0, len, destBuf, destCapacity, US_INV);
87 }
88
89
90
91 U_NAMESPACE_BEGIN
92
93 // Build the Whole Script Confusable data
94 //
95 // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
96 // because everything is local to this one build function anyhow,
97 // OR
98 // break this function into more reasonably sized pieces, with
99 // state in WSConfusableDataBuilder.
100 //
buildWSConfusableData(SpoofImpl * spImpl,const char * confusablesWS,int32_t confusablesWSLen,UParseError * pe,UErrorCode & status)101 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
102 int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
103 {
104 if (U_FAILURE(status)) {
105 return;
106 }
107 URegularExpression *parseRegexp = NULL;
108 int32_t inputLen = 0;
109 UChar *input = NULL;
110 int32_t lineNum = 0;
111
112 UVector *scriptSets = NULL;
113 uint32_t rtScriptSetsCount = 2;
114
115 UTrie2 *anyCaseTrie = NULL;
116 UTrie2 *lowerCaseTrie = NULL;
117
118 anyCaseTrie = utrie2_open(0, 0, &status);
119 lowerCaseTrie = utrie2_open(0, 0, &status);
120
121 UnicodeString pattern(parseExp, -1, US_INV);
122
123 // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
124 //
125 // Reserved TRIE values:
126 // 0: Code point has no whole script confusables.
127 // 1: Code point is of script Common or Inherited.
128 // These code points do not participate in whole script confusable detection.
129 // (This is logically equivalent to saying that they contain confusables in
130 // all scripts)
131 //
132 // Because Trie values are indexes into the ScriptSets vector, pre-fill
133 // vector positions 0 and 1 to avoid conflicts with the reserved values.
134
135 scriptSets = new UVector(status);
136 if (scriptSets == NULL) {
137 status = U_MEMORY_ALLOCATION_ERROR;
138 goto cleanup;
139 }
140 scriptSets->addElement((void *)NULL, status);
141 scriptSets->addElement((void *)NULL, status);
142
143 // Convert the user input data from UTF-8 to UChar (UTF-16)
144 u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
145 if (status != U_BUFFER_OVERFLOW_ERROR) {
146 goto cleanup;
147 }
148 status = U_ZERO_ERROR;
149 input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
150 if (input == NULL) {
151 status = U_MEMORY_ALLOCATION_ERROR;
152 goto cleanup;
153 }
154 u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
155
156 parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
157
158 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
159 // given the syntax of the input.
160 if (*input == 0xfeff) {
161 *input = 0x20;
162 }
163
164 // Parse the input, one line per iteration of this loop.
165 uregex_setText(parseRegexp, input, inputLen, &status);
166 while (uregex_findNext(parseRegexp, &status)) {
167 lineNum++;
168 if (uregex_start(parseRegexp, 1, &status) >= 0) {
169 // this was a blank or comment line.
170 continue;
171 }
172 if (uregex_start(parseRegexp, 8, &status) >= 0) {
173 // input file syntax error.
174 status = U_PARSE_ERROR;
175 goto cleanup;
176 }
177 if (U_FAILURE(status)) {
178 goto cleanup;
179 }
180
181 // Pick up the start and optional range end code points from the parsed line.
182 UChar32 startCodePoint = SpoofImpl::ScanHex(
183 input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
184 UChar32 endCodePoint = startCodePoint;
185 if (uregex_start(parseRegexp, 3, &status) >=0) {
186 endCodePoint = SpoofImpl::ScanHex(
187 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
188 }
189
190 // Extract the two script names from the source line. We need these in an 8 bit
191 // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
192 // to the ICU u_getPropertyValueEnum() function. Ugh.
193 char srcScriptName[20];
194 char targScriptName[20];
195 extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
196 extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
197 UScriptCode srcScript =
198 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
199 UScriptCode targScript =
200 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
201 if (U_FAILURE(status)) {
202 goto cleanup;
203 }
204 if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
205 status = U_INVALID_FORMAT_ERROR;
206 goto cleanup;
207 }
208
209 // select the table - (A) any case or (L) lower case only
210 UTrie2 *table = anyCaseTrie;
211 if (uregex_start(parseRegexp, 7, &status) >= 0) {
212 table = lowerCaseTrie;
213 }
214
215 // Build the set of scripts containing confusable characters for
216 // the code point(s) specified in this input line.
217 // Sanity check that the script of the source code point is the same
218 // as the source script indicated in the input file. Failure of this check is
219 // an error in the input file.
220 // Include the source script in the set (needed for Mixed Script Confusable detection).
221 //
222 UChar32 cp;
223 for (cp=startCodePoint; cp<=endCodePoint; cp++) {
224 int32_t setIndex = utrie2_get32(table, cp);
225 BuilderScriptSet *bsset = NULL;
226 if (setIndex > 0) {
227 U_ASSERT(setIndex < scriptSets->size());
228 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
229 } else {
230 bsset = new BuilderScriptSet();
231 if (bsset == NULL) {
232 status = U_MEMORY_ALLOCATION_ERROR;
233 goto cleanup;
234 }
235 bsset->codePoint = cp;
236 bsset->trie = table;
237 bsset->sset = new ScriptSet();
238 setIndex = scriptSets->size();
239 bsset->index = setIndex;
240 bsset->rindex = 0;
241 if (bsset->sset == NULL) {
242 status = U_MEMORY_ALLOCATION_ERROR;
243 goto cleanup;
244 }
245 scriptSets->addElement(bsset, status);
246 utrie2_set32(table, cp, setIndex, &status);
247 }
248 bsset->sset->set(targScript, status);
249 bsset->sset->set(srcScript, status);
250
251 if (U_FAILURE(status)) {
252 goto cleanup;
253 }
254 UScriptCode cpScript = uscript_getScript(cp, &status);
255 if (cpScript != srcScript) {
256 status = U_INVALID_FORMAT_ERROR;
257 goto cleanup;
258 }
259 }
260 }
261
262 // Eliminate duplicate script sets. At this point we have a separate
263 // script set for every code point that had data in the input file.
264 //
265 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
266 //
267 // printf("Number of scriptSets: %d\n", scriptSets->size());
268 {
269 int32_t duplicateCount = 0;
270 rtScriptSetsCount = 2;
271 for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
272 BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
273 if (outerSet->index != static_cast<uint32_t>(outeri)) {
274 // This set was already identified as a duplicate.
275 // It will not be allocated a position in the runtime array of ScriptSets.
276 continue;
277 }
278 outerSet->rindex = rtScriptSetsCount++;
279 for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
280 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
281 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
282 delete innerSet->sset;
283 innerSet->scriptSetOwned = FALSE;
284 innerSet->sset = outerSet->sset;
285 innerSet->index = outeri;
286 innerSet->rindex = outerSet->rindex;
287 duplicateCount++;
288 }
289 // But this doesn't get all. We need to fix the TRIE.
290 }
291 }
292 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
293 }
294
295
296
297 // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
298 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
299 // are unused, which is why the loop index starts at 2.)
300 {
301 for (int32_t i=2; i<scriptSets->size(); i++) {
302 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
303 if (bSet->rindex != (uint32_t)i) {
304 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
305 }
306 }
307 }
308
309 // For code points with script==Common or script==Inherited,
310 // Set the reserved value of 1 into both Tries. These characters do not participate
311 // in Whole Script Confusable detection; this reserved value is the means
312 // by which they are detected.
313 {
314 UnicodeSet ignoreSet;
315 ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
316 UnicodeSet inheritedSet;
317 inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
318 ignoreSet.addAll(inheritedSet);
319 for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
320 UChar32 rangeStart = ignoreSet.getRangeStart(rn);
321 UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
322 utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
323 utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
324 }
325 }
326
327 // Serialize the data to the Spoof Detector
328 {
329 utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
330 int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
331 // printf("Any case Trie size: %d\n", size);
332 if (status != U_BUFFER_OVERFLOW_ERROR) {
333 goto cleanup;
334 }
335 status = U_ZERO_ERROR;
336 spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
337 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
338 spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
339 void *where = spImpl->fSpoofData->reserveSpace(size, status);
340 utrie2_serialize(anyCaseTrie, where, size, &status);
341
342 utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
343 size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
344 // printf("Lower case Trie size: %d\n", size);
345 if (status != U_BUFFER_OVERFLOW_ERROR) {
346 goto cleanup;
347 }
348 status = U_ZERO_ERROR;
349 spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
350 spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
351 spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
352 where = spImpl->fSpoofData->reserveSpace(size, status);
353 utrie2_serialize(lowerCaseTrie, where, size, &status);
354
355 spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
356 spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
357 ScriptSet *rtScriptSets = static_cast<ScriptSet *>
358 (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
359 uint32_t rindex = 2;
360 for (int32_t i=2; i<scriptSets->size(); i++) {
361 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
362 if (bSet->rindex < rindex) {
363 // We have already copied this script set to the serialized data.
364 continue;
365 }
366 U_ASSERT(rindex == bSet->rindex);
367 rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
368 rindex++;
369 }
370 }
371
372 // Open new utrie2s from the serialized data. We don't want to keep the ones
373 // we just built because we would then have two copies of the data, one internal to
374 // the utries that we have already constructed, and one in the serialized data area.
375 // An alternative would be to not pre-serialize the Trie data, but that makes the
376 // spoof detector data different, depending on how the detector was constructed.
377 // It's simpler to keep the data always the same.
378
379 spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
380 UTRIE2_16_VALUE_BITS,
381 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
382 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
383 NULL,
384 &status);
385
386 spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
387 UTRIE2_16_VALUE_BITS,
388 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
389 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
390 NULL,
391 &status);
392
393
394
395 cleanup:
396 if (U_FAILURE(status)) {
397 pe->line = lineNum;
398 }
399 uregex_close(parseRegexp);
400 uprv_free(input);
401
402 int32_t i;
403 if (scriptSets != NULL) {
404 for (i=0; i<scriptSets->size(); i++) {
405 BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
406 delete bsset;
407 }
408 delete scriptSets;
409 }
410 utrie2_close(anyCaseTrie);
411 utrie2_close(lowerCaseTrie);
412 return;
413 }
414
415 U_NAMESPACE_END
416
417
418
BuilderScriptSet()419 BuilderScriptSet::BuilderScriptSet() {
420 codePoint = -1;
421 trie = NULL;
422 sset = NULL;
423 index = 0;
424 rindex = 0;
425 scriptSetOwned = TRUE;
426 }
427
~BuilderScriptSet()428 BuilderScriptSet::~BuilderScriptSet() {
429 if (scriptSetOwned) {
430 delete sset;
431 }
432 }
433
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif // !UCONFIG_NO_NORMALIZATION
436
437