• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2008-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  uspoof_wsconf.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009Jan05  (refactoring earlier files)
14 *   created by: Andy Heninger
15 *
16 *   Internal functions for compiling Whole Script confusable source data
17 *   into its binary (runtime) form.  The binary data format is described
18 *   in uspoof_impl.h
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23 
24 #if !UCONFIG_NO_NORMALIZATION
25 
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 
28 #include "unicode/unorm.h"
29 #include "unicode/uregex.h"
30 #include "unicode/ustring.h"
31 #include "cmemory.h"
32 #include "uspoof_impl.h"
33 #include "uhash.h"
34 #include "uvector.h"
35 #include "uassert.h"
36 #include "uspoof_wsconf.h"
37 
38 U_NAMESPACE_USE
39 
40 
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.   We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups:
//    Group 1      - set for a blank or comment-only line.
//    Groups 2, 3  - start and (optional) end of the code point range, in hex.
//    Groups 4, 5  - source and target script names.
//    Group 6 or 7 - table selector, A (any case) or L (lower case).
//    Group 8      - set for any line that matched none of the above (a syntax error).
//
// The range separator is matched as two literal dots ("\.\.").  An unescaped ".."
// would accept any two characters between the code points, letting malformed
// lines such as "0048--0049" be silently treated as ranges.
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression will match
                                                       //    this, and thus be flagged as an error
68 
69 
70 // Extract a regular expression match group into a char * string.
71 //    The group must contain only invariant characters.
72 //    Used for script names
73 //
extractGroup(URegularExpression * e,int32_t group,char * destBuf,int32_t destCapacity,UErrorCode & status)74 static void extractGroup(
75     URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
76 
77     UChar ubuf[50];
78     ubuf[0] = 0;
79     destBuf[0] = 0;
80     int32_t len = uregex_group(e, group, ubuf, 50, &status);
81     if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
82         return;
83     }
84     UnicodeString s(FALSE, ubuf, len);   // Aliasing constructor
85     s.extract(0, len, destBuf, destCapacity, US_INV);
86 }
87 
88 
89 
90 U_NAMESPACE_BEGIN
91 
92 //  Build the Whole Script Confusable data
93 //
94 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
95 //                         because everything is local to this one build function anyhow,
96 //                           OR
97 //                         break this function into more reasonably sized pieces, with
98 //                         state in WSConfusableDataBuilder.
99 //
buildWSConfusableData(SpoofImpl * spImpl,const char * confusablesWS,int32_t confusablesWSLen,UParseError * pe,UErrorCode & status)100 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
101           int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
102 {
103     if (U_FAILURE(status)) {
104         return;
105     }
106     URegularExpression *parseRegexp = NULL;
107     int32_t             inputLen    = 0;
108     UChar              *input       = NULL;
109     int32_t             lineNum     = 0;
110 
111     UVector            *scriptSets        = NULL;
112     uint32_t            rtScriptSetsCount = 2;
113 
114     UTrie2             *anyCaseTrie   = NULL;
115     UTrie2             *lowerCaseTrie = NULL;
116 
117     anyCaseTrie = utrie2_open(0, 0, &status);
118     lowerCaseTrie = utrie2_open(0, 0, &status);
119 
120     UnicodeString pattern(parseExp, -1, US_INV);
121 
122     // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
123     //
124     // Reserved TRIE values:
125     //   0:  Code point has no whole script confusables.
126     //   1:  Code point is of script Common or Inherited.
127     //       These code points do not participate in whole script confusable detection.
128     //       (This is logically equivalent to saying that they contain confusables in
129     //        all scripts)
130     //
131     // Because Trie values are indexes into the ScriptSets vector, pre-fill
132     // vector positions 0 and 1 to avoid conflicts with the reserved values.
133 
134     scriptSets = new UVector(status);
135     if (scriptSets == NULL) {
136         status = U_MEMORY_ALLOCATION_ERROR;
137         goto cleanup;
138     }
139     scriptSets->addElement((void *)NULL, status);
140     scriptSets->addElement((void *)NULL, status);
141 
142     // Convert the user input data from UTF-8 to UChar (UTF-16)
143     u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
144     if (status != U_BUFFER_OVERFLOW_ERROR) {
145         goto cleanup;
146     }
147     status = U_ZERO_ERROR;
148     input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
149     if (input == NULL) {
150         status = U_MEMORY_ALLOCATION_ERROR;
151         goto cleanup;
152     }
153     u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
154 
155     parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
156 
157     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
158     //   given the syntax of the input.
159     if (*input == 0xfeff) {
160         *input = 0x20;
161     }
162 
163     // Parse the input, one line per iteration of this loop.
164     uregex_setText(parseRegexp, input, inputLen, &status);
165     while (uregex_findNext(parseRegexp, &status)) {
166         lineNum++;
167         if (uregex_start(parseRegexp, 1, &status) >= 0) {
168             // this was a blank or comment line.
169             continue;
170         }
171         if (uregex_start(parseRegexp, 8, &status) >= 0) {
172             // input file syntax error.
173             status = U_PARSE_ERROR;
174             goto cleanup;
175         }
176         if (U_FAILURE(status)) {
177             goto cleanup;
178         }
179 
180         // Pick up the start and optional range end code points from the parsed line.
181         UChar32  startCodePoint = SpoofImpl::ScanHex(
182             input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
183         UChar32  endCodePoint = startCodePoint;
184         if (uregex_start(parseRegexp, 3, &status) >=0) {
185             endCodePoint = SpoofImpl::ScanHex(
186                 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
187         }
188 
189         // Extract the two script names from the source line.  We need these in an 8 bit
190         //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
191         //   to the ICU u_getPropertyValueEnum() function.  Ugh.
192         char  srcScriptName[20];
193         char  targScriptName[20];
194         extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
195         extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
196         UScriptCode srcScript  =
197             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
198         UScriptCode targScript =
199             static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
200         if (U_FAILURE(status)) {
201             goto cleanup;
202         }
203         if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
204             status = U_INVALID_FORMAT_ERROR;
205             goto cleanup;
206         }
207 
208         // select the table - (A) any case or (L) lower case only
209         UTrie2 *table = anyCaseTrie;
210         if (uregex_start(parseRegexp, 7, &status) >= 0) {
211             table = lowerCaseTrie;
212         }
213 
214         // Build the set of scripts containing confusable characters for
215         //   the code point(s) specified in this input line.
216         // Sanity check that the script of the source code point is the same
217         //   as the source script indicated in the input file.  Failure of this check is
218         //   an error in the input file.
219         // Include the source script in the set (needed for Mixed Script Confusable detection).
220         //
221         UChar32 cp;
222         for (cp=startCodePoint; cp<=endCodePoint; cp++) {
223             int32_t setIndex = utrie2_get32(table, cp);
224             BuilderScriptSet *bsset = NULL;
225             if (setIndex > 0) {
226                 U_ASSERT(setIndex < scriptSets->size());
227                 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
228             } else {
229                 bsset = new BuilderScriptSet();
230                 if (bsset == NULL) {
231                     status = U_MEMORY_ALLOCATION_ERROR;
232                     goto cleanup;
233                 }
234                 bsset->codePoint = cp;
235                 bsset->trie = table;
236                 bsset->sset = new ScriptSet();
237                 setIndex = scriptSets->size();
238                 bsset->index = setIndex;
239                 bsset->rindex = 0;
240                 if (bsset->sset == NULL) {
241                     status = U_MEMORY_ALLOCATION_ERROR;
242                     goto cleanup;
243                 }
244                 scriptSets->addElement(bsset, status);
245                 utrie2_set32(table, cp, setIndex, &status);
246             }
247             bsset->sset->Union(targScript);
248             bsset->sset->Union(srcScript);
249 
250             if (U_FAILURE(status)) {
251                 goto cleanup;
252             }
253             UScriptCode cpScript = uscript_getScript(cp, &status);
254             if (cpScript != srcScript) {
255                 status = U_INVALID_FORMAT_ERROR;
256                 goto cleanup;
257             }
258         }
259     }
260 
261     // Eliminate duplicate script sets.  At this point we have a separate
262     // script set for every code point that had data in the input file.
263     //
264     // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
265     //
266     // printf("Number of scriptSets: %d\n", scriptSets->size());
267     {
268         int32_t duplicateCount = 0;
269         rtScriptSetsCount = 2;
270         for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
271             BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
272             if (outerSet->index != static_cast<uint32_t>(outeri)) {
273                 // This set was already identified as a duplicate.
274                 //   It will not be allocated a position in the runtime array of ScriptSets.
275                 continue;
276             }
277             outerSet->rindex = rtScriptSetsCount++;
278             for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
279                 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
280                 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
281                     delete innerSet->sset;
282                     innerSet->scriptSetOwned = FALSE;
283                     innerSet->sset = outerSet->sset;
284                     innerSet->index = outeri;
285                     innerSet->rindex = outerSet->rindex;
286                     duplicateCount++;
287                 }
288                 // But this doesn't get all.  We need to fix the TRIE.
289             }
290         }
291         // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
292     }
293 
294 
295 
296     // Update the Trie values to be reflect the run time script indexes (after duplicate merging).
297     //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
298     //     are unused, which is why the loop index starts at 2.)
299     {
300         for (int32_t i=2; i<scriptSets->size(); i++) {
301             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
302             if (bSet->rindex != (uint32_t)i) {
303                 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
304             }
305         }
306     }
307 
308     // For code points with script==Common or script==Inherited,
309     //   Set the reserved value of 1 into both Tries.  These characters do not participate
310     //   in Whole Script Confusable detection; this reserved value is the means
311     //   by which they are detected.
312     {
313         UnicodeSet ignoreSet;
314         ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
315         UnicodeSet inheritedSet;
316         inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
317         ignoreSet.addAll(inheritedSet);
318         for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
319             UChar32 rangeStart = ignoreSet.getRangeStart(rn);
320             UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
321             utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
322             utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
323         }
324     }
325 
326     // Serialize the data to the Spoof Detector
327     {
328         utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
329         int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
330         // printf("Any case Trie size: %d\n", size);
331         if (status != U_BUFFER_OVERFLOW_ERROR) {
332             goto cleanup;
333         }
334         status = U_ZERO_ERROR;
335         spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
336         spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
337         spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
338         void *where = spImpl->fSpoofData->reserveSpace(size, status);
339         utrie2_serialize(anyCaseTrie, where, size, &status);
340 
341         utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
342         size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
343         // printf("Lower case Trie size: %d\n", size);
344         if (status != U_BUFFER_OVERFLOW_ERROR) {
345             goto cleanup;
346         }
347         status = U_ZERO_ERROR;
348         spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
349         spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
350         spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
351         where = spImpl->fSpoofData->reserveSpace(size, status);
352         utrie2_serialize(lowerCaseTrie, where, size, &status);
353 
354         spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
355         spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
356         ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
357             (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
358         uint32_t rindex = 2;
359         for (int32_t i=2; i<scriptSets->size(); i++) {
360             BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
361             if (bSet->rindex < rindex) {
362                 // We have already copied this script set to the serialized data.
363                 continue;
364             }
365             U_ASSERT(rindex == bSet->rindex);
366             rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
367             rindex++;
368         }
369     }
370 
371     // Open new utrie2s from the serialized data.  We don't want to keep the ones
372     //   we just built because we would then have two copies of the data, one internal to
373     //   the utries that we have already constructed, and one in the serialized data area.
374     //   An alternative would be to not pre-serialize the Trie data, but that makes the
375     //   spoof detector data different, depending on how the detector was constructed.
376     //   It's simpler to keep the data always the same.
377 
378     spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
379             UTRIE2_16_VALUE_BITS,
380             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
381             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
382             NULL,
383             &status);
384 
385     spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
386             UTRIE2_16_VALUE_BITS,
387             (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
388             spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
389             NULL,
390             &status);
391 
392 
393 
394 cleanup:
395     if (U_FAILURE(status)) {
396         pe->line = lineNum;
397     }
398     uregex_close(parseRegexp);
399     uprv_free(input);
400 
401     int32_t i;
402     if (scriptSets != NULL) {
403         for (i=0; i<scriptSets->size(); i++) {
404             BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
405             delete bsset;
406         }
407         delete scriptSets;
408     }
409     utrie2_close(anyCaseTrie);
410     utrie2_close(lowerCaseTrie);
411     return;
412 }
413 
414 U_NAMESPACE_END
415 
416 
417 
BuilderScriptSet()418 BuilderScriptSet::BuilderScriptSet() {
419     codePoint = -1;
420     trie = NULL;
421     sset = NULL;
422     index = 0;
423     rindex = 0;
424     scriptSetOwned = TRUE;
425 }
426 
~BuilderScriptSet()427 BuilderScriptSet::~BuilderScriptSet() {
428     if (scriptSetOwned) {
429         delete sset;
430     }
431 }
432 
433 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
434 #endif //  !UCONFIG_NO_NORMALIZATION
435 
436