• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2008-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  uspoof_conf.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009Jan05  (refactoring earlier files)
16 *   created by: Andy Heninger
17 *
18 *   Internal classes for compiling confusable data into its binary (runtime) form.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #if !UCONFIG_NO_NORMALIZATION
25 
26 #include "unicode/unorm.h"
27 #include "unicode/uregex.h"
28 #include "unicode/ustring.h"
29 #include "cmemory.h"
30 #include "uspoof_impl.h"
31 #include "uhash.h"
32 #include "uvector.h"
33 #include "uassert.h"
34 #include "uarrsort.h"
35 #include "uspoof_conf.h"
36 
37 U_NAMESPACE_USE
38 
39 
40 //---------------------------------------------------------------------
41 //
42 //  buildConfusableData   Compile the source confusable data, as defined by
43 //                        the Unicode data file confusables.txt, into the binary
44 //                        structures used by the confusable detector.
45 //
46 //                        The binary structures are described in uspoof_impl.h
47 //
48 //     1.  Parse the data, making a hash table mapping from a UChar32 to a String.
49 //
50 //     2.  Sort all of the strings encountered by length, since they will need to
51 //         be stored in that order in the final string table.
52 //         TODO: Sorting these strings by length is no longer needed since the removal of
53 //         the string lengths table.  This logic can be removed to save processing time
54 //         when building confusables data.
55 //
56 //     3.  Build a list of keys (UChar32s) from the four mapping tables.  Sort the
57 //         list because that will be the ordering of our runtime table.
58 //
59 //     4.  Generate the run time string table.  This is generated before the key & value
60 //         tables because we need the string indexes when building those tables.
61 //
62 //     5.  Build the run-time key and value tables.  These are parallel tables, and are built
63 //         at the same time
64 //
65 
SPUString(LocalPointer<UnicodeString> s)66 SPUString::SPUString(LocalPointer<UnicodeString> s) {
67     fStr = std::move(s);
68     fCharOrStrTableIndex = 0;
69 }
70 
71 
~SPUString()72 SPUString::~SPUString() {
73 }
74 
75 
SPUStringPool(UErrorCode & status)76 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(nullptr), fHash(nullptr) {
77     LocalPointer<UVector> vec(new UVector(status), status);
78     if (U_FAILURE(status)) {
79         return;
80     }
81     vec->setDeleter(
82         [](void *obj) {delete (SPUString *)obj;});
83     fVec = vec.orphan();
84     fHash = uhash_open(uhash_hashUnicodeString,           // key hash function
85                        uhash_compareUnicodeString,        // Key Comparator
86                        nullptr,                              // Value Comparator
87                        &status);
88 }
89 
90 
~SPUStringPool()91 SPUStringPool::~SPUStringPool() {
92     delete fVec;
93     uhash_close(fHash);
94 }
95 
96 
size()97 int32_t SPUStringPool::size() {
98     return fVec->size();
99 }
100 
getByIndex(int32_t index)101 SPUString *SPUStringPool::getByIndex(int32_t index) {
102     SPUString *retString = (SPUString *)fVec->elementAt(index);
103     return retString;
104 }
105 
106 
107 // Comparison function for ordering strings in the string pool.
108 // Compare by length first, then, within a group of the same length,
109 // by code point order.
110 // Conforms to the type signature for a USortComparator in uvector.h
111 
SPUStringCompare(UHashTok left,UHashTok right)112 static int32_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
113 	const SPUString *sL = const_cast<const SPUString *>(
114         static_cast<SPUString *>(left.pointer));
115  	const SPUString *sR = const_cast<const SPUString *>(
116  	    static_cast<SPUString *>(right.pointer));
117     int32_t lenL = sL->fStr->length();
118     int32_t lenR = sR->fStr->length();
119     if (lenL < lenR) {
120         return -1;
121     } else if (lenL > lenR) {
122         return 1;
123     } else {
124         return sL->fStr->compare(*(sR->fStr));
125     }
126 }
127 
sort(UErrorCode & status)128 void SPUStringPool::sort(UErrorCode &status) {
129     fVec->sort(SPUStringCompare, status);
130 }
131 
132 
addString(UnicodeString * src,UErrorCode & status)133 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
134     LocalPointer<UnicodeString> lpSrc(src);
135     if (U_FAILURE(status)) {
136         return nullptr;
137     }
138     SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
139     if (hashedString != nullptr) {
140         return hashedString;
141     }
142     LocalPointer<SPUString> spuStr(new SPUString(std::move(lpSrc)), status);
143     hashedString = spuStr.getAlias();
144     fVec->adoptElement(spuStr.orphan(), status);
145     if (U_FAILURE(status)) {
146         return nullptr;
147     }
148     uhash_put(fHash, src, hashedString, &status);
149     return hashedString;
150 }
151 
152 
153 
ConfusabledataBuilder(SpoofImpl * spImpl,UErrorCode & status)154 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
155     fSpoofImpl(spImpl),
156     fInput(nullptr),
157     fTable(nullptr),
158     fKeySet(nullptr),
159     fKeyVec(nullptr),
160     fValueVec(nullptr),
161     fStringTable(nullptr),
162     stringPool(nullptr),
163     fParseLine(nullptr),
164     fParseHexNum(nullptr),
165     fLineNum(0)
166 {
167     if (U_FAILURE(status)) {
168         return;
169     }
170 
171     fTable = uhash_open(uhash_hashLong, uhash_compareLong, nullptr, &status);
172 
173     fKeySet = new UnicodeSet();
174     if (fKeySet == nullptr) {
175         status = U_MEMORY_ALLOCATION_ERROR;
176         return;
177     }
178 
179     fKeyVec = new UVector(status);
180     if (fKeyVec == nullptr) {
181         status = U_MEMORY_ALLOCATION_ERROR;
182         return;
183     }
184 
185     fValueVec = new UVector(status);
186     if (fValueVec == nullptr) {
187         status = U_MEMORY_ALLOCATION_ERROR;
188         return;
189     }
190 
191     stringPool = new SPUStringPool(status);
192     if (stringPool == nullptr) {
193         status = U_MEMORY_ALLOCATION_ERROR;
194         return;
195     }
196 }
197 
198 
~ConfusabledataBuilder()199 ConfusabledataBuilder::~ConfusabledataBuilder() {
200     uprv_free(fInput);
201     uregex_close(fParseLine);
202     uregex_close(fParseHexNum);
203     uhash_close(fTable);
204     delete fKeySet;
205     delete fKeyVec;
206     delete fStringTable;
207     delete fValueVec;
208     delete stringPool;
209 }
210 
211 
buildConfusableData(SpoofImpl * spImpl,const char * confusables,int32_t confusablesLen,int32_t * errorType,UParseError * pe,UErrorCode & status)212 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
213     int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
214 
215     if (U_FAILURE(status)) {
216         return;
217     }
218     ConfusabledataBuilder builder(spImpl, status);
219     builder.build(confusables, confusablesLen, status);
220     if (U_FAILURE(status) && errorType != nullptr) {
221         *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
222         pe->line = builder.fLineNum;
223     }
224 }
225 
226 
build(const char * confusables,int32_t confusablesLen,UErrorCode & status)227 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
228                UErrorCode &status) {
229 
230     // Convert the user input data from UTF-8 to char16_t (UTF-16)
231     int32_t inputLen = 0;
232     if (U_FAILURE(status)) {
233         return;
234     }
235     u_strFromUTF8(nullptr, 0, &inputLen, confusables, confusablesLen, &status);
236     if (status != U_BUFFER_OVERFLOW_ERROR) {
237         return;
238     }
239     status = U_ZERO_ERROR;
240     fInput = static_cast<char16_t *>(uprv_malloc((inputLen+1) * sizeof(char16_t)));
241     if (fInput == nullptr) {
242         status = U_MEMORY_ALLOCATION_ERROR;
243         return;
244     }
245     u_strFromUTF8(fInput, inputLen+1, nullptr, confusables, confusablesLen, &status);
246 
247 
248     // Regular Expression to parse a line from Confusables.txt.  The expression will match
249     // any line.  What was matched is determined by examining which capture groups have a match.
250     //   Capture Group 1:  the source char
251     //   Capture Group 2:  the replacement chars
252     //   Capture Group 3-6  the table type, SL, SA, ML, or MA (deprecated)
253     //   Capture Group 7:  A blank or comment only line.
254     //   Capture Group 8:  A syntactically invalid line.  Anything that didn't match before.
255     // Example Line from the confusables.txt source file:
256     //   "1D702 ;	006E 0329 ;	SL	# MATHEMATICAL ITALIC SMALL ETA ... "
257     UnicodeString pattern(
258         "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;"      // Match the source char
259         "[ \\t]*([0-9A-Fa-f]+"                    // Match the replacement char(s)
260            "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;"    //     (continued)
261         "\\s*(?:(SL)|(SA)|(ML)|(MA))"             // Match the table type
262         "[ \\t]*(?:#.*?)?$"                       // Match any trailing #comment
263         "|^([ \\t]*(?:#.*?)?)$"       // OR match empty lines or lines with only a #comment
264         "|^(.*?)$", -1, US_INV);      // OR match any line, which catches illegal lines.
265     // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
266     fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status);
267 
268     // Regular expression for parsing a hex number out of a space-separated list of them.
269     //   Capture group 1 gets the number, with spaces removed.
270     pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
271     fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status);
272 
273     // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
274     //   given the syntax of the input.
275     if (*fInput == 0xfeff) {
276         *fInput = 0x20;
277     }
278 
279     // Parse the input, one line per iteration of this loop.
280     uregex_setText(fParseLine, fInput, inputLen, &status);
281     while (uregex_findNext(fParseLine, &status)) {
282         fLineNum++;
283         if (uregex_start(fParseLine, 7, &status) >= 0) {
284             // this was a blank or comment line.
285             continue;
286         }
287         if (uregex_start(fParseLine, 8, &status) >= 0) {
288             // input file syntax error.
289             status = U_PARSE_ERROR;
290             return;
291         }
292 
293         // We have a good input line.  Extract the key character and mapping string, and
294         //    put them into the appropriate mapping table.
295         UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
296                           uregex_end(fParseLine, 1, &status), status);
297 
298         int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
299         int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
300         uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
301 
302         UnicodeString  *mapString = new UnicodeString();
303         if (mapString == nullptr) {
304             status = U_MEMORY_ALLOCATION_ERROR;
305             return;
306         }
307         while (uregex_findNext(fParseHexNum, &status)) {
308             UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
309                                  uregex_end(fParseHexNum, 1, &status), status);
310             mapString->append(c);
311         }
312         U_ASSERT(mapString->length() >= 1);
313 
314         // Put the map (value) string into the string pool
315         // This a little like a Java intern() - any duplicates will be eliminated.
316         SPUString *smapString = stringPool->addString(mapString, status);
317 
318         // Add the UChar32 -> string mapping to the table.
319         // For Unicode 8, the SL, SA and ML tables have been discontinued.
320         //                All input data from confusables.txt is tagged MA.
321         uhash_iput(fTable, keyChar, smapString, &status);
322         if (U_FAILURE(status)) { return; }
323         fKeySet->add(keyChar);
324     }
325 
326     // Input data is now all parsed and collected.
327     // Now create the run-time binary form of the data.
328     //
329     // This is done in two steps.  First the data is assembled into vectors and strings,
330     //   for ease of construction, then the contents of these collections are dumped
331     //   into the actual raw-bytes data storage.
332 
333     // Build up the string array, and record the index of each string therein
334     //  in the (build time only) string pool.
335     // Strings of length one are not entered into the strings array.
336     // (Strings in the table are sorted by length)
337     stringPool->sort(status);
338     fStringTable = new UnicodeString();
339     int32_t poolSize = stringPool->size();
340     int32_t i;
341     for (i=0; i<poolSize; i++) {
342         SPUString *s = stringPool->getByIndex(i);
343         int32_t strLen = s->fStr->length();
344         int32_t strIndex = fStringTable->length();
345         if (strLen == 1) {
346             // strings of length one do not get an entry in the string table.
347             // Keep the single string character itself here, which is the same
348             //  convention that is used in the final run-time string table index.
349             s->fCharOrStrTableIndex = s->fStr->charAt(0);
350         } else {
351             s->fCharOrStrTableIndex = strIndex;
352             fStringTable->append(*(s->fStr));
353         }
354     }
355 
356     // Construct the compile-time Key and Value tables
357     //
358     // For each key code point, check which mapping tables it applies to,
359     //   and create the final data for the key & value structures.
360     //
361     //   The four logical mapping tables are conflated into one combined table.
362     //   If multiple logical tables have the same mapping for some key, they
363     //     share a single entry in the combined table.
364     //   If more than one mapping exists for the same key code point, multiple
365     //     entries will be created in the table
366 
367     for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
368         // It is an oddity of the UnicodeSet API that simply enumerating the contained
369         //   code points requires a nested loop.
370         for (UChar32 keyChar=fKeySet->getRangeStart(range);
371                 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
372             SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
373             U_ASSERT(targetMapping != nullptr);
374 
375             // Set an error code if trying to consume a long string.  Otherwise,
376             // codePointAndLengthToKey will abort on a U_ASSERT.
377             if (targetMapping->fStr->length() > 256) {
378                 status = U_ILLEGAL_ARGUMENT_ERROR;
379                 return;
380             }
381 
382             int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
383                 targetMapping->fStr->length());
384             int32_t value = targetMapping->fCharOrStrTableIndex;
385 
386             fKeyVec->addElement(key, status);
387             fValueVec->addElement(value, status);
388         }
389     }
390 
391     // Put the assembled data into the flat runtime array
392     outputData(status);
393 
394     // All of the intermediate allocated data belongs to the ConfusabledataBuilder
395     //  object  (this), and is deleted in the destructor.
396     return;
397 }
398 
399 //
400 // outputData     The confusable data has been compiled and stored in intermediate
401 //                collections and strings.  Copy it from there to the final flat
402 //                binary array.
403 //
404 //                Note that as each section is added to the output data, the
405 //                expand (reserveSpace() function will likely relocate it in memory.
406 //                Be careful with pointers.
407 //
outputData(UErrorCode & status)408 void ConfusabledataBuilder::outputData(UErrorCode &status) {
409 
410     U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned);
411 
412     //  The Key Table
413     //     While copying the keys to the runtime array,
414     //       also sanity check that they are sorted.
415 
416     int32_t numKeys = fKeyVec->size();
417     int32_t *keys =
418         static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
419     if (U_FAILURE(status)) {
420         return;
421     }
422     int i;
423     UChar32 previousCodePoint = 0;
424     for (i=0; i<numKeys; i++) {
425         int32_t key =  fKeyVec->elementAti(i);
426         UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
427         (void)previousCodePoint;    // Suppress unused variable warning.
428         // strictly greater because there can be only one entry per code point
429         U_ASSERT(codePoint > previousCodePoint);
430         keys[i] = key;
431         previousCodePoint = codePoint;
432     }
433     SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
434     rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
435     rawData->fCFUKeysSize = numKeys;
436     fSpoofImpl->fSpoofData->fCFUKeys = keys;
437 
438 
439     // The Value Table, parallels the key table
440     int32_t numValues = fValueVec->size();
441     U_ASSERT(numKeys == numValues);
442     uint16_t *values =
443         static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
444     if (U_FAILURE(status)) {
445         return;
446     }
447     for (i=0; i<numValues; i++) {
448         uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
449         U_ASSERT(value < 0xffff);
450         values[i] = static_cast<uint16_t>(value);
451     }
452     rawData = fSpoofImpl->fSpoofData->fRawData;
453     rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
454     rawData->fCFUStringIndexSize = numValues;
455     fSpoofImpl->fSpoofData->fCFUValues = values;
456 
457     // The Strings Table.
458 
459     uint32_t stringsLength = fStringTable->length();
460     // Reserve an extra space so the string will be nul-terminated.  This is
461     // only a convenience, for when debugging; it is not needed otherwise.
462     char16_t *strings =
463         static_cast<char16_t *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(char16_t)+2, status));
464     if (U_FAILURE(status)) {
465         return;
466     }
467     fStringTable->extract(strings, stringsLength+1, status);
468     rawData = fSpoofImpl->fSpoofData->fRawData;
469     U_ASSERT(rawData->fCFUStringTable == 0);
470     rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
471     rawData->fCFUStringTableLen = stringsLength;
472     fSpoofImpl->fSpoofData->fCFUStrings = strings;
473 }
474 
475 #endif
476 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
477 
478