1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2008-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: uspoof_conf.cpp
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009Jan05 (refactoring earlier files)
16 * created by: Andy Heninger
17 *
18 * Internal classes for compililing confusable data into its binary (runtime) form.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #if !UCONFIG_NO_NORMALIZATION
25
26 #include "unicode/unorm.h"
27 #include "unicode/uregex.h"
28 #include "unicode/ustring.h"
29 #include "cmemory.h"
30 #include "uspoof_impl.h"
31 #include "uhash.h"
32 #include "uvector.h"
33 #include "uassert.h"
34 #include "uarrsort.h"
35 #include "uspoof_conf.h"
36
37 U_NAMESPACE_USE
38
39
40 //---------------------------------------------------------------------
41 //
42 // buildConfusableData Compile the source confusable data, as defined by
43 // the Unicode data file confusables.txt, into the binary
44 // structures used by the confusable detector.
45 //
46 // The binary structures are described in uspoof_impl.h
47 //
48 // 1. Parse the data, making a hash table mapping from a UChar32 to a String.
49 //
50 // 2. Sort all of the strings encountered by length, since they will need to
51 // be stored in that order in the final string table.
52 // TODO: Sorting these strings by length is no longer needed since the removal of
53 // the string lengths table. This logic can be removed to save processing time
54 // when building confusables data.
55 //
56 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
57 // list because that will be the ordering of our runtime table.
58 //
59 // 4. Generate the run time string table. This is generated before the key & value
60 // tables because we need the string indexes when building those tables.
61 //
62 // 5. Build the run-time key and value tables. These are parallel tables, and are built
63 // at the same time
64 //
65
SPUString(UnicodeString * s)66 SPUString::SPUString(UnicodeString *s) {
67 fStr = s;
68 fCharOrStrTableIndex = 0;
69 }
70
71
~SPUString()72 SPUString::~SPUString() {
73 delete fStr;
74 }
75
76
SPUStringPool(UErrorCode & status)77 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
78 fVec = new UVector(status);
79 fHash = uhash_open(uhash_hashUnicodeString, // key hash function
80 uhash_compareUnicodeString, // Key Comparator
81 NULL, // Value Comparator
82 &status);
83 }
84
85
~SPUStringPool()86 SPUStringPool::~SPUStringPool() {
87 int i;
88 for (i=fVec->size()-1; i>=0; i--) {
89 SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
90 delete s;
91 }
92 delete fVec;
93 uhash_close(fHash);
94 }
95
96
size()97 int32_t SPUStringPool::size() {
98 return fVec->size();
99 }
100
getByIndex(int32_t index)101 SPUString *SPUStringPool::getByIndex(int32_t index) {
102 SPUString *retString = (SPUString *)fVec->elementAt(index);
103 return retString;
104 }
105
106
107 // Comparison function for ordering strings in the string pool.
108 // Compare by length first, then, within a group of the same length,
109 // by code point order.
110 // Conforms to the type signature for a USortComparator in uvector.h
111
SPUStringCompare(UHashTok left,UHashTok right)112 static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
113 const SPUString *sL = const_cast<const SPUString *>(
114 static_cast<SPUString *>(left.pointer));
115 const SPUString *sR = const_cast<const SPUString *>(
116 static_cast<SPUString *>(right.pointer));
117 int32_t lenL = sL->fStr->length();
118 int32_t lenR = sR->fStr->length();
119 if (lenL < lenR) {
120 return -1;
121 } else if (lenL > lenR) {
122 return 1;
123 } else {
124 return sL->fStr->compare(*(sR->fStr));
125 }
126 }
127
sort(UErrorCode & status)128 void SPUStringPool::sort(UErrorCode &status) {
129 fVec->sort(SPUStringCompare, status);
130 }
131
132
addString(UnicodeString * src,UErrorCode & status)133 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
134 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
135 if (hashedString != NULL) {
136 delete src;
137 } else {
138 hashedString = new SPUString(src);
139 uhash_put(fHash, src, hashedString, &status);
140 fVec->addElement(hashedString, status);
141 }
142 return hashedString;
143 }
144
145
146
ConfusabledataBuilder(SpoofImpl * spImpl,UErrorCode & status)147 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
148 fSpoofImpl(spImpl),
149 fInput(NULL),
150 fTable(NULL),
151 fKeySet(NULL),
152 fKeyVec(NULL),
153 fValueVec(NULL),
154 fStringTable(NULL),
155 stringPool(NULL),
156 fParseLine(NULL),
157 fParseHexNum(NULL),
158 fLineNum(0)
159 {
160 if (U_FAILURE(status)) {
161 return;
162 }
163 fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
164 fKeySet = new UnicodeSet();
165 fKeyVec = new UVector(status);
166 fValueVec = new UVector(status);
167 stringPool = new SPUStringPool(status);
168 }
169
170
~ConfusabledataBuilder()171 ConfusabledataBuilder::~ConfusabledataBuilder() {
172 uprv_free(fInput);
173 uregex_close(fParseLine);
174 uregex_close(fParseHexNum);
175 uhash_close(fTable);
176 delete fKeySet;
177 delete fKeyVec;
178 delete fStringTable;
179 delete fValueVec;
180 delete stringPool;
181 }
182
183
buildConfusableData(SpoofImpl * spImpl,const char * confusables,int32_t confusablesLen,int32_t * errorType,UParseError * pe,UErrorCode & status)184 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
185 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
186
187 if (U_FAILURE(status)) {
188 return;
189 }
190 ConfusabledataBuilder builder(spImpl, status);
191 builder.build(confusables, confusablesLen, status);
192 if (U_FAILURE(status) && errorType != NULL) {
193 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
194 pe->line = builder.fLineNum;
195 }
196 }
197
198
build(const char * confusables,int32_t confusablesLen,UErrorCode & status)199 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
200 UErrorCode &status) {
201
202 // Convert the user input data from UTF-8 to UChar (UTF-16)
203 int32_t inputLen = 0;
204 if (U_FAILURE(status)) {
205 return;
206 }
207 u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
208 if (status != U_BUFFER_OVERFLOW_ERROR) {
209 return;
210 }
211 status = U_ZERO_ERROR;
212 fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
213 if (fInput == NULL) {
214 status = U_MEMORY_ALLOCATION_ERROR;
215 return;
216 }
217 u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
218
219
220 // Regular Expression to parse a line from Confusables.txt. The expression will match
221 // any line. What was matched is determined by examining which capture groups have a match.
222 // Capture Group 1: the source char
223 // Capture Group 2: the replacement chars
224 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
225 // Capture Group 7: A blank or comment only line.
226 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
227 // Example Line from the confusables.txt source file:
228 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
229 UnicodeString pattern(
230 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
231 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
232 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
233 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
234 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
235 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
236 "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines.
237 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
238 fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
239
240 // Regular expression for parsing a hex number out of a space-separated list of them.
241 // Capture group 1 gets the number, with spaces removed.
242 pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
243 fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
244
245 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
246 // given the syntax of the input.
247 if (*fInput == 0xfeff) {
248 *fInput = 0x20;
249 }
250
251 // Parse the input, one line per iteration of this loop.
252 uregex_setText(fParseLine, fInput, inputLen, &status);
253 while (uregex_findNext(fParseLine, &status)) {
254 fLineNum++;
255 if (uregex_start(fParseLine, 7, &status) >= 0) {
256 // this was a blank or comment line.
257 continue;
258 }
259 if (uregex_start(fParseLine, 8, &status) >= 0) {
260 // input file syntax error.
261 status = U_PARSE_ERROR;
262 return;
263 }
264
265 // We have a good input line. Extract the key character and mapping string, and
266 // put them into the appropriate mapping table.
267 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
268 uregex_end(fParseLine, 1, &status), status);
269
270 int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
271 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
272 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
273
274 UnicodeString *mapString = new UnicodeString();
275 if (mapString == NULL) {
276 status = U_MEMORY_ALLOCATION_ERROR;
277 return;
278 }
279 while (uregex_findNext(fParseHexNum, &status)) {
280 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
281 uregex_end(fParseHexNum, 1, &status), status);
282 mapString->append(c);
283 }
284 U_ASSERT(mapString->length() >= 1);
285
286 // Put the map (value) string into the string pool
287 // This a little like a Java intern() - any duplicates will be eliminated.
288 SPUString *smapString = stringPool->addString(mapString, status);
289
290 // Add the UChar32 -> string mapping to the table.
291 // For Unicode 8, the SL, SA and ML tables have been discontinued.
292 // All input data from confusables.txt is tagged MA.
293 uhash_iput(fTable, keyChar, smapString, &status);
294 if (U_FAILURE(status)) { return; }
295 fKeySet->add(keyChar);
296 }
297
298 // Input data is now all parsed and collected.
299 // Now create the run-time binary form of the data.
300 //
301 // This is done in two steps. First the data is assembled into vectors and strings,
302 // for ease of construction, then the contents of these collections are dumped
303 // into the actual raw-bytes data storage.
304
305 // Build up the string array, and record the index of each string therein
306 // in the (build time only) string pool.
307 // Strings of length one are not entered into the strings array.
308 // (Strings in the table are sorted by length)
309 stringPool->sort(status);
310 fStringTable = new UnicodeString();
311 int32_t poolSize = stringPool->size();
312 int32_t i;
313 for (i=0; i<poolSize; i++) {
314 SPUString *s = stringPool->getByIndex(i);
315 int32_t strLen = s->fStr->length();
316 int32_t strIndex = fStringTable->length();
317 if (strLen == 1) {
318 // strings of length one do not get an entry in the string table.
319 // Keep the single string character itself here, which is the same
320 // convention that is used in the final run-time string table index.
321 s->fCharOrStrTableIndex = s->fStr->charAt(0);
322 } else {
323 s->fCharOrStrTableIndex = strIndex;
324 fStringTable->append(*(s->fStr));
325 }
326 }
327
328 // Construct the compile-time Key and Value tables
329 //
330 // For each key code point, check which mapping tables it applies to,
331 // and create the final data for the key & value structures.
332 //
333 // The four logical mapping tables are conflated into one combined table.
334 // If multiple logical tables have the same mapping for some key, they
335 // share a single entry in the combined table.
336 // If more than one mapping exists for the same key code point, multiple
337 // entries will be created in the table
338
339 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
340 // It is an oddity of the UnicodeSet API that simply enumerating the contained
341 // code points requires a nested loop.
342 for (UChar32 keyChar=fKeySet->getRangeStart(range);
343 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
344 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
345 U_ASSERT(targetMapping != NULL);
346
347 // Set an error code if trying to consume a long string. Otherwise,
348 // codePointAndLengthToKey will abort on a U_ASSERT.
349 if (targetMapping->fStr->length() > 256) {
350 status = U_ILLEGAL_ARGUMENT_ERROR;
351 return;
352 }
353
354 int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
355 targetMapping->fStr->length());
356 int32_t value = targetMapping->fCharOrStrTableIndex;
357
358 fKeyVec->addElement(key, status);
359 fValueVec->addElement(value, status);
360 }
361 }
362
363 // Put the assembled data into the flat runtime array
364 outputData(status);
365
366 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
367 // object (this), and is deleted in the destructor.
368 return;
369 }
370
371 //
372 // outputData The confusable data has been compiled and stored in intermediate
373 // collections and strings. Copy it from there to the final flat
374 // binary array.
375 //
376 // Note that as each section is added to the output data, the
377 // expand (reserveSpace() function will likely relocate it in memory.
378 // Be careful with pointers.
379 //
outputData(UErrorCode & status)380 void ConfusabledataBuilder::outputData(UErrorCode &status) {
381
382 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
383
384 // The Key Table
385 // While copying the keys to the runtime array,
386 // also sanity check that they are sorted.
387
388 int32_t numKeys = fKeyVec->size();
389 int32_t *keys =
390 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
391 if (U_FAILURE(status)) {
392 return;
393 }
394 int i;
395 UChar32 previousCodePoint = 0;
396 for (i=0; i<numKeys; i++) {
397 int32_t key = fKeyVec->elementAti(i);
398 UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
399 // strictly greater because there can be only one entry per code point
400 U_ASSERT(codePoint > previousCodePoint);
401 keys[i] = key;
402 previousCodePoint = codePoint;
403 }
404 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
405 rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
406 rawData->fCFUKeysSize = numKeys;
407 fSpoofImpl->fSpoofData->fCFUKeys = keys;
408
409
410 // The Value Table, parallels the key table
411 int32_t numValues = fValueVec->size();
412 U_ASSERT(numKeys == numValues);
413 uint16_t *values =
414 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
415 if (U_FAILURE(status)) {
416 return;
417 }
418 for (i=0; i<numValues; i++) {
419 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
420 U_ASSERT(value < 0xffff);
421 values[i] = static_cast<uint16_t>(value);
422 }
423 rawData = fSpoofImpl->fSpoofData->fRawData;
424 rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
425 rawData->fCFUStringIndexSize = numValues;
426 fSpoofImpl->fSpoofData->fCFUValues = values;
427
428 // The Strings Table.
429
430 uint32_t stringsLength = fStringTable->length();
431 // Reserve an extra space so the string will be nul-terminated. This is
432 // only a convenience, for when debugging; it is not needed otherwise.
433 UChar *strings =
434 static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
435 if (U_FAILURE(status)) {
436 return;
437 }
438 fStringTable->extract(strings, stringsLength+1, status);
439 rawData = fSpoofImpl->fSpoofData->fRawData;
440 U_ASSERT(rawData->fCFUStringTable == 0);
441 rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
442 rawData->fCFUStringTableLen = stringsLength;
443 fSpoofImpl->fSpoofData->fCFUStrings = strings;
444 }
445
446 #endif
447 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
448
449