1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2008-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: uspoof_conf.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009Jan05 (refactoring earlier files)
16 * created by: Andy Heninger
17 *
18 * Internal classes for compiling confusable data into its binary (runtime) form.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #if !UCONFIG_NO_NORMALIZATION
25
26 #include "unicode/unorm.h"
27 #include "unicode/uregex.h"
28 #include "unicode/ustring.h"
29 #include "cmemory.h"
30 #include "uspoof_impl.h"
31 #include "uhash.h"
32 #include "uvector.h"
33 #include "uassert.h"
34 #include "uarrsort.h"
35 #include "uspoof_conf.h"
36
37 U_NAMESPACE_USE
38
39
40 //---------------------------------------------------------------------
41 //
42 // buildConfusableData Compile the source confusable data, as defined by
43 // the Unicode data file confusables.txt, into the binary
44 // structures used by the confusable detector.
45 //
46 // The binary structures are described in uspoof_impl.h
47 //
48 // 1. Parse the data, making a hash table mapping from a UChar32 to a String.
49 //
50 // 2. Sort all of the strings encountered by length, since they will need to
51 // be stored in that order in the final string table.
52 // TODO: Sorting these strings by length is no longer needed since the removal of
53 // the string lengths table. This logic can be removed to save processing time
54 // when building confusables data.
55 //
56 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
57 // list because that will be the ordering of our runtime table.
58 //
59 // 4. Generate the run time string table. This is generated before the key & value
60 // tables because we need the string indexes when building those tables.
61 //
62 // 5. Build the run-time key and value tables. These are parallel tables, and are built
63 // at the same time
64 //
65
SPUString(LocalPointer<UnicodeString> s)66 SPUString::SPUString(LocalPointer<UnicodeString> s) {
67 fStr = std::move(s);
68 fCharOrStrTableIndex = 0;
69 }
70
71
~SPUString()72 SPUString::~SPUString() {
73 }
74
75
SPUStringPool(UErrorCode & status)76 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(nullptr), fHash(nullptr) {
77 LocalPointer<UVector> vec(new UVector(status), status);
78 if (U_FAILURE(status)) {
79 return;
80 }
81 vec->setDeleter(
82 [](void *obj) {delete (SPUString *)obj;});
83 fVec = vec.orphan();
84 fHash = uhash_open(uhash_hashUnicodeString, // key hash function
85 uhash_compareUnicodeString, // Key Comparator
86 nullptr, // Value Comparator
87 &status);
88 }
89
90
~SPUStringPool()91 SPUStringPool::~SPUStringPool() {
92 delete fVec;
93 uhash_close(fHash);
94 }
95
96
size()97 int32_t SPUStringPool::size() {
98 return fVec->size();
99 }
100
getByIndex(int32_t index)101 SPUString *SPUStringPool::getByIndex(int32_t index) {
102 SPUString *retString = (SPUString *)fVec->elementAt(index);
103 return retString;
104 }
105
106
107 // Comparison function for ordering strings in the string pool.
108 // Compare by length first, then, within a group of the same length,
109 // by code point order.
110 // Conforms to the type signature for a USortComparator in uvector.h
111
SPUStringCompare(UHashTok left,UHashTok right)112 static int32_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
113 const SPUString *sL = const_cast<const SPUString *>(
114 static_cast<SPUString *>(left.pointer));
115 const SPUString *sR = const_cast<const SPUString *>(
116 static_cast<SPUString *>(right.pointer));
117 int32_t lenL = sL->fStr->length();
118 int32_t lenR = sR->fStr->length();
119 if (lenL < lenR) {
120 return -1;
121 } else if (lenL > lenR) {
122 return 1;
123 } else {
124 return sL->fStr->compare(*(sR->fStr));
125 }
126 }
127
sort(UErrorCode & status)128 void SPUStringPool::sort(UErrorCode &status) {
129 fVec->sort(SPUStringCompare, status);
130 }
131
132
addString(UnicodeString * src,UErrorCode & status)133 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
134 LocalPointer<UnicodeString> lpSrc(src);
135 if (U_FAILURE(status)) {
136 return nullptr;
137 }
138 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
139 if (hashedString != nullptr) {
140 return hashedString;
141 }
142 LocalPointer<SPUString> spuStr(new SPUString(std::move(lpSrc)), status);
143 hashedString = spuStr.getAlias();
144 fVec->adoptElement(spuStr.orphan(), status);
145 if (U_FAILURE(status)) {
146 return nullptr;
147 }
148 uhash_put(fHash, src, hashedString, &status);
149 return hashedString;
150 }
151
152
153
ConfusabledataBuilder(SpoofImpl * spImpl,UErrorCode & status)154 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
155 fSpoofImpl(spImpl),
156 fInput(nullptr),
157 fTable(nullptr),
158 fKeySet(nullptr),
159 fKeyVec(nullptr),
160 fValueVec(nullptr),
161 fStringTable(nullptr),
162 stringPool(nullptr),
163 fParseLine(nullptr),
164 fParseHexNum(nullptr),
165 fLineNum(0)
166 {
167 if (U_FAILURE(status)) {
168 return;
169 }
170
171 fTable = uhash_open(uhash_hashLong, uhash_compareLong, nullptr, &status);
172
173 fKeySet = new UnicodeSet();
174 if (fKeySet == nullptr) {
175 status = U_MEMORY_ALLOCATION_ERROR;
176 return;
177 }
178
179 fKeyVec = new UVector(status);
180 if (fKeyVec == nullptr) {
181 status = U_MEMORY_ALLOCATION_ERROR;
182 return;
183 }
184
185 fValueVec = new UVector(status);
186 if (fValueVec == nullptr) {
187 status = U_MEMORY_ALLOCATION_ERROR;
188 return;
189 }
190
191 stringPool = new SPUStringPool(status);
192 if (stringPool == nullptr) {
193 status = U_MEMORY_ALLOCATION_ERROR;
194 return;
195 }
196 }
197
198
~ConfusabledataBuilder()199 ConfusabledataBuilder::~ConfusabledataBuilder() {
200 uprv_free(fInput);
201 uregex_close(fParseLine);
202 uregex_close(fParseHexNum);
203 uhash_close(fTable);
204 delete fKeySet;
205 delete fKeyVec;
206 delete fStringTable;
207 delete fValueVec;
208 delete stringPool;
209 }
210
211
buildConfusableData(SpoofImpl * spImpl,const char * confusables,int32_t confusablesLen,int32_t * errorType,UParseError * pe,UErrorCode & status)212 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
213 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
214
215 if (U_FAILURE(status)) {
216 return;
217 }
218 ConfusabledataBuilder builder(spImpl, status);
219 builder.build(confusables, confusablesLen, status);
220 if (U_FAILURE(status) && errorType != nullptr) {
221 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
222 pe->line = builder.fLineNum;
223 }
224 }
225
226
build(const char * confusables,int32_t confusablesLen,UErrorCode & status)227 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
228 UErrorCode &status) {
229
230 // Convert the user input data from UTF-8 to char16_t (UTF-16)
231 int32_t inputLen = 0;
232 if (U_FAILURE(status)) {
233 return;
234 }
235 u_strFromUTF8(nullptr, 0, &inputLen, confusables, confusablesLen, &status);
236 if (status != U_BUFFER_OVERFLOW_ERROR) {
237 return;
238 }
239 status = U_ZERO_ERROR;
240 fInput = static_cast<char16_t *>(uprv_malloc((inputLen+1) * sizeof(char16_t)));
241 if (fInput == nullptr) {
242 status = U_MEMORY_ALLOCATION_ERROR;
243 return;
244 }
245 u_strFromUTF8(fInput, inputLen+1, nullptr, confusables, confusablesLen, &status);
246
247
248 // Regular Expression to parse a line from Confusables.txt. The expression will match
249 // any line. What was matched is determined by examining which capture groups have a match.
250 // Capture Group 1: the source char
251 // Capture Group 2: the replacement chars
252 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
253 // Capture Group 7: A blank or comment only line.
254 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
255 // Example Line from the confusables.txt source file:
256 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
257 UnicodeString pattern(
258 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
259 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
260 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
261 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
262 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
263 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
264 "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines.
265 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
266 fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status);
267
268 // Regular expression for parsing a hex number out of a space-separated list of them.
269 // Capture group 1 gets the number, with spaces removed.
270 pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
271 fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status);
272
273 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
274 // given the syntax of the input.
275 if (*fInput == 0xfeff) {
276 *fInput = 0x20;
277 }
278
279 // Parse the input, one line per iteration of this loop.
280 uregex_setText(fParseLine, fInput, inputLen, &status);
281 while (uregex_findNext(fParseLine, &status)) {
282 fLineNum++;
283 if (uregex_start(fParseLine, 7, &status) >= 0) {
284 // this was a blank or comment line.
285 continue;
286 }
287 if (uregex_start(fParseLine, 8, &status) >= 0) {
288 // input file syntax error.
289 status = U_PARSE_ERROR;
290 return;
291 }
292
293 // We have a good input line. Extract the key character and mapping string, and
294 // put them into the appropriate mapping table.
295 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
296 uregex_end(fParseLine, 1, &status), status);
297
298 int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
299 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
300 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
301
302 UnicodeString *mapString = new UnicodeString();
303 if (mapString == nullptr) {
304 status = U_MEMORY_ALLOCATION_ERROR;
305 return;
306 }
307 while (uregex_findNext(fParseHexNum, &status)) {
308 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
309 uregex_end(fParseHexNum, 1, &status), status);
310 mapString->append(c);
311 }
312 U_ASSERT(mapString->length() >= 1);
313
314 // Put the map (value) string into the string pool
315 // This a little like a Java intern() - any duplicates will be eliminated.
316 SPUString *smapString = stringPool->addString(mapString, status);
317
318 // Add the UChar32 -> string mapping to the table.
319 // For Unicode 8, the SL, SA and ML tables have been discontinued.
320 // All input data from confusables.txt is tagged MA.
321 uhash_iput(fTable, keyChar, smapString, &status);
322 if (U_FAILURE(status)) { return; }
323 fKeySet->add(keyChar);
324 }
325
326 // Input data is now all parsed and collected.
327 // Now create the run-time binary form of the data.
328 //
329 // This is done in two steps. First the data is assembled into vectors and strings,
330 // for ease of construction, then the contents of these collections are dumped
331 // into the actual raw-bytes data storage.
332
333 // Build up the string array, and record the index of each string therein
334 // in the (build time only) string pool.
335 // Strings of length one are not entered into the strings array.
336 // (Strings in the table are sorted by length)
337 stringPool->sort(status);
338 fStringTable = new UnicodeString();
339 int32_t poolSize = stringPool->size();
340 int32_t i;
341 for (i=0; i<poolSize; i++) {
342 SPUString *s = stringPool->getByIndex(i);
343 int32_t strLen = s->fStr->length();
344 int32_t strIndex = fStringTable->length();
345 if (strLen == 1) {
346 // strings of length one do not get an entry in the string table.
347 // Keep the single string character itself here, which is the same
348 // convention that is used in the final run-time string table index.
349 s->fCharOrStrTableIndex = s->fStr->charAt(0);
350 } else {
351 s->fCharOrStrTableIndex = strIndex;
352 fStringTable->append(*(s->fStr));
353 }
354 }
355
356 // Construct the compile-time Key and Value tables
357 //
358 // For each key code point, check which mapping tables it applies to,
359 // and create the final data for the key & value structures.
360 //
361 // The four logical mapping tables are conflated into one combined table.
362 // If multiple logical tables have the same mapping for some key, they
363 // share a single entry in the combined table.
364 // If more than one mapping exists for the same key code point, multiple
365 // entries will be created in the table
366
367 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
368 // It is an oddity of the UnicodeSet API that simply enumerating the contained
369 // code points requires a nested loop.
370 for (UChar32 keyChar=fKeySet->getRangeStart(range);
371 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
372 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
373 U_ASSERT(targetMapping != nullptr);
374
375 // Set an error code if trying to consume a long string. Otherwise,
376 // codePointAndLengthToKey will abort on a U_ASSERT.
377 if (targetMapping->fStr->length() > 256) {
378 status = U_ILLEGAL_ARGUMENT_ERROR;
379 return;
380 }
381
382 int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
383 targetMapping->fStr->length());
384 int32_t value = targetMapping->fCharOrStrTableIndex;
385
386 fKeyVec->addElement(key, status);
387 fValueVec->addElement(value, status);
388 }
389 }
390
391 // Put the assembled data into the flat runtime array
392 outputData(status);
393
394 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
395 // object (this), and is deleted in the destructor.
396 return;
397 }
398
399 //
400 // outputData The confusable data has been compiled and stored in intermediate
401 // collections and strings. Copy it from there to the final flat
402 // binary array.
403 //
404 // Note that as each section is added to the output data, the
405 // expand (reserveSpace() function will likely relocate it in memory.
406 // Be careful with pointers.
407 //
outputData(UErrorCode & status)408 void ConfusabledataBuilder::outputData(UErrorCode &status) {
409
410 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned);
411
412 // The Key Table
413 // While copying the keys to the runtime array,
414 // also sanity check that they are sorted.
415
416 int32_t numKeys = fKeyVec->size();
417 int32_t *keys =
418 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
419 if (U_FAILURE(status)) {
420 return;
421 }
422 int i;
423 UChar32 previousCodePoint = 0;
424 for (i=0; i<numKeys; i++) {
425 int32_t key = fKeyVec->elementAti(i);
426 UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
427 (void)previousCodePoint; // Suppress unused variable warning.
428 // strictly greater because there can be only one entry per code point
429 U_ASSERT(codePoint > previousCodePoint);
430 keys[i] = key;
431 previousCodePoint = codePoint;
432 }
433 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
434 rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
435 rawData->fCFUKeysSize = numKeys;
436 fSpoofImpl->fSpoofData->fCFUKeys = keys;
437
438
439 // The Value Table, parallels the key table
440 int32_t numValues = fValueVec->size();
441 U_ASSERT(numKeys == numValues);
442 uint16_t *values =
443 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
444 if (U_FAILURE(status)) {
445 return;
446 }
447 for (i=0; i<numValues; i++) {
448 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
449 U_ASSERT(value < 0xffff);
450 values[i] = static_cast<uint16_t>(value);
451 }
452 rawData = fSpoofImpl->fSpoofData->fRawData;
453 rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
454 rawData->fCFUStringIndexSize = numValues;
455 fSpoofImpl->fSpoofData->fCFUValues = values;
456
457 // The Strings Table.
458
459 uint32_t stringsLength = fStringTable->length();
460 // Reserve an extra space so the string will be nul-terminated. This is
461 // only a convenience, for when debugging; it is not needed otherwise.
462 char16_t *strings =
463 static_cast<char16_t *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(char16_t)+2, status));
464 if (U_FAILURE(status)) {
465 return;
466 }
467 fStringTable->extract(strings, stringsLength+1, status);
468 rawData = fSpoofImpl->fSpoofData->fRawData;
469 U_ASSERT(rawData->fCFUStringTable == 0);
470 rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
471 rawData->fCFUStringTableLen = stringsLength;
472 fSpoofImpl->fSpoofData->fCFUStrings = strings;
473 }
474
475 #endif
476 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
477
478