1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2008-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: uspoof_conf.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009Jan05 (refactoring earlier files)
16 * created by: Andy Heninger
17 *
18 * Internal classes for compiling confusable data into its binary (runtime) form.
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uspoof.h"
23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
24 #if !UCONFIG_NO_NORMALIZATION
25
26 #include "unicode/unorm.h"
27 #include "unicode/uregex.h"
28 #include "unicode/ustring.h"
29 #include "cmemory.h"
30 #include "uspoof_impl.h"
31 #include "uhash.h"
32 #include "uvector.h"
33 #include "uassert.h"
34 #include "uarrsort.h"
35 #include "uspoof_conf.h"
36
37 U_NAMESPACE_USE
38
39
40 //---------------------------------------------------------------------
41 //
42 // buildConfusableData Compile the source confusable data, as defined by
43 // the Unicode data file confusables.txt, into the binary
44 // structures used by the confusable detector.
45 //
46 // The binary structures are described in uspoof_impl.h
47 //
48 // 1. Parse the data, making a hash table mapping from a UChar32 to a String.
49 //
50 // 2. Sort all of the strings encountered by length, since they will need to
51 // be stored in that order in the final string table.
52 // TODO: Sorting these strings by length is no longer needed since the removal of
53 // the string lengths table. This logic can be removed to save processing time
54 // when building confusables data.
55 //
56 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
57 // list because that will be the ordering of our runtime table.
58 //
59 // 4. Generate the run time string table. This is generated before the key & value
60 // tables because we need the string indexes when building those tables.
61 //
62 // 5. Build the run-time key and value tables. These are parallel tables, and are built
63 // at the same time
64 //
65
SPUString(UnicodeString * s)66 SPUString::SPUString(UnicodeString *s) {
67 fStr = s;
68 fCharOrStrTableIndex = 0;
69 }
70
71
~SPUString()72 SPUString::~SPUString() {
73 delete fStr;
74 }
75
76
SPUStringPool(UErrorCode & status)77 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
78 fVec = new UVector(status);
79 if (fVec == NULL) {
80 status = U_MEMORY_ALLOCATION_ERROR;
81 return;
82 }
83 fHash = uhash_open(uhash_hashUnicodeString, // key hash function
84 uhash_compareUnicodeString, // Key Comparator
85 NULL, // Value Comparator
86 &status);
87 }
88
89
~SPUStringPool()90 SPUStringPool::~SPUStringPool() {
91 int i;
92 for (i=fVec->size()-1; i>=0; i--) {
93 SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
94 delete s;
95 }
96 delete fVec;
97 uhash_close(fHash);
98 }
99
100
size()101 int32_t SPUStringPool::size() {
102 return fVec->size();
103 }
104
getByIndex(int32_t index)105 SPUString *SPUStringPool::getByIndex(int32_t index) {
106 SPUString *retString = (SPUString *)fVec->elementAt(index);
107 return retString;
108 }
109
110
111 // Comparison function for ordering strings in the string pool.
112 // Compare by length first, then, within a group of the same length,
113 // by code point order.
114 // Conforms to the type signature for a USortComparator in uvector.h
115
SPUStringCompare(UHashTok left,UHashTok right)116 static int32_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
117 const SPUString *sL = const_cast<const SPUString *>(
118 static_cast<SPUString *>(left.pointer));
119 const SPUString *sR = const_cast<const SPUString *>(
120 static_cast<SPUString *>(right.pointer));
121 int32_t lenL = sL->fStr->length();
122 int32_t lenR = sR->fStr->length();
123 if (lenL < lenR) {
124 return -1;
125 } else if (lenL > lenR) {
126 return 1;
127 } else {
128 return sL->fStr->compare(*(sR->fStr));
129 }
130 }
131
sort(UErrorCode & status)132 void SPUStringPool::sort(UErrorCode &status) {
133 fVec->sort(SPUStringCompare, status);
134 }
135
136
addString(UnicodeString * src,UErrorCode & status)137 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
138 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
139 if (hashedString != NULL) {
140 delete src;
141 } else {
142 hashedString = new SPUString(src);
143 if (hashedString == NULL) {
144 status = U_MEMORY_ALLOCATION_ERROR;
145 return NULL;
146 }
147 uhash_put(fHash, src, hashedString, &status);
148 fVec->addElementX(hashedString, status);
149 }
150 return hashedString;
151 }
152
153
154
ConfusabledataBuilder(SpoofImpl * spImpl,UErrorCode & status)155 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
156 fSpoofImpl(spImpl),
157 fInput(NULL),
158 fTable(NULL),
159 fKeySet(NULL),
160 fKeyVec(NULL),
161 fValueVec(NULL),
162 fStringTable(NULL),
163 stringPool(NULL),
164 fParseLine(NULL),
165 fParseHexNum(NULL),
166 fLineNum(0)
167 {
168 if (U_FAILURE(status)) {
169 return;
170 }
171
172 fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
173
174 fKeySet = new UnicodeSet();
175 if (fKeySet == NULL) {
176 status = U_MEMORY_ALLOCATION_ERROR;
177 return;
178 }
179
180 fKeyVec = new UVector(status);
181 if (fKeyVec == NULL) {
182 status = U_MEMORY_ALLOCATION_ERROR;
183 return;
184 }
185
186 fValueVec = new UVector(status);
187 if (fValueVec == NULL) {
188 status = U_MEMORY_ALLOCATION_ERROR;
189 return;
190 }
191
192 stringPool = new SPUStringPool(status);
193 if (stringPool == NULL) {
194 status = U_MEMORY_ALLOCATION_ERROR;
195 return;
196 }
197 }
198
199
~ConfusabledataBuilder()200 ConfusabledataBuilder::~ConfusabledataBuilder() {
201 uprv_free(fInput);
202 uregex_close(fParseLine);
203 uregex_close(fParseHexNum);
204 uhash_close(fTable);
205 delete fKeySet;
206 delete fKeyVec;
207 delete fStringTable;
208 delete fValueVec;
209 delete stringPool;
210 }
211
212
buildConfusableData(SpoofImpl * spImpl,const char * confusables,int32_t confusablesLen,int32_t * errorType,UParseError * pe,UErrorCode & status)213 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
214 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
215
216 if (U_FAILURE(status)) {
217 return;
218 }
219 ConfusabledataBuilder builder(spImpl, status);
220 builder.build(confusables, confusablesLen, status);
221 if (U_FAILURE(status) && errorType != NULL) {
222 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
223 pe->line = builder.fLineNum;
224 }
225 }
226
227
build(const char * confusables,int32_t confusablesLen,UErrorCode & status)228 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
229 UErrorCode &status) {
230
231 // Convert the user input data from UTF-8 to UChar (UTF-16)
232 int32_t inputLen = 0;
233 if (U_FAILURE(status)) {
234 return;
235 }
236 u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
237 if (status != U_BUFFER_OVERFLOW_ERROR) {
238 return;
239 }
240 status = U_ZERO_ERROR;
241 fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
242 if (fInput == NULL) {
243 status = U_MEMORY_ALLOCATION_ERROR;
244 return;
245 }
246 u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
247
248
249 // Regular Expression to parse a line from Confusables.txt. The expression will match
250 // any line. What was matched is determined by examining which capture groups have a match.
251 // Capture Group 1: the source char
252 // Capture Group 2: the replacement chars
253 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
254 // Capture Group 7: A blank or comment only line.
255 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
256 // Example Line from the confusables.txt source file:
257 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
258 UnicodeString pattern(
259 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
260 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
261 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
262 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
263 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
264 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
265 "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines.
266 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
267 fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
268
269 // Regular expression for parsing a hex number out of a space-separated list of them.
270 // Capture group 1 gets the number, with spaces removed.
271 pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
272 fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
273
274 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
275 // given the syntax of the input.
276 if (*fInput == 0xfeff) {
277 *fInput = 0x20;
278 }
279
280 // Parse the input, one line per iteration of this loop.
281 uregex_setText(fParseLine, fInput, inputLen, &status);
282 while (uregex_findNext(fParseLine, &status)) {
283 fLineNum++;
284 if (uregex_start(fParseLine, 7, &status) >= 0) {
285 // this was a blank or comment line.
286 continue;
287 }
288 if (uregex_start(fParseLine, 8, &status) >= 0) {
289 // input file syntax error.
290 status = U_PARSE_ERROR;
291 return;
292 }
293
294 // We have a good input line. Extract the key character and mapping string, and
295 // put them into the appropriate mapping table.
296 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
297 uregex_end(fParseLine, 1, &status), status);
298
299 int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
300 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
301 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
302
303 UnicodeString *mapString = new UnicodeString();
304 if (mapString == NULL) {
305 status = U_MEMORY_ALLOCATION_ERROR;
306 return;
307 }
308 while (uregex_findNext(fParseHexNum, &status)) {
309 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
310 uregex_end(fParseHexNum, 1, &status), status);
311 mapString->append(c);
312 }
313 U_ASSERT(mapString->length() >= 1);
314
315 // Put the map (value) string into the string pool
316 // This a little like a Java intern() - any duplicates will be eliminated.
317 SPUString *smapString = stringPool->addString(mapString, status);
318
319 // Add the UChar32 -> string mapping to the table.
320 // For Unicode 8, the SL, SA and ML tables have been discontinued.
321 // All input data from confusables.txt is tagged MA.
322 uhash_iput(fTable, keyChar, smapString, &status);
323 if (U_FAILURE(status)) { return; }
324 fKeySet->add(keyChar);
325 }
326
327 // Input data is now all parsed and collected.
328 // Now create the run-time binary form of the data.
329 //
330 // This is done in two steps. First the data is assembled into vectors and strings,
331 // for ease of construction, then the contents of these collections are dumped
332 // into the actual raw-bytes data storage.
333
334 // Build up the string array, and record the index of each string therein
335 // in the (build time only) string pool.
336 // Strings of length one are not entered into the strings array.
337 // (Strings in the table are sorted by length)
338 stringPool->sort(status);
339 fStringTable = new UnicodeString();
340 int32_t poolSize = stringPool->size();
341 int32_t i;
342 for (i=0; i<poolSize; i++) {
343 SPUString *s = stringPool->getByIndex(i);
344 int32_t strLen = s->fStr->length();
345 int32_t strIndex = fStringTable->length();
346 if (strLen == 1) {
347 // strings of length one do not get an entry in the string table.
348 // Keep the single string character itself here, which is the same
349 // convention that is used in the final run-time string table index.
350 s->fCharOrStrTableIndex = s->fStr->charAt(0);
351 } else {
352 s->fCharOrStrTableIndex = strIndex;
353 fStringTable->append(*(s->fStr));
354 }
355 }
356
357 // Construct the compile-time Key and Value tables
358 //
359 // For each key code point, check which mapping tables it applies to,
360 // and create the final data for the key & value structures.
361 //
362 // The four logical mapping tables are conflated into one combined table.
363 // If multiple logical tables have the same mapping for some key, they
364 // share a single entry in the combined table.
365 // If more than one mapping exists for the same key code point, multiple
366 // entries will be created in the table
367
368 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
369 // It is an oddity of the UnicodeSet API that simply enumerating the contained
370 // code points requires a nested loop.
371 for (UChar32 keyChar=fKeySet->getRangeStart(range);
372 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
373 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
374 U_ASSERT(targetMapping != NULL);
375
376 // Set an error code if trying to consume a long string. Otherwise,
377 // codePointAndLengthToKey will abort on a U_ASSERT.
378 if (targetMapping->fStr->length() > 256) {
379 status = U_ILLEGAL_ARGUMENT_ERROR;
380 return;
381 }
382
383 int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
384 targetMapping->fStr->length());
385 int32_t value = targetMapping->fCharOrStrTableIndex;
386
387 fKeyVec->addElement(key, status);
388 fValueVec->addElement(value, status);
389 }
390 }
391
392 // Put the assembled data into the flat runtime array
393 outputData(status);
394
395 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
396 // object (this), and is deleted in the destructor.
397 return;
398 }
399
400 //
401 // outputData The confusable data has been compiled and stored in intermediate
402 // collections and strings. Copy it from there to the final flat
403 // binary array.
404 //
405 // Note that as each section is added to the output data, the
406 // expand (reserveSpace() function will likely relocate it in memory.
407 // Be careful with pointers.
408 //
outputData(UErrorCode & status)409 void ConfusabledataBuilder::outputData(UErrorCode &status) {
410
411 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
412
413 // The Key Table
414 // While copying the keys to the runtime array,
415 // also sanity check that they are sorted.
416
417 int32_t numKeys = fKeyVec->size();
418 int32_t *keys =
419 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
420 if (U_FAILURE(status)) {
421 return;
422 }
423 int i;
424 UChar32 previousCodePoint = 0;
425 for (i=0; i<numKeys; i++) {
426 int32_t key = fKeyVec->elementAti(i);
427 UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
428 (void)previousCodePoint; // Suppress unused variable warning.
429 // strictly greater because there can be only one entry per code point
430 U_ASSERT(codePoint > previousCodePoint);
431 keys[i] = key;
432 previousCodePoint = codePoint;
433 }
434 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
435 rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
436 rawData->fCFUKeysSize = numKeys;
437 fSpoofImpl->fSpoofData->fCFUKeys = keys;
438
439
440 // The Value Table, parallels the key table
441 int32_t numValues = fValueVec->size();
442 U_ASSERT(numKeys == numValues);
443 uint16_t *values =
444 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
445 if (U_FAILURE(status)) {
446 return;
447 }
448 for (i=0; i<numValues; i++) {
449 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
450 U_ASSERT(value < 0xffff);
451 values[i] = static_cast<uint16_t>(value);
452 }
453 rawData = fSpoofImpl->fSpoofData->fRawData;
454 rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
455 rawData->fCFUStringIndexSize = numValues;
456 fSpoofImpl->fSpoofData->fCFUValues = values;
457
458 // The Strings Table.
459
460 uint32_t stringsLength = fStringTable->length();
461 // Reserve an extra space so the string will be nul-terminated. This is
462 // only a convenience, for when debugging; it is not needed otherwise.
463 UChar *strings =
464 static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
465 if (U_FAILURE(status)) {
466 return;
467 }
468 fStringTable->extract(strings, stringsLength+1, status);
469 rawData = fSpoofImpl->fSpoofData->fRawData;
470 U_ASSERT(rawData->fCFUStringTable == 0);
471 rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
472 rawData->fCFUStringTableLen = stringsLength;
473 fSpoofImpl->fSpoofData->fCFUStrings = strings;
474 }
475
476 #endif
477 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
478
479