1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2008-2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: uspoof_conf.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009Jan05 (refactoring earlier files)
14 * created by: Andy Heninger
15 *
*   Internal classes for compiling confusable data into its binary (runtime) form.
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/uspoof.h"
21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
22 #if !UCONFIG_NO_NORMALIZATION
23
24 #include "unicode/unorm.h"
25 #include "unicode/uregex.h"
26 #include "unicode/ustring.h"
27 #include "cmemory.h"
28 #include "uspoof_impl.h"
29 #include "uhash.h"
30 #include "uvector.h"
31 #include "uassert.h"
32 #include "uarrsort.h"
33 #include "uspoof_conf.h"
34
35 U_NAMESPACE_USE
36
37
38 //---------------------------------------------------------------------
39 //
40 // buildConfusableData Compile the source confusable data, as defined by
41 // the Unicode data file confusables.txt, into the binary
42 // structures used by the confusable detector.
43 //
44 // The binary structures are described in uspoof_impl.h
45 //
46 // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
47 // tables. Each maps from a UChar32 to a String.
48 //
49 // 2. Sort all of the strings encountered by length, since they will need to
50 // be stored in that order in the final string table.
51 //
52 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
53 // list because that will be the ordering of our runtime table.
54 //
55 // 4. Generate the run time string table. This is generated before the key & value
56 // tables because we need the string indexes when building those tables.
57 //
58 // 5. Build the run-time key and value tables. These are parallel tables, and are built
59 // at the same time
60 //
61
SPUString(UnicodeString * s)62 SPUString::SPUString(UnicodeString *s) {
63 fStr = s;
64 fStrTableIndex = 0;
65 }
66
67
~SPUString()68 SPUString::~SPUString() {
69 delete fStr;
70 }
71
72
SPUStringPool(UErrorCode & status)73 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
74 fVec = new UVector(status);
75 fHash = uhash_open(uhash_hashUnicodeString, // key hash function
76 uhash_compareUnicodeString, // Key Comparator
77 NULL, // Value Comparator
78 &status);
79 }
80
81
~SPUStringPool()82 SPUStringPool::~SPUStringPool() {
83 int i;
84 for (i=fVec->size()-1; i>=0; i--) {
85 SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
86 delete s;
87 }
88 delete fVec;
89 uhash_close(fHash);
90 }
91
92
size()93 int32_t SPUStringPool::size() {
94 return fVec->size();
95 }
96
getByIndex(int32_t index)97 SPUString *SPUStringPool::getByIndex(int32_t index) {
98 SPUString *retString = (SPUString *)fVec->elementAt(index);
99 return retString;
100 }
101
102
103 // Comparison function for ordering strings in the string pool.
104 // Compare by length first, then, within a group of the same length,
105 // by code point order.
106 // Conforms to the type signature for a USortComparator in uvector.h
107
SPUStringCompare(UHashTok left,UHashTok right)108 static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
109 const SPUString *sL = const_cast<const SPUString *>(
110 static_cast<SPUString *>(left.pointer));
111 const SPUString *sR = const_cast<const SPUString *>(
112 static_cast<SPUString *>(right.pointer));
113 int32_t lenL = sL->fStr->length();
114 int32_t lenR = sR->fStr->length();
115 if (lenL < lenR) {
116 return -1;
117 } else if (lenL > lenR) {
118 return 1;
119 } else {
120 return sL->fStr->compare(*(sR->fStr));
121 }
122 }
123
sort(UErrorCode & status)124 void SPUStringPool::sort(UErrorCode &status) {
125 fVec->sort(SPUStringCompare, status);
126 }
127
128
addString(UnicodeString * src,UErrorCode & status)129 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
130 SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
131 if (hashedString != NULL) {
132 delete src;
133 } else {
134 hashedString = new SPUString(src);
135 uhash_put(fHash, src, hashedString, &status);
136 fVec->addElement(hashedString, status);
137 }
138 return hashedString;
139 }
140
141
142
ConfusabledataBuilder(SpoofImpl * spImpl,UErrorCode & status)143 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
144 fSpoofImpl(spImpl),
145 fInput(NULL),
146 fSLTable(NULL),
147 fSATable(NULL),
148 fMLTable(NULL),
149 fMATable(NULL),
150 fKeySet(NULL),
151 fKeyVec(NULL),
152 fValueVec(NULL),
153 fStringTable(NULL),
154 fStringLengthsTable(NULL),
155 stringPool(NULL),
156 fParseLine(NULL),
157 fParseHexNum(NULL),
158 fLineNum(0)
159 {
160 if (U_FAILURE(status)) {
161 return;
162 }
163 fSLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
164 fSATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
165 fMLTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
166 fMATable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
167 fKeySet = new UnicodeSet();
168 fKeyVec = new UVector(status);
169 fValueVec = new UVector(status);
170 stringPool = new SPUStringPool(status);
171 }
172
173
~ConfusabledataBuilder()174 ConfusabledataBuilder::~ConfusabledataBuilder() {
175 uprv_free(fInput);
176 uregex_close(fParseLine);
177 uregex_close(fParseHexNum);
178 uhash_close(fSLTable);
179 uhash_close(fSATable);
180 uhash_close(fMLTable);
181 uhash_close(fMATable);
182 delete fKeySet;
183 delete fKeyVec;
184 delete fStringTable;
185 delete fStringLengthsTable;
186 delete fValueVec;
187 delete stringPool;
188 }
189
190
buildConfusableData(SpoofImpl * spImpl,const char * confusables,int32_t confusablesLen,int32_t * errorType,UParseError * pe,UErrorCode & status)191 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
192 int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
193
194 if (U_FAILURE(status)) {
195 return;
196 }
197 ConfusabledataBuilder builder(spImpl, status);
198 builder.build(confusables, confusablesLen, status);
199 if (U_FAILURE(status) && errorType != NULL) {
200 *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
201 pe->line = builder.fLineNum;
202 }
203 }
204
205
build(const char * confusables,int32_t confusablesLen,UErrorCode & status)206 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
207 UErrorCode &status) {
208
209 // Convert the user input data from UTF-8 to UChar (UTF-16)
210 int32_t inputLen = 0;
211 if (U_FAILURE(status)) {
212 return;
213 }
214 u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
215 if (status != U_BUFFER_OVERFLOW_ERROR) {
216 return;
217 }
218 status = U_ZERO_ERROR;
219 fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
220 if (fInput == NULL) {
221 status = U_MEMORY_ALLOCATION_ERROR;
222 return;
223 }
224 u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
225
226
227 // Regular Expression to parse a line from Confusables.txt. The expression will match
228 // any line. What was matched is determined by examining which capture groups have a match.
229 // Capture Group 1: the source char
230 // Capture Group 2: the replacement chars
231 // Capture Group 3-6 the table type, SL, SA, ML, or MA
232 // Capture Group 7: A blank or comment only line.
233 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
234 // Example Line from the confusables.txt source file:
235 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
236 UnicodeString pattern(
237 "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
238 "[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
239 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
240 "\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
241 "[ \\t]*(?:#.*?)?$" // Match any trailing #comment
242 "|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
243 "|^(.*?)$", -1, US_INV); // OR match any line, which catches illegal lines.
244 // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
245 fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
246
247 // Regular expression for parsing a hex number out of a space-separated list of them.
248 // Capture group 1 gets the number, with spaces removed.
249 pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
250 fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
251
252 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
253 // given the syntax of the input.
254 if (*fInput == 0xfeff) {
255 *fInput = 0x20;
256 }
257
258 // Parse the input, one line per iteration of this loop.
259 uregex_setText(fParseLine, fInput, inputLen, &status);
260 while (uregex_findNext(fParseLine, &status)) {
261 fLineNum++;
262 if (uregex_start(fParseLine, 7, &status) >= 0) {
263 // this was a blank or comment line.
264 continue;
265 }
266 if (uregex_start(fParseLine, 8, &status) >= 0) {
267 // input file syntax error.
268 status = U_PARSE_ERROR;
269 return;
270 }
271
272 // We have a good input line. Extract the key character and mapping string, and
273 // put them into the appropriate mapping table.
274 UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
275 uregex_end(fParseLine, 1, &status), status);
276
277 int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
278 int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
279 uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
280
281 UnicodeString *mapString = new UnicodeString();
282 if (mapString == NULL) {
283 status = U_MEMORY_ALLOCATION_ERROR;
284 return;
285 }
286 while (uregex_findNext(fParseHexNum, &status)) {
287 UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
288 uregex_end(fParseHexNum, 1, &status), status);
289 mapString->append(c);
290 }
291 U_ASSERT(mapString->length() >= 1);
292
293 // Put the map (value) string into the string pool
294 // This a little like a Java intern() - any duplicates will be eliminated.
295 SPUString *smapString = stringPool->addString(mapString, status);
296
297 // Add the UChar32 -> string mapping to the appropriate table.
298 UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
299 uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
300 uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
301 uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
302 NULL;
303 if (U_SUCCESS(status) && table == NULL) {
304 status = U_PARSE_ERROR;
305 }
306 if (U_FAILURE(status)) {
307 return;
308 }
309
310 // For Unicode 8, the SL, SA and ML tables have been discontinued.
311 // All input data from confusables.txt is tagged MA.
312 // ICU spoof check functions should ignore the specified table and always
313 // use this MA Data.
314 // For now, implement by populating the MA data into all four tables, and
315 // keep the multiple table implementation in place, in case it comes back
316 // at some time in the future.
317 // There is no run time size penalty to keeping the four table implementation -
318 // the data is shared when it's the same betweeen tables.
319 if (table != fMATable) {
320 status = U_PARSE_ERROR;
321 return;
322 };
323 // uhash_iput(table, keyChar, smapString, &status);
324 uhash_iput(fSLTable, keyChar, smapString, &status);
325 uhash_iput(fSATable, keyChar, smapString, &status);
326 uhash_iput(fMLTable, keyChar, smapString, &status);
327 uhash_iput(fMATable, keyChar, smapString, &status);
328 fKeySet->add(keyChar);
329 if (U_FAILURE(status)) {
330 return;
331 }
332 }
333
334 // Input data is now all parsed and collected.
335 // Now create the run-time binary form of the data.
336 //
337 // This is done in two steps. First the data is assembled into vectors and strings,
338 // for ease of construction, then the contents of these collections are dumped
339 // into the actual raw-bytes data storage.
340
341 // Build up the string array, and record the index of each string therein
342 // in the (build time only) string pool.
343 // Strings of length one are not entered into the strings array.
344 // At the same time, build up the string lengths table, which records the
345 // position in the string table of the first string of each length >= 4.
346 // (Strings in the table are sorted by length)
347 stringPool->sort(status);
348 fStringTable = new UnicodeString();
349 fStringLengthsTable = new UVector(status);
350 int32_t previousStringLength = 0;
351 int32_t previousStringIndex = 0;
352 int32_t poolSize = stringPool->size();
353 int32_t i;
354 for (i=0; i<poolSize; i++) {
355 SPUString *s = stringPool->getByIndex(i);
356 int32_t strLen = s->fStr->length();
357 int32_t strIndex = fStringTable->length();
358 U_ASSERT(strLen >= previousStringLength);
359 if (strLen == 1) {
360 // strings of length one do not get an entry in the string table.
361 // Keep the single string character itself here, which is the same
362 // convention that is used in the final run-time string table index.
363 s->fStrTableIndex = s->fStr->charAt(0);
364 } else {
365 if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
366 fStringLengthsTable->addElement(previousStringIndex, status);
367 fStringLengthsTable->addElement(previousStringLength, status);
368 }
369 s->fStrTableIndex = strIndex;
370 fStringTable->append(*(s->fStr));
371 }
372 previousStringLength = strLen;
373 previousStringIndex = strIndex;
374 }
375 // Make the final entry to the string lengths table.
376 // (it holds an entry for the _last_ string of each length, so adding the
377 // final one doesn't happen in the main loop because no longer string was encountered.)
378 if (previousStringLength >= 4) {
379 fStringLengthsTable->addElement(previousStringIndex, status);
380 fStringLengthsTable->addElement(previousStringLength, status);
381 }
382
383 // Construct the compile-time Key and Value tables
384 //
385 // For each key code point, check which mapping tables it applies to,
386 // and create the final data for the key & value structures.
387 //
388 // The four logical mapping tables are conflated into one combined table.
389 // If multiple logical tables have the same mapping for some key, they
390 // share a single entry in the combined table.
391 // If more than one mapping exists for the same key code point, multiple
392 // entries will be created in the table
393
394 for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
395 // It is an oddity of the UnicodeSet API that simply enumerating the contained
396 // code points requires a nested loop.
397 for (UChar32 keyChar=fKeySet->getRangeStart(range);
398 keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
399 addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
400 addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
401 addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
402 addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
403 }
404 }
405
406 // Put the assembled data into the flat runtime array
407 outputData(status);
408
409 // All of the intermediate allocated data belongs to the ConfusabledataBuilder
410 // object (this), and is deleted in the destructor.
411 return;
412 }
413
414 //
415 // outputData The confusable data has been compiled and stored in intermediate
416 // collections and strings. Copy it from there to the final flat
417 // binary array.
418 //
419 // Note that as each section is added to the output data, the
//               expand (reserveSpace()) function will likely relocate it in memory.
421 // Be careful with pointers.
422 //
outputData(UErrorCode & status)423 void ConfusabledataBuilder::outputData(UErrorCode &status) {
424
425 U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
426
427 // The Key Table
428 // While copying the keys to the runtime array,
429 // also sanity check that they are sorted.
430
431 int32_t numKeys = fKeyVec->size();
432 int32_t *keys =
433 static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
434 if (U_FAILURE(status)) {
435 return;
436 }
437 int i;
438 int32_t previousKey = 0;
439 for (i=0; i<numKeys; i++) {
440 int32_t key = fKeyVec->elementAti(i);
441 (void)previousKey; // Suppress unused variable warning on gcc.
442 U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
443 U_ASSERT((key & 0xff000000) != 0);
444 keys[i] = key;
445 previousKey = key;
446 }
447 SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
448 rawData->fCFUKeys = (int32_t)((char *)keys - (char *)rawData);
449 rawData->fCFUKeysSize = numKeys;
450 fSpoofImpl->fSpoofData->fCFUKeys = keys;
451
452
453 // The Value Table, parallels the key table
454 int32_t numValues = fValueVec->size();
455 U_ASSERT(numKeys == numValues);
456 uint16_t *values =
457 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
458 if (U_FAILURE(status)) {
459 return;
460 }
461 for (i=0; i<numValues; i++) {
462 uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
463 U_ASSERT(value < 0xffff);
464 values[i] = static_cast<uint16_t>(value);
465 }
466 rawData = fSpoofImpl->fSpoofData->fRawData;
467 rawData->fCFUStringIndex = (int32_t)((char *)values - (char *)rawData);
468 rawData->fCFUStringIndexSize = numValues;
469 fSpoofImpl->fSpoofData->fCFUValues = values;
470
471 // The Strings Table.
472
473 uint32_t stringsLength = fStringTable->length();
474 // Reserve an extra space so the string will be nul-terminated. This is
475 // only a convenience, for when debugging; it is not needed otherwise.
476 UChar *strings =
477 static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
478 if (U_FAILURE(status)) {
479 return;
480 }
481 fStringTable->extract(strings, stringsLength+1, status);
482 rawData = fSpoofImpl->fSpoofData->fRawData;
483 U_ASSERT(rawData->fCFUStringTable == 0);
484 rawData->fCFUStringTable = (int32_t)((char *)strings - (char *)rawData);
485 rawData->fCFUStringTableLen = stringsLength;
486 fSpoofImpl->fSpoofData->fCFUStrings = strings;
487
488 // The String Lengths Table
489 // While copying into the runtime array do some sanity checks on the values
490 // Each complete entry contains two fields, an index and an offset.
491 // Lengths should increase with each entry.
492 // Offsets should be less than the size of the string table.
493 int32_t lengthTableLength = fStringLengthsTable->size();
494 uint16_t *stringLengths =
495 static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
496 if (U_FAILURE(status)) {
497 return;
498 }
499 int32_t destIndex = 0;
500 uint32_t previousLength = 0;
501 for (i=0; i<lengthTableLength; i+=2) {
502 uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
503 uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
504 U_ASSERT(offset < stringsLength);
505 U_ASSERT(length < 40);
506 (void)previousLength; // Suppress unused variable warning on gcc.
507 U_ASSERT(length > previousLength);
508 stringLengths[destIndex++] = static_cast<uint16_t>(offset);
509 stringLengths[destIndex++] = static_cast<uint16_t>(length);
510 previousLength = length;
511 }
512 rawData = fSpoofImpl->fSpoofData->fRawData;
513 rawData->fCFUStringLengths = (int32_t)((char *)stringLengths - (char *)rawData);
514 // Note: StringLengthsSize in the raw data is the number of complete entries,
515 // each consisting of a pair of 16 bit values, hence the divide by 2.
516 rawData->fCFUStringLengthsSize = lengthTableLength / 2;
517 fSpoofImpl->fSpoofData->fCFUStringLengths =
518 reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
519 }
520
521
522
523 // addKeyEntry Construction of the confusable Key and Mapping Values tables.
524 // This is an intermediate point in the building process.
525 // We already have the mappings in the hash tables fSLTable, etc.
526 // This function builds corresponding run-time style table entries into
527 // fKeyVec and fValueVec
528
addKeyEntry(UChar32 keyChar,UHashtable * table,int32_t tableFlag,UErrorCode & status)529 void ConfusabledataBuilder::addKeyEntry(
530 UChar32 keyChar, // The key character
531 UHashtable *table, // The table, one of SATable, MATable, etc.
532 int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
533 UErrorCode &status) {
534
535 SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
536 if (targetMapping == NULL) {
537 // No mapping for this key character.
538 // (This function is called for all four tables for each key char that
539 // is seen anywhere, so this no entry cases are very much expected.)
540 return;
541 }
542
543 // Check whether there is already an entry with the correct mapping.
544 // If so, simply set the flag in the keyTable saying that the existing entry
545 // applies to the table that we're doing now.
546
547 UBool keyHasMultipleValues = FALSE;
548 int32_t i;
549 for (i=fKeyVec->size()-1; i>=0 ; i--) {
550 int32_t key = fKeyVec->elementAti(i);
551 if ((key & 0x0ffffff) != keyChar) {
552 // We have now checked all existing key entries for this key char (if any)
553 // without finding one with the same mapping.
554 break;
555 }
556 UnicodeString mapping = getMapping(i);
557 if (mapping == *(targetMapping->fStr)) {
558 // The run time entry we are currently testing has the correct mapping.
559 // Set the flag in it indicating that it applies to the new table also.
560 key |= tableFlag;
561 fKeyVec->setElementAt(key, i);
562 return;
563 }
564 keyHasMultipleValues = TRUE;
565 }
566
567 // Need to add a new entry to the binary data being built for this mapping.
568 // Includes adding entries to both the key table and the parallel values table.
569
570 int32_t newKey = keyChar | tableFlag;
571 if (keyHasMultipleValues) {
572 newKey |= USPOOF_KEY_MULTIPLE_VALUES;
573 }
574 int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
575 if (adjustedMappingLength>3) {
576 adjustedMappingLength = 3;
577 }
578 newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
579
580 int32_t newData = targetMapping->fStrTableIndex;
581
582 fKeyVec->addElement(newKey, status);
583 fValueVec->addElement(newData, status);
584
585 // If the preceding key entry is for the same key character (but with a different mapping)
586 // set the multiple-values flag on it.
587 if (keyHasMultipleValues) {
588 int32_t previousKeyIndex = fKeyVec->size() - 2;
589 int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
590 previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
591 fKeyVec->setElementAt(previousKey, previousKeyIndex);
592 }
593 }
594
595
596
getMapping(int32_t index)597 UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
598 int32_t key = fKeyVec->elementAti(index);
599 int32_t value = fValueVec->elementAti(index);
600 int32_t length = USPOOF_KEY_LENGTH_FIELD(key);
601 int32_t lastIndexWithLen;
602 switch (length) {
603 case 0:
604 return UnicodeString(static_cast<UChar>(value));
605 case 1:
606 case 2:
607 return UnicodeString(*fStringTable, value, length+1);
608 case 3:
609 length = 0;
610 int32_t i;
611 for (i=0; i<fStringLengthsTable->size(); i+=2) {
612 lastIndexWithLen = fStringLengthsTable->elementAti(i);
613 if (value <= lastIndexWithLen) {
614 length = fStringLengthsTable->elementAti(i+1);
615 break;
616 }
617 }
618 U_ASSERT(length>=3);
619 return UnicodeString(*fStringTable, value, length);
620 default:
621 U_ASSERT(FALSE);
622 }
623 return UnicodeString();
624 }
625
626 #endif
627 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
628
629