• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (c) 2002-2009, International Business Machines Corporation
4 *   and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   01/14/2002  aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "tridpars.h"
16 #include "hash.h"
17 #include "mutex.h"
18 #include "ucln_in.h"
19 #include "unicode/parsepos.h"
20 #include "unicode/translit.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/utrans.h"
25 #include "util.h"
26 #include "uvector.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 static const UChar ID_DELIM    = 0x003B; // ;
31 static const UChar TARGET_SEP  = 0x002D; // -
32 static const UChar VARIANT_SEP = 0x002F; // /
33 static const UChar OPEN_REV    = 0x0028; // (
34 static const UChar CLOSE_REV   = 0x0029; // )
35 
36 //static const UChar EMPTY[]     = {0}; // ""
37 static const UChar ANY[]       = {65,110,121,0}; // "Any"
38 static const UChar ANY_NULL[]  = {65,110,121,45,78,117,108,108,0}; // "Any-Null"
39 
40 static const int32_t FORWARD = UTRANS_FORWARD;
41 static const int32_t REVERSE = UTRANS_REVERSE;
42 
43 static Hashtable* SPECIAL_INVERSES = NULL;
44 
45 /**
46  * The mutex controlling access to SPECIAL_INVERSES
47  */
48 static UMTX LOCK = 0;
49 
Specs(const UnicodeString & s,const UnicodeString & t,const UnicodeString & v,UBool sawS,const UnicodeString & f)50 TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t,
51                                      const UnicodeString& v, UBool sawS,
52                                      const UnicodeString& f) {
53     source = s;
54     target = t;
55     variant = v;
56     sawSource = sawS;
57     filter = f;
58 }
59 
SingleID(const UnicodeString & c,const UnicodeString & b,const UnicodeString & f)60 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b,
61                                            const UnicodeString& f) {
62     canonID = c;
63     basicID = b;
64     filter = f;
65 }
66 
SingleID(const UnicodeString & c,const UnicodeString & b)67 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) {
68     canonID = c;
69     basicID = b;
70 }
71 
createInstance()72 Transliterator* TransliteratorIDParser::SingleID::createInstance() {
73     Transliterator* t;
74     if (basicID.length() == 0) {
75         t = createBasicInstance(ANY_NULL, &canonID);
76     } else {
77         t = createBasicInstance(basicID, &canonID);
78     }
79     if (t != NULL) {
80         if (filter.length() != 0) {
81             UErrorCode ec = U_ZERO_ERROR;
82             UnicodeSet *set = new UnicodeSet(filter, ec);
83             if (U_FAILURE(ec)) {
84                 delete set;
85             } else {
86                 t->adoptFilter(set);
87             }
88         }
89     }
90     return t;
91 }
92 
93 
94 /**
95  * Parse a single ID, that is, an ID of the general form
96  * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
97  * optional, the filters optional, and the variants optional.
98  * @param id the id to be parsed
99  * @param pos INPUT-OUTPUT parameter.  On input, the position of
100  * the first character to parse.  On output, the position after
101  * the last character parsed.
102  * @param dir the direction.  If the direction is REVERSE then the
103  * SingleID is constructed for the reverse direction.
104  * @return a SingleID object or NULL
105  */
106 TransliteratorIDParser::SingleID*
parseSingleID(const UnicodeString & id,int32_t & pos,int32_t dir,UErrorCode & status)107 TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos,
108                                       int32_t dir, UErrorCode& status) {
109 
110     int32_t start = pos;
111 
112     // The ID will be of the form A, A(), A(B), or (B), where
113     // A and B are filter IDs.
114     Specs* specsA = NULL;
115     Specs* specsB = NULL;
116     UBool sawParen = FALSE;
117 
118     // On the first pass, look for (B) or ().  If this fails, then
119     // on the second pass, look for A, A(B), or A().
120     for (int32_t pass=1; pass<=2; ++pass) {
121         if (pass == 2) {
122             specsA = parseFilterID(id, pos, TRUE);
123             if (specsA == NULL) {
124                 pos = start;
125                 return NULL;
126             }
127         }
128         if (ICU_Utility::parseChar(id, pos, OPEN_REV)) {
129             sawParen = TRUE;
130             if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
131                 specsB = parseFilterID(id, pos, TRUE);
132                 // Must close with a ')'
133                 if (specsB == NULL || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
134                     delete specsA;
135                     pos = start;
136                     return NULL;
137                 }
138             }
139             break;
140         }
141     }
142 
143     // Assemble return results
144     SingleID* single;
145     if (sawParen) {
146         if (dir == FORWARD) {
147             SingleID* b = specsToID(specsB, FORWARD);
148             single = specsToID(specsA, FORWARD);
149             // Null pointers check
150             if (b == NULL || single == NULL) {
151             	delete b;
152             	delete single;
153             	status = U_MEMORY_ALLOCATION_ERROR;
154             	return NULL;
155             }
156             single->canonID.append(OPEN_REV)
157                 .append(b->canonID).append(CLOSE_REV);
158             if (specsA != NULL) {
159                 single->filter = specsA->filter;
160             }
161             delete b;
162         } else {
163             SingleID* a = specsToID(specsA, FORWARD);
164             single = specsToID(specsB, FORWARD);
165             // Check for null pointer.
166             if (a == NULL || single == NULL) {
167             	delete a;
168             	delete single;
169             	status = U_MEMORY_ALLOCATION_ERROR;
170             	return NULL;
171             }
172             single->canonID.append(OPEN_REV)
173                 .append(a->canonID).append(CLOSE_REV);
174             if (specsB != NULL) {
175                 single->filter = specsB->filter;
176             }
177             delete a;
178         }
179     } else {
180         // assert(specsA != NULL);
181         if (dir == FORWARD) {
182             single = specsToID(specsA, FORWARD);
183         } else {
184             single = specsToSpecialInverse(*specsA, status);
185             if (single == NULL) {
186                 single = specsToID(specsA, REVERSE);
187             }
188         }
189         // Check for NULL pointer
190         if (single == NULL) {
191         	status = U_MEMORY_ALLOCATION_ERROR;
192         	return NULL;
193         }
194         single->filter = specsA->filter;
195     }
196 
197     delete specsA;
198     delete specsB;
199 
200     return single;
201 }
202 
203 /**
204  * Parse a filter ID, that is, an ID of the general form
205  * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
206  * @param id the id to be parsed
207  * @param pos INPUT-OUTPUT parameter.  On input, the position of
208  * the first character to parse.  On output, the position after
209  * the last character parsed.
210  * @return a SingleID object or null if the parse fails
211  */
212 TransliteratorIDParser::SingleID*
parseFilterID(const UnicodeString & id,int32_t & pos)213 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) {
214 
215     int32_t start = pos;
216 
217     Specs* specs = parseFilterID(id, pos, TRUE);
218     if (specs == NULL) {
219         pos = start;
220         return NULL;
221     }
222 
223     // Assemble return results
224     SingleID* single = specsToID(specs, FORWARD);
225     if (single != NULL) {
226         single->filter = specs->filter;
227     }
228     delete specs;
229     return single;
230 }
231 
232 /**
233  * Parse a global filter of the form "[f]" or "([f])", depending
234  * on 'withParens'.
235  * @param id the pattern the parse
236  * @param pos INPUT-OUTPUT parameter.  On input, the position of
237  * the first character to parse.  On output, the position after
238  * the last character parsed.
239  * @param dir the direction.
240  * @param withParens INPUT-OUTPUT parameter.  On entry, if
241  * withParens is 0, then parens are disallowed.  If it is 1,
242  * then parens are requires.  If it is -1, then parens are
243  * optional, and the return result will be set to 0 or 1.
244  * @param canonID OUTPUT parameter.  The pattern for the filter
245  * added to the canonID, either at the end, if dir is FORWARD, or
246  * at the start, if dir is REVERSE.  The pattern will be enclosed
247  * in parentheses if appropriate, and will be suffixed with an
248  * ID_DELIM character.  May be NULL.
249  * @return a UnicodeSet object or NULL.  A non-NULL results
250  * indicates a successful parse, regardless of whether the filter
251  * applies to the given direction.  The caller should discard it
252  * if withParens != (dir == REVERSE).
253  */
parseGlobalFilter(const UnicodeString & id,int32_t & pos,int32_t dir,int32_t & withParens,UnicodeString * canonID)254 UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos,
255                                                       int32_t dir,
256                                                       int32_t& withParens,
257                                                       UnicodeString* canonID) {
258     UnicodeSet* filter = NULL;
259     int32_t start = pos;
260 
261     if (withParens == -1) {
262         withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0;
263     } else if (withParens == 1) {
264         if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) {
265             pos = start;
266             return NULL;
267         }
268     }
269 
270     ICU_Utility::skipWhitespace(id, pos, TRUE);
271 
272     if (UnicodeSet::resemblesPattern(id, pos)) {
273         ParsePosition ppos(pos);
274         UErrorCode ec = U_ZERO_ERROR;
275         filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, NULL, ec);
276         /* test for NULL */
277         if (filter == 0) {
278             pos = start;
279             return 0;
280         }
281         if (U_FAILURE(ec)) {
282             delete filter;
283             pos = start;
284             return NULL;
285         }
286 
287         UnicodeString pattern;
288         id.extractBetween(pos, ppos.getIndex(), pattern);
289         pos = ppos.getIndex();
290 
291         if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) {
292             pos = start;
293             return NULL;
294         }
295 
296         // In the forward direction, append the pattern to the
297         // canonID.  In the reverse, insert it at zero, and invert
298         // the presence of parens ("A" <-> "(A)").
299         if (canonID != NULL) {
300             if (dir == FORWARD) {
301                 if (withParens == 1) {
302                     pattern.insert(0, OPEN_REV);
303                     pattern.append(CLOSE_REV);
304                 }
305                 canonID->append(pattern).append(ID_DELIM);
306             } else {
307                 if (withParens == 0) {
308                     pattern.insert(0, OPEN_REV);
309                     pattern.append(CLOSE_REV);
310                 }
311                 canonID->insert(0, pattern);
312                 canonID->insert(pattern.length(), ID_DELIM);
313             }
314         }
315     }
316 
317     return filter;
318 }
319 
320 U_CDECL_BEGIN
_deleteSingleID(void * obj)321 static void U_CALLCONV _deleteSingleID(void* obj) {
322     delete (TransliteratorIDParser::SingleID*) obj;
323 }
324 
_deleteTransliteratorTrIDPars(void * obj)325 static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) {
326     delete (Transliterator*) obj;
327 }
328 U_CDECL_END
329 
330 /**
331  * Parse a compound ID, consisting of an optional forward global
332  * filter, a separator, one or more single IDs delimited by
333  * separators, an an optional reverse global filter.  The
334  * separator is a semicolon.  The global filters are UnicodeSet
335  * patterns.  The reverse global filter must be enclosed in
336  * parentheses.
337  * @param id the pattern the parse
338  * @param dir the direction.
339  * @param canonID OUTPUT parameter that receives the canonical ID,
340  * consisting of canonical IDs for all elements, as returned by
341  * parseSingleID(), separated by semicolons.  Previous contents
342  * are discarded.
343  * @param list OUTPUT parameter that receives a list of SingleID
344  * objects representing the parsed IDs.  Previous contents are
345  * discarded.
346  * @param globalFilter OUTPUT parameter that receives a pointer to
347  * a newly created global filter for this ID in this direction, or
348  * NULL if there is none.
349  * @return TRUE if the parse succeeds, that is, if the entire
350  * id is consumed without syntax error.
351  */
parseCompoundID(const UnicodeString & id,int32_t dir,UnicodeString & canonID,UVector & list,UnicodeSet * & globalFilter)352 UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir,
353                                               UnicodeString& canonID,
354                                               UVector& list,
355                                               UnicodeSet*& globalFilter) {
356     UErrorCode ec = U_ZERO_ERROR;
357     int32_t i;
358     int32_t pos = 0;
359     int32_t withParens = 1;
360     list.removeAllElements();
361     UnicodeSet* filter;
362     globalFilter = NULL;
363     canonID.truncate(0);
364 
365     // Parse leading global filter, if any
366     withParens = 0; // parens disallowed
367     filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
368     if (filter != NULL) {
369         if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
370             // Not a global filter; backup and resume
371             canonID.truncate(0);
372             pos = 0;
373         }
374         if (dir == FORWARD) {
375             globalFilter = filter;
376         } else {
377             delete filter;
378         }
379         filter = NULL;
380     }
381 
382     UBool sawDelimiter = TRUE;
383     for (;;) {
384         SingleID* single = parseSingleID(id, pos, dir, ec);
385         if (single == NULL) {
386             break;
387         }
388         if (dir == FORWARD) {
389             list.addElement(single, ec);
390         } else {
391             list.insertElementAt(single, 0, ec);
392         }
393         if (U_FAILURE(ec)) {
394             goto FAIL;
395         }
396         if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) {
397             sawDelimiter = FALSE;
398             break;
399         }
400     }
401 
402     if (list.size() == 0) {
403         goto FAIL;
404     }
405 
406     // Construct canonical ID
407     for (i=0; i<list.size(); ++i) {
408         SingleID* single = (SingleID*) list.elementAt(i);
409         canonID.append(single->canonID);
410         if (i != (list.size()-1)) {
411             canonID.append(ID_DELIM);
412         }
413     }
414 
415     // Parse trailing global filter, if any, and only if we saw
416     // a trailing delimiter after the IDs.
417     if (sawDelimiter) {
418         withParens = 1; // parens required
419         filter = parseGlobalFilter(id, pos, dir, withParens, &canonID);
420         if (filter != NULL) {
421             // Don't require trailing ';', but parse it if present
422             ICU_Utility::parseChar(id, pos, ID_DELIM);
423 
424             if (dir == REVERSE) {
425                 globalFilter = filter;
426             } else {
427                 delete filter;
428             }
429             filter = NULL;
430         }
431     }
432 
433     // Trailing unparsed text is a syntax error
434     ICU_Utility::skipWhitespace(id, pos, TRUE);
435     if (pos != id.length()) {
436         goto FAIL;
437     }
438 
439     return TRUE;
440 
441  FAIL:
442     UObjectDeleter *save = list.setDeleter(_deleteSingleID);
443     list.removeAllElements();
444     list.setDeleter(save);
445     delete globalFilter;
446     globalFilter = NULL;
447     return FALSE;
448 }
449 
450 /**
451  * Convert the elements of the 'list' vector, which are SingleID
452  * objects, into actual Transliterator objects.  In the course of
453  * this, some (or all) entries may be removed.  If all entries
454  * are removed, the NULL transliterator will be added.
455  *
456  * Delete entries with empty basicIDs; these are generated by
457  * elements like "(A)" in the forward direction, or "A()" in
458  * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
459  * SingleID entries to actual transliterators.
460  *
461  * @param list vector of SingleID objects.  On exit, vector
462  * of one or more Transliterators.
463  * @return new value of insertIndex.  The index will shift if
464  * there are empty items, like "(Lower)", with indices less than
465  * insertIndex.
466  */
instantiateList(UVector & list,UErrorCode & ec)467 void TransliteratorIDParser::instantiateList(UVector& list,
468                                                 UErrorCode& ec) {
469     UVector tlist(ec);
470     if (U_FAILURE(ec)) {
471         goto RETURN;
472     }
473     tlist.setDeleter(_deleteTransliteratorTrIDPars);
474 
475     Transliterator* t;
476     int32_t i;
477     for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size()
478         // We run the loop too long by one, so we can
479         // do an insert after the last element
480         if (i==list.size()) {
481             break;
482         }
483 
484         SingleID* single = (SingleID*) list.elementAt(i);
485         if (single->basicID.length() != 0) {
486             t = single->createInstance();
487             if (t == NULL) {
488                 ec = U_INVALID_ID;
489                 goto RETURN;
490             }
491             tlist.addElement(t, ec);
492             if (U_FAILURE(ec)) {
493                 delete t;
494                 goto RETURN;
495             }
496         }
497     }
498 
499     // An empty list is equivalent to a NULL transliterator.
500     if (tlist.size() == 0) {
501         t = createBasicInstance(ANY_NULL, NULL);
502         if (t == NULL) {
503             // Should never happen
504             ec = U_INTERNAL_TRANSLITERATOR_ERROR;
505         }
506         tlist.addElement(t, ec);
507         if (U_FAILURE(ec)) {
508             delete t;
509         }
510     }
511 
512  RETURN:
513 
514     UObjectDeleter *save = list.setDeleter(_deleteSingleID);
515     list.removeAllElements();
516 
517     if (U_SUCCESS(ec)) {
518         list.setDeleter(_deleteTransliteratorTrIDPars);
519 
520         while (tlist.size() > 0) {
521             t = (Transliterator*) tlist.orphanElementAt(0);
522             list.addElement(t, ec);
523             if (U_FAILURE(ec)) {
524                 delete t;
525                 list.removeAllElements();
526                 break;
527             }
528         }
529     }
530 
531     list.setDeleter(save);
532 }
533 
534 /**
535  * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
536  * S-T/V, or S/V-T.  If the source is missing, return a source of
537  * ANY.
538  * @param id the id string, in any of several forms
539  * @return an array of 4 strings: source, target, variant, and
540  * isSourcePresent.  If the source is not present, ANY will be
541  * given as the source, and isSourcePresent will be NULL.  Otherwise
542  * isSourcePresent will be non-NULL.  The target may be empty if the
543  * id is not well-formed.  The variant may be empty.
544  */
IDtoSTV(const UnicodeString & id,UnicodeString & source,UnicodeString & target,UnicodeString & variant,UBool & isSourcePresent)545 void TransliteratorIDParser::IDtoSTV(const UnicodeString& id,
546                                      UnicodeString& source,
547                                      UnicodeString& target,
548                                      UnicodeString& variant,
549                                      UBool& isSourcePresent) {
550     source = ANY;
551     target.truncate(0);
552     variant.truncate(0);
553 
554     int32_t sep = id.indexOf(TARGET_SEP);
555     int32_t var = id.indexOf(VARIANT_SEP);
556     if (var < 0) {
557         var = id.length();
558     }
559     isSourcePresent = FALSE;
560 
561     if (sep < 0) {
562         // Form: T/V or T (or /V)
563         id.extractBetween(0, var, target);
564         id.extractBetween(var, id.length(), variant);
565     } else if (sep < var) {
566         // Form: S-T/V or S-T (or -T/V or -T)
567         if (sep > 0) {
568             id.extractBetween(0, sep, source);
569             isSourcePresent = TRUE;
570         }
571         id.extractBetween(++sep, var, target);
572         id.extractBetween(var, id.length(), variant);
573     } else {
574         // Form: (S/V-T or /V-T)
575         if (var > 0) {
576             id.extractBetween(0, var, source);
577             isSourcePresent = TRUE;
578         }
579         id.extractBetween(var, sep++, variant);
580         id.extractBetween(sep, id.length(), target);
581     }
582 
583     if (variant.length() > 0) {
584         variant.remove(0, 1);
585     }
586 }
587 
588 /**
589  * Given source, target, and variant strings, concatenate them into a
590  * full ID.  If the source is empty, then "Any" will be used for the
591  * source, so the ID will always be of the form s-t/v or s-t.
592  */
STVtoID(const UnicodeString & source,const UnicodeString & target,const UnicodeString & variant,UnicodeString & id)593 void TransliteratorIDParser::STVtoID(const UnicodeString& source,
594                                      const UnicodeString& target,
595                                      const UnicodeString& variant,
596                                      UnicodeString& id) {
597     id = source;
598     if (id.length() == 0) {
599         id = ANY;
600     }
601     id.append(TARGET_SEP).append(target);
602     if (variant.length() != 0) {
603         id.append(VARIANT_SEP).append(variant);
604     }
605     // NUL-terminate the ID string for getTerminatedBuffer.
606     // This prevents valgrind and Purify warnings.
607     id.append((UChar)0);
608     id.truncate(id.length()-1);
609 }
610 
611 /**
612  * Register two targets as being inverses of one another.  For
613  * example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes
614  * Transliterator to form the following inverse relationships:
615  *
616  * <pre>NFC => NFD
617  * Any-NFC => Any-NFD
618  * NFD => NFC
619  * Any-NFD => Any-NFC</pre>
620  *
621  * (Without the special inverse registration, the inverse of NFC
622  * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
623  * that the presence or absence of "Any-" is preserved.
624  *
625  * <p>The relationship is symmetrical; registering (a, b) is
626  * equivalent to registering (b, a).
627  *
628  * <p>The relevant IDs must still be registered separately as
629  * factories or classes.
630  *
631  * <p>Only the targets are specified.  Special inverses always
632  * have the form Any-Target1 <=> Any-Target2.  The target should
633  * have canonical casing (the casing desired to be produced when
634  * an inverse is formed) and should contain no whitespace or other
635  * extraneous characters.
636  *
637  * @param target the target against which to register the inverse
638  * @param inverseTarget the inverse of target, that is
639  * Any-target.getInverse() => Any-inverseTarget
640  * @param bidirectional if TRUE, register the reverse relation
641  * as well, that is, Any-inverseTarget.getInverse() => Any-target
642  */
registerSpecialInverse(const UnicodeString & target,const UnicodeString & inverseTarget,UBool bidirectional,UErrorCode & status)643 void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target,
644                                                     const UnicodeString& inverseTarget,
645                                                     UBool bidirectional,
646                                                     UErrorCode &status) {
647     init(status);
648     if (U_FAILURE(status)) {
649         return;
650     }
651 
652     // If target == inverseTarget then force bidirectional => FALSE
653     if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) {
654         bidirectional = FALSE;
655     }
656 
657     Mutex lock(&LOCK);
658 
659     UnicodeString *tempus = new UnicodeString(inverseTarget);  // Used for null pointer check before usage.
660     if (tempus == NULL) {
661     	status = U_MEMORY_ALLOCATION_ERROR;
662     	return;
663     }
664     SPECIAL_INVERSES->put(target, tempus, status);
665     if (bidirectional) {
666     	tempus = new UnicodeString(target);
667     	if (tempus == NULL) {
668     		status = U_MEMORY_ALLOCATION_ERROR;
669     		return;
670     	}
671         SPECIAL_INVERSES->put(inverseTarget, tempus, status);
672     }
673 }
674 
675 //----------------------------------------------------------------
676 // Private implementation
677 //----------------------------------------------------------------
678 
679 /**
680  * Parse an ID into component pieces.  Take IDs of the form T,
681  * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
682  * source of ANY.
683  * @param id the id string, in any of several forms
684  * @param pos INPUT-OUTPUT parameter.  On input, pos is the
685  * offset of the first character to parse in id.  On output,
686  * pos is the offset after the last parsed character.  If the
687  * parse failed, pos will be unchanged.
688  * @param allowFilter2 if TRUE, a UnicodeSet pattern is allowed
689  * at any location between specs or delimiters, and is returned
690  * as the fifth string in the array.
691  * @return a Specs object, or NULL if the parse failed.  If
692  * neither source nor target was seen in the parsed id, then the
693  * parse fails.  If allowFilter is TRUE, then the parsed filter
694  * pattern is returned in the Specs object, otherwise the returned
695  * filter reference is NULL.  If the parse fails for any reason
696  * NULL is returned.
697  */
698 TransliteratorIDParser::Specs*
parseFilterID(const UnicodeString & id,int32_t & pos,UBool allowFilter)699 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos,
700                                       UBool allowFilter) {
701     UnicodeString first;
702     UnicodeString source;
703     UnicodeString target;
704     UnicodeString variant;
705     UnicodeString filter;
706     UChar delimiter = 0;
707     int32_t specCount = 0;
708     int32_t start = pos;
709 
710     // This loop parses one of the following things with each
711     // pass: a filter, a delimiter character (either '-' or '/'),
712     // or a spec (source, target, or variant).
713     for (;;) {
714         ICU_Utility::skipWhitespace(id, pos, TRUE);
715         if (pos == id.length()) {
716             break;
717         }
718 
719         // Parse filters
720         if (allowFilter && filter.length() == 0 &&
721             UnicodeSet::resemblesPattern(id, pos)) {
722 
723             ParsePosition ppos(pos);
724             UErrorCode ec = U_ZERO_ERROR;
725             UnicodeSet set(id, ppos, USET_IGNORE_SPACE, NULL, ec);
726             if (U_FAILURE(ec)) {
727                 pos = start;
728                 return NULL;
729             }
730             id.extractBetween(pos, ppos.getIndex(), filter);
731             pos = ppos.getIndex();
732             continue;
733         }
734 
735         if (delimiter == 0) {
736             UChar c = id.charAt(pos);
737             if ((c == TARGET_SEP && target.length() == 0) ||
738                 (c == VARIANT_SEP && variant.length() == 0)) {
739                 delimiter = c;
740                 ++pos;
741                 continue;
742             }
743         }
744 
745         // We are about to try to parse a spec with no delimiter
746         // when we can no longer do so (we can only do so at the
747         // start); break.
748         if (delimiter == 0 && specCount > 0) {
749             break;
750         }
751 
752         UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos);
753         if (spec.length() == 0) {
754             // Note that if there was a trailing delimiter, we
755             // consume it.  So Foo-, Foo/, Foo-Bar/, and Foo/Bar-
756             // are legal.
757             break;
758         }
759 
760         switch (delimiter) {
761         case 0:
762             first = spec;
763             break;
764         case TARGET_SEP:
765             target = spec;
766             break;
767         case VARIANT_SEP:
768             variant = spec;
769             break;
770         }
771         ++specCount;
772         delimiter = 0;
773     }
774 
775     // A spec with no prior character is either source or target,
776     // depending on whether an explicit "-target" was seen.
777     if (first.length() != 0) {
778         if (target.length() == 0) {
779             target = first;
780         } else {
781             source = first;
782         }
783     }
784 
785     // Must have either source or target
786     if (source.length() == 0 && target.length() == 0) {
787         pos = start;
788         return NULL;
789     }
790 
791     // Empty source or target defaults to ANY
792     UBool sawSource = TRUE;
793     if (source.length() == 0) {
794         source = ANY;
795         sawSource = FALSE;
796     }
797     if (target.length() == 0) {
798         target = ANY;
799     }
800 
801     return new Specs(source, target, variant, sawSource, filter);
802 }
803 
804 /**
805  * Givens a Spec object, convert it to a SingleID object.  The
806  * Spec object is a more unprocessed parse result.  The SingleID
807  * object contains information about canonical and basic IDs.
808  * @return a SingleID; never returns NULL.  Returned object always
809  * has 'filter' field of NULL.
810  */
811 TransliteratorIDParser::SingleID*
specsToID(const Specs * specs,int32_t dir)812 TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) {
813     UnicodeString canonID;
814     UnicodeString basicID;
815     UnicodeString basicPrefix;
816     if (specs != NULL) {
817         UnicodeString buf;
818         if (dir == FORWARD) {
819             if (specs->sawSource) {
820                 buf.append(specs->source).append(TARGET_SEP);
821             } else {
822                 basicPrefix = specs->source;
823                 basicPrefix.append(TARGET_SEP);
824             }
825             buf.append(specs->target);
826         } else {
827             buf.append(specs->target).append(TARGET_SEP).append(specs->source);
828         }
829         if (specs->variant.length() != 0) {
830             buf.append(VARIANT_SEP).append(specs->variant);
831         }
832         basicID = basicPrefix;
833         basicID.append(buf);
834         if (specs->filter.length() != 0) {
835             buf.insert(0, specs->filter);
836         }
837         canonID = buf;
838     }
839     return new SingleID(canonID, basicID);
840 }
841 
842 /**
843  * Given a Specs object, return a SingleID representing the
844  * special inverse of that ID.  If there is no special inverse
845  * then return NULL.
846  * @return a SingleID or NULL.  Returned object always has
847  * 'filter' field of NULL.
848  */
849 TransliteratorIDParser::SingleID*
specsToSpecialInverse(const Specs & specs,UErrorCode & status)850 TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) {
851     if (0!=specs.source.caseCompare(ANY, U_FOLD_CASE_DEFAULT)) {
852         return NULL;
853     }
854     init(status);
855 
856     UnicodeString* inverseTarget;
857 
858     umtx_lock(&LOCK);
859     inverseTarget = (UnicodeString*) SPECIAL_INVERSES->get(specs.target);
860     umtx_unlock(&LOCK);
861 
862     if (inverseTarget != NULL) {
863         // If the original ID contained "Any-" then make the
864         // special inverse "Any-Foo"; otherwise make it "Foo".
865         // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
866         UnicodeString buf;
867         if (specs.filter.length() != 0) {
868             buf.append(specs.filter);
869         }
870         if (specs.sawSource) {
871             buf.append(ANY).append(TARGET_SEP);
872         }
873         buf.append(*inverseTarget);
874 
875         UnicodeString basicID(ANY);
876         basicID.append(TARGET_SEP).append(*inverseTarget);
877 
878         if (specs.variant.length() != 0) {
879             buf.append(VARIANT_SEP).append(specs.variant);
880             basicID.append(VARIANT_SEP).append(specs.variant);
881         }
882         return new SingleID(buf, basicID);
883     }
884     return NULL;
885 }
886 
887 /**
888  * Glue method to get around access problems in C++.  This would
889  * ideally be inline but we want to avoid a circular header
890  * dependency.
891  */
createBasicInstance(const UnicodeString & id,const UnicodeString * canonID)892 Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
893     return Transliterator::createBasicInstance(id, canonID);
894 }
895 
896 /**
897  * Initialize static memory.
898  */
init(UErrorCode & status)899 void TransliteratorIDParser::init(UErrorCode &status) {
900     if (SPECIAL_INVERSES != NULL) {
901         return;
902     }
903 
904     Hashtable* special_inverses = new Hashtable(TRUE, status);
905     // Null pointer check
906     if (special_inverses == NULL) {
907     	status = U_MEMORY_ALLOCATION_ERROR;
908     	return;
909     }
910     special_inverses->setValueDeleter(uhash_deleteUnicodeString);
911 
912     umtx_lock(&LOCK);
913     if (SPECIAL_INVERSES == NULL) {
914         SPECIAL_INVERSES = special_inverses;
915         special_inverses = NULL;
916     }
917     umtx_unlock(&LOCK);
918     delete special_inverses; /*null instance*/
919 
920     ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
921 }
922 
923 /**
924  * Free static memory.
925  */
cleanup()926 void TransliteratorIDParser::cleanup() {
927     if (SPECIAL_INVERSES) {
928         delete SPECIAL_INVERSES;
929         SPECIAL_INVERSES = NULL;
930     }
931     umtx_destroy(&LOCK);
932 }
933 
934 U_NAMESPACE_END
935 
936 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
937 
938 //eof
939