• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 2004-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  ucol_sit.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 * Modification history
12 * Date        Name      Comments
13 * 03/12/2004  weiv      Creation
14 */
15 
16 #include "unicode/ustring.h"
17 #include "unicode/udata.h"
18 
19 #include "utracimp.h"
20 #include "ucol_imp.h"
21 #include "ucol_tok.h"
22 #include "cmemory.h"
23 #include "cstring.h"
24 #include "uresimp.h"
25 
26 #if !UCONFIG_NO_COLLATION
27 
/*
 * Identifiers for the items of a short definition string.
 * UCOL_SIT_ITEMS_COUNT is the number of items; it sizes the parsing table
 * (options[]) and CollatorSpec::entries[].
 */
enum OptionsList {
    UCOL_SIT_LANGUAGE            = 0,
    UCOL_SIT_SCRIPT              = 1,
    UCOL_SIT_REGION              = 2,
    UCOL_SIT_VARIANT             = 3,
    UCOL_SIT_KEYWORD             = 4,
    UCOL_SIT_BCP47               = 5,
    UCOL_SIT_STRENGTH            = 6,
    UCOL_SIT_CASE_LEVEL          = 7,
    UCOL_SIT_CASE_FIRST          = 8,
    UCOL_SIT_NUMERIC_COLLATION   = 9,
    UCOL_SIT_ALTERNATE_HANDLING  = 10,
    UCOL_SIT_NORMALIZATION_MODE  = 11,
    UCOL_SIT_FRENCH_COLLATION    = 12,
    UCOL_SIT_HIRAGANA_QUATERNARY = 13,
    UCOL_SIT_VARIABLE_TOP        = 14,
    UCOL_SIT_VARIABLE_TOP_VALUE  = 15,
    UCOL_SIT_ITEMS_COUNT         = 16
};
47 
/*
 * Option starter characters: the single letter that introduces each item
 * of a short definition string.
 */

/* locale parts */
static const char languageArg       = 'L';
static const char scriptArg         = 'Z';
static const char regionArg         = 'R';
static const char variantArg        = 'V';
static const char keywordArg        = 'K';
static const char RFC3066Arg        = 'X';

/* collation attributes */
static const char strengthArg       = 'S';
static const char caseLevelArg      = 'E';
static const char caseFirstArg      = 'C';
static const char numericCollArg    = 'D';
static const char alternateHArg     = 'A';
static const char normArg           = 'N';
static const char frenchCollArg     = 'F';
static const char hiraganaQArg      = 'H';

/* variable top, as a string of code units ('T') or as a value ('B') */
static const char variableTopArg    = 'T';
static const char variableTopValArg = 'B';

/* text inserted in front of the keyword value when a locale name is assembled */
static const char collationKeyword[]  = "@collation=";

/* number of separately stored locale parts, and the capacity of each part */
static const int32_t locElementCount = 5;
static const int32_t locElementCapacity = 32;
/* capacity of the complete locale name held in a CollatorSpec */
static const int32_t loc3066Capacity = 256;
/* size of the stack buffers used in this file */
static const int32_t internalBufferSize = 512;
72 
73 /* structure containing specification of a collator. Initialized
74  * from a short string. Also used to construct a short string from a
75  * collator instance
76  */
77 struct CollatorSpec {
78     char locElements[locElementCount][locElementCapacity];
79     char locale[loc3066Capacity];
80     UColAttributeValue options[UCOL_ATTRIBUTE_COUNT];
81     uint32_t variableTopValue;
82     UChar variableTopString[locElementCapacity];
83     int32_t variableTopStringLen;
84     UBool variableTopSet;
85     struct {
86         const char *start;
87         int32_t len;
88     } entries[UCOL_SIT_ITEMS_COUNT];
89 };
90 
91 
92 /* structure for converting between character attribute
93  * representation and real collation attribute value.
94  */
95 struct AttributeConversion {
96     char letter;
97     UColAttributeValue value;
98 };
99 
100 static const AttributeConversion conversions[12] = {
101     { '1', UCOL_PRIMARY },
102     { '2', UCOL_SECONDARY },
103     { '3', UCOL_TERTIARY },
104     { '4', UCOL_QUATERNARY },
105     { 'D', UCOL_DEFAULT },
106     { 'I', UCOL_IDENTICAL },
107     { 'L', UCOL_LOWER_FIRST },
108     { 'N', UCOL_NON_IGNORABLE },
109     { 'O', UCOL_ON },
110     { 'S', UCOL_SHIFTED },
111     { 'U', UCOL_UPPER_FIRST },
112     { 'X', UCOL_OFF }
113 };
114 
115 
116 static char
ucol_sit_attributeValueToLetter(UColAttributeValue value,UErrorCode * status)117 ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) {
118     uint32_t i = 0;
119     for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
120         if(conversions[i].value == value) {
121             return conversions[i].letter;
122         }
123     }
124     *status = U_ILLEGAL_ARGUMENT_ERROR;
125     return 0;
126 }
127 
128 static UColAttributeValue
ucol_sit_letterToAttributeValue(char letter,UErrorCode * status)129 ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) {
130     uint32_t i = 0;
131     for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) {
132         if(conversions[i].letter == letter) {
133             return conversions[i].value;
134         }
135     }
136     *status = U_ILLEGAL_ARGUMENT_ERROR;
137     return UCOL_DEFAULT;
138 }
139 
140 /* function prototype for functions used to parse a short string */
141 U_CDECL_BEGIN
142 typedef const char* U_CALLCONV
143 ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string,
144                UErrorCode *status);
145 U_CDECL_END
146 
147 U_CDECL_BEGIN
148 static const char* U_CALLCONV
_processLocaleElement(CollatorSpec * spec,uint32_t value,const char * string,UErrorCode * status)149 _processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string,
150                       UErrorCode *status)
151 {
152     int32_t len = 0;
153     do {
154         if(value == 0 || value == 4) {
155             spec->locElements[value][len++] = uprv_tolower(*string);
156         } else {
157             spec->locElements[value][len++] = *string;
158         }
159     } while(*(++string) != '_' && *string && len < locElementCapacity);
160     if(len >= locElementCapacity) {
161         *status = U_BUFFER_OVERFLOW_ERROR;
162         return string;
163     }
164     // don't skip the underscore at the end
165     return string;
166 }
167 U_CDECL_END
168 
169 U_CDECL_BEGIN
170 static const char* U_CALLCONV
_processRFC3066Locale(CollatorSpec * spec,uint32_t,const char * string,UErrorCode * status)171 _processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string,
172                       UErrorCode *status)
173 {
174     char terminator = *string;
175     string++;
176     const char *end = uprv_strchr(string+1, terminator);
177     if(end == NULL || end - string >= loc3066Capacity) {
178         *status = U_BUFFER_OVERFLOW_ERROR;
179         return string;
180     } else {
181         uprv_strncpy(spec->locale, string, end-string);
182         return end+1;
183     }
184 }
185 
186 U_CDECL_END
187 
188 U_CDECL_BEGIN
189 static const char* U_CALLCONV
_processCollatorOption(CollatorSpec * spec,uint32_t option,const char * string,UErrorCode * status)190 _processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string,
191                        UErrorCode *status)
192 {
193     spec->options[option] = ucol_sit_letterToAttributeValue(*string, status);
194     if((*(++string) != '_' && *string) || U_FAILURE(*status)) {
195         *status = U_ILLEGAL_ARGUMENT_ERROR;
196     }
197     return string;
198 }
199 U_CDECL_END
200 
201 
202 static UChar
readHexCodeUnit(const char ** string,UErrorCode * status)203 readHexCodeUnit(const char **string, UErrorCode *status)
204 {
205     UChar result = 0;
206     int32_t value = 0;
207     char c;
208     int32_t noDigits = 0;
209     while((c = **string) != 0 && noDigits < 4) {
210         if( c >= '0' && c <= '9') {
211             value = c - '0';
212         } else if ( c >= 'a' && c <= 'f') {
213             value = c - 'a' + 10;
214         } else if ( c >= 'A' && c <= 'F') {
215             value = c - 'A' + 10;
216         } else {
217             *status = U_ILLEGAL_ARGUMENT_ERROR;
218             return 0;
219         }
220         result = (result << 4) | (UChar)value;
221         noDigits++;
222         (*string)++;
223     }
224     // if the string was terminated before we read 4 digits, set an error
225     if(noDigits < 4) {
226         *status = U_ILLEGAL_ARGUMENT_ERROR;
227     }
228     return result;
229 }
230 
231 U_CDECL_BEGIN
232 static const char* U_CALLCONV
_processVariableTop(CollatorSpec * spec,uint32_t value1,const char * string,UErrorCode * status)233 _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status)
234 {
235     // get four digits
236     int32_t i = 0;
237     if(!value1) {
238         while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') {
239             spec->variableTopString[i++] = readHexCodeUnit(&string, status);
240         }
241         spec->variableTopStringLen = i;
242         if(i == locElementCapacity && *string != 0 && *string != '_') {
243             *status = U_BUFFER_OVERFLOW_ERROR;
244         }
245     } else {
246         spec->variableTopValue = readHexCodeUnit(&string, status);
247     }
248     if(U_SUCCESS(*status)) {
249         spec->variableTopSet = TRUE;
250     }
251     return string;
252 }
253 U_CDECL_END
254 
255 
256 /* Table for parsing short strings */
257 struct ShortStringOptions {
258     char optionStart;
259     ActionFunction *action;
260     uint32_t attr;
261 };
262 
263 static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] =
264 {
265 /* 10 ALTERNATE_HANDLING */   {alternateHArg,     _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate  N, S, D
266 /* 15 VARIABLE_TOP_VALUE */   {variableTopValArg, _processVariableTop,    1 },
267 /* 08 CASE_FIRST */           {caseFirstArg,      _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D
268 /* 09 NUMERIC_COLLATION */    {numericCollArg,    _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan      O, X, D
269 /* 07 CASE_LEVEL */           {caseLevelArg,      _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D
270 /* 12 FRENCH_COLLATION */     {frenchCollArg,     _processCollatorOption, UCOL_FRENCH_COLLATION }, // french     O, X, D
271 /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg,      _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana   O, X, D
272 /* 04 KEYWORD */              {keywordArg,        _processLocaleElement,  4 }, // keyword
273 /* 00 LANGUAGE */             {languageArg,       _processLocaleElement,  0 }, // language
274 /* 11 NORMALIZATION_MODE */   {normArg,           _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm       O, X, D
275 /* 02 REGION */               {regionArg,         _processLocaleElement,  2 }, // region
276 /* 06 STRENGTH */             {strengthArg,       _processCollatorOption, UCOL_STRENGTH }, // strength   1, 2, 3, 4, I, D
277 /* 14 VARIABLE_TOP */         {variableTopArg,    _processVariableTop,    0 },
278 /* 03 VARIANT */              {variantArg,        _processLocaleElement,  3 }, // variant
279 /* 05 RFC3066BIS */           {RFC3066Arg,        _processRFC3066Locale,  0 }, // rfc3066bis locale name
280 /* 01 SCRIPT */               {scriptArg,         _processLocaleElement,  1 }  // script
281 };
282 
283 
284 static
ucol_sit_readOption(const char * start,CollatorSpec * spec,UErrorCode * status)285 const char* ucol_sit_readOption(const char *start, CollatorSpec *spec,
286                             UErrorCode *status)
287 {
288   int32_t i = 0;
289 
290   for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
291       if(*start == options[i].optionStart) {
292           spec->entries[i].start = start;
293           const char* end = options[i].action(spec, options[i].attr, start+1, status);
294           spec->entries[i].len = (int32_t)(end - start);
295           return end;
296       }
297   }
298   *status = U_ILLEGAL_ARGUMENT_ERROR;
299   return start;
300 }
301 
302 static
ucol_sit_initCollatorSpecs(CollatorSpec * spec)303 void ucol_sit_initCollatorSpecs(CollatorSpec *spec)
304 {
305     // reset everything
306     uprv_memset(spec, 0, sizeof(CollatorSpec));
307     // set collation options to default
308     int32_t i = 0;
309     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
310         spec->options[i] = UCOL_DEFAULT;
311     }
312 }
313 
314 static const char*
ucol_sit_readSpecs(CollatorSpec * s,const char * string,UParseError * parseError,UErrorCode * status)315 ucol_sit_readSpecs(CollatorSpec *s, const char *string,
316                         UParseError *parseError, UErrorCode *status)
317 {
318     const char *definition = string;
319     while(U_SUCCESS(*status) && *string) {
320         string = ucol_sit_readOption(string, s, status);
321         // advance over '_'
322         while(*string && *string == '_') {
323             string++;
324         }
325     }
326     if(U_FAILURE(*status)) {
327         parseError->offset = (int32_t)(string - definition);
328     }
329     return string;
330 }
331 
332 static
ucol_sit_dumpSpecs(CollatorSpec * s,char * destination,int32_t capacity,UErrorCode * status)333 int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status)
334 {
335     int32_t i = 0, j = 0;
336     int32_t len = 0;
337     char optName;
338     if(U_SUCCESS(*status)) {
339         for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
340             if(s->entries[i].start) {
341                 if(len) {
342                     if(len < capacity) {
343                         uprv_strcat(destination, "_");
344                     }
345                     len++;
346                 }
347                 optName = *(s->entries[i].start);
348                 if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) {
349                     for(j = 0; j < s->entries[i].len; j++) {
350                         if(len + j < capacity) {
351                             destination[len+j] = uprv_toupper(*(s->entries[i].start+j));
352                         }
353                     }
354                     len += s->entries[i].len;
355                 } else {
356                     len += s->entries[i].len;
357                     if(len < capacity) {
358                         uprv_strncat(destination,s->entries[i].start, s->entries[i].len);
359                     }
360                 }
361             }
362         }
363         return len;
364     } else {
365         return 0;
366     }
367 }
368 
369 static void
ucol_sit_calculateWholeLocale(CollatorSpec * s)370 ucol_sit_calculateWholeLocale(CollatorSpec *s) {
371     // put the locale together, unless we have a done
372     // locale
373     if(s->locale[0] == 0) {
374         // first the language
375         uprv_strcat(s->locale, s->locElements[0]);
376         // then the script, if present
377         if(*(s->locElements[1])) {
378             uprv_strcat(s->locale, "_");
379             uprv_strcat(s->locale, s->locElements[1]);
380         }
381         // then the region, if present
382         if(*(s->locElements[2])) {
383             uprv_strcat(s->locale, "_");
384             uprv_strcat(s->locale, s->locElements[2]);
385         } else if(*(s->locElements[3])) { // if there is a variant, we need an underscore
386             uprv_strcat(s->locale, "_");
387         }
388         // add variant, if there
389         if(*(s->locElements[3])) {
390             uprv_strcat(s->locale, "_");
391             uprv_strcat(s->locale, s->locElements[3]);
392         }
393 
394         // if there is a collation keyword, add that too
395         if(*(s->locElements[4])) {
396             uprv_strcat(s->locale, collationKeyword);
397             uprv_strcat(s->locale, s->locElements[4]);
398         }
399     }
400 }
401 
402 
403 U_CAPI void U_EXPORT2
ucol_prepareShortStringOpen(const char * definition,UBool,UParseError * parseError,UErrorCode * status)404 ucol_prepareShortStringOpen( const char *definition,
405                           UBool,
406                           UParseError *parseError,
407                           UErrorCode *status)
408 {
409     if(U_FAILURE(*status)) return;
410 
411     UParseError internalParseError;
412 
413     if(!parseError) {
414         parseError = &internalParseError;
415     }
416     parseError->line = 0;
417     parseError->offset = 0;
418     parseError->preContext[0] = 0;
419     parseError->postContext[0] = 0;
420 
421 
422     // first we want to pick stuff out of short string.
423     // we'll end up with an UCA version, locale and a bunch of
424     // settings
425 
426     // analyse the string in order to get everything we need.
427     CollatorSpec s;
428     ucol_sit_initCollatorSpecs(&s);
429     ucol_sit_readSpecs(&s, definition, parseError, status);
430     ucol_sit_calculateWholeLocale(&s);
431 
432     char buffer[internalBufferSize];
433     uprv_memset(buffer, 0, internalBufferSize);
434     uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
435 
436     UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status);
437     /* we try to find stuff from keyword */
438     UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status);
439     UResourceBundle *collElem = NULL;
440     char keyBuffer[256];
441     // if there is a keyword, we pick it up and try to get elements
442     if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) {
443       // no keyword. we try to find the default setting, which will give us the keyword value
444       UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status);
445       if(U_SUCCESS(*status)) {
446         int32_t defaultKeyLen = 0;
447         const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status);
448         u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen);
449         keyBuffer[defaultKeyLen] = 0;
450       } else {
451         *status = U_INTERNAL_PROGRAM_ERROR;
452         return;
453       }
454       ures_close(defaultColl);
455     }
456     collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status);
457     ures_close(collElem);
458     ures_close(collations);
459     ures_close(b);
460 }
461 
462 
463 U_CAPI UCollator* U_EXPORT2
ucol_openFromShortString(const char * definition,UBool forceDefaults,UParseError * parseError,UErrorCode * status)464 ucol_openFromShortString( const char *definition,
465                           UBool forceDefaults,
466                           UParseError *parseError,
467                           UErrorCode *status)
468 {
469     UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING);
470     UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition);
471 
472     if(U_FAILURE(*status)) return 0;
473 
474     UParseError internalParseError;
475 
476     if(!parseError) {
477         parseError = &internalParseError;
478     }
479     parseError->line = 0;
480     parseError->offset = 0;
481     parseError->preContext[0] = 0;
482     parseError->postContext[0] = 0;
483 
484 
485     // first we want to pick stuff out of short string.
486     // we'll end up with an UCA version, locale and a bunch of
487     // settings
488 
489     // analyse the string in order to get everything we need.
490     const char *string = definition;
491     CollatorSpec s;
492     ucol_sit_initCollatorSpecs(&s);
493     string = ucol_sit_readSpecs(&s, definition, parseError, status);
494     ucol_sit_calculateWholeLocale(&s);
495 
496     char buffer[internalBufferSize];
497     uprv_memset(buffer, 0, internalBufferSize);
498     uloc_canonicalize(s.locale, buffer, internalBufferSize, status);
499 
500     UCollator *result = ucol_open(buffer, status);
501     int32_t i = 0;
502 
503     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
504         if(s.options[i] != UCOL_DEFAULT) {
505             if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) {
506                 ucol_setAttribute(result, (UColAttribute)i, s.options[i], status);
507             }
508 
509             if(U_FAILURE(*status)) {
510                 parseError->offset = (int32_t)(string - definition);
511                 ucol_close(result);
512                 return NULL;
513             }
514 
515         }
516     }
517     if(s.variableTopSet) {
518         if(s.variableTopString[0]) {
519             ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status);
520         } else { // we set by value, using 'B'
521             ucol_restoreVariableTop(result, s.variableTopValue, status);
522         }
523     }
524 
525 
526     if(U_FAILURE(*status)) { // here it can only be a bogus value
527         ucol_close(result);
528         result = NULL;
529     }
530 
531     UTRACE_EXIT_PTR_STATUS(result, *status);
532     return result;
533 }
534 
535 
/*
 * Appends one item (option letter followed by len chars of src) to the
 * short string being built in result, preceded by '_' unless it is the
 * first item.
 *
 * @param src         item text; len chars are used
 * @param len         number of chars of src; nothing happens if it is
 *                    zero or negative
 * @param result      output buffer, holding *resultSize chars so far
 * @param resultSize  in/out: length of the string built so far. It is
 *                    always advanced by the full size of the item, even if
 *                    the item was not written.
 * @param capacity    size of result in chars
 * @param arg         option letter
 *
 * The item (with separator and terminator) is written only if it fits
 * completely, so result is never written past result[capacity-1] and never
 * ends in a dangling separator.
 */
static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg)
{
    if(len <= 0) {
        return;
    }
    const int32_t start = *resultSize;
    const int32_t separator = (start != 0) ? 1 : 0;
    const int32_t end = start + separator + 1 + len;

    if(end < capacity) {
        char *out = result + start;
        if(separator) {
            *out++ = '_';
        }
        *out++ = arg;
        for(int32_t i = 0; i < len; i++) {
            *out++ = src[i];
        }
        *out = 0;
    }
    *resultSize = end;
}
552 
553 U_CAPI int32_t U_EXPORT2
ucol_getShortDefinitionString(const UCollator * coll,const char * locale,char * dst,int32_t capacity,UErrorCode * status)554 ucol_getShortDefinitionString(const UCollator *coll,
555                               const char *locale,
556                               char *dst,
557                               int32_t capacity,
558                               UErrorCode *status)
559 {
560     if(U_FAILURE(*status)) return 0;
561     char buffer[internalBufferSize];
562     uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
563     int32_t resultSize = 0;
564     char tempbuff[internalBufferSize];
565     char locBuff[internalBufferSize];
566     uprv_memset(buffer, 0, internalBufferSize*sizeof(char));
567     int32_t elementSize = 0;
568     UBool isAvailable = 0;
569     CollatorSpec s;
570     ucol_sit_initCollatorSpecs(&s);
571 
572     if(!locale) {
573         locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, status);
574     }
575     elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status);
576 
577     if(elementSize) {
578         // we should probably canonicalize here...
579         elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status);
580         appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg);
581         elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status);
582         appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg);
583         elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status);
584         appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg);
585         elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status);
586         appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg);
587         elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status);
588         appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg);
589     }
590 
591     int32_t i = 0;
592     UColAttributeValue attribute = UCOL_DEFAULT;
593     for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) {
594         if(options[i].action == _processCollatorOption) {
595             attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status);
596             if(attribute != UCOL_DEFAULT) {
597                 char letter = ucol_sit_attributeValueToLetter(attribute, status);
598                 appendShortStringElement(&letter, 1,
599                     buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart);
600             }
601         }
602     }
603     if(coll->variableTopValueisDefault == FALSE) {
604         //s.variableTopValue = ucol_getVariableTop(coll, status);
605         elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16);
606         appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg);
607     }
608 
609     UParseError parseError;
610     return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status);
611 }
612 
613 U_CAPI int32_t U_EXPORT2
ucol_normalizeShortDefinitionString(const char * definition,char * destination,int32_t capacity,UParseError * parseError,UErrorCode * status)614 ucol_normalizeShortDefinitionString(const char *definition,
615                                     char *destination,
616                                     int32_t capacity,
617                                     UParseError *parseError,
618                                     UErrorCode *status)
619 {
620 
621     if(U_FAILURE(*status)) {
622         return 0;
623     }
624 
625     if(destination) {
626         uprv_memset(destination, 0, capacity*sizeof(char));
627     }
628 
629     UParseError pe;
630     if(!parseError) {
631         parseError = &pe;
632     }
633 
634     // validate
635     CollatorSpec s;
636     ucol_sit_initCollatorSpecs(&s);
637     ucol_sit_readSpecs(&s, definition, parseError, status);
638     return ucol_sit_dumpSpecs(&s, destination, capacity, status);
639 }
640 
641 U_CAPI UColAttributeValue  U_EXPORT2
ucol_getAttributeOrDefault(const UCollator * coll,UColAttribute attr,UErrorCode * status)642 ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status)
643 {
644     if(U_FAILURE(*status) || coll == NULL) {
645       return UCOL_DEFAULT;
646     }
647     switch(attr) {
648     case UCOL_NUMERIC_COLLATION:
649         return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation;
650     case UCOL_HIRAGANA_QUATERNARY_MODE:
651         return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ;
652     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
653         return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation;
654     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
655         return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling;
656     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
657         return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst;
658     case UCOL_CASE_LEVEL: /* do we have an extra case level */
659         return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel;
660     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
661         return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode;
662     case UCOL_STRENGTH:         /* attribute for strength */
663         return coll->strengthisDefault?UCOL_DEFAULT:coll->strength;
664     case UCOL_ATTRIBUTE_COUNT:
665     default:
666         *status = U_ILLEGAL_ARGUMENT_ERROR;
667         break;
668     }
669     return UCOL_DEFAULT;
670 }
671 
672 
673 struct contContext {
674     const UCollator *coll;
675     USet            *conts;
676     USet            *expansions;
677     USet            *removedContractions;
678     UBool           addPrefixes;
679     UErrorCode      *status;
680 };
681 
682 
683 
684 static void
addSpecial(contContext * context,UChar * buffer,int32_t bufLen,uint32_t CE,int32_t leftIndex,int32_t rightIndex,UErrorCode * status)685 addSpecial(contContext *context, UChar *buffer, int32_t bufLen,
686                uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status)
687 {
688   const UCollator *coll = context->coll;
689   USet *contractions = context->conts;
690   USet *expansions = context->expansions;
691   UBool addPrefixes = context->addPrefixes;
692 
693     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
694     uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
695     // we might have a contraction that ends from previous level
696     if(newCE != UCOL_NOT_FOUND) {
697       if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) {
698         addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
699       }
700       if(contractions && rightIndex-leftIndex > 1) {
701             uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
702             if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) {
703               uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
704             }
705       }
706     }
707 
708     UCharOffset++;
709     // check whether we're doing contraction or prefix
710     if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) {
711       if(leftIndex == 0) {
712           *status = U_INTERNAL_PROGRAM_ERROR;
713           return;
714       }
715       --leftIndex;
716       while(*UCharOffset != 0xFFFF) {
717           newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
718           buffer[leftIndex] = *UCharOffset;
719           if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
720               addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status);
721           } else {
722             if(contractions) {
723                 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex);
724             }
725             if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
726               uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex);
727             }
728           }
729           UCharOffset++;
730       }
731     } else if(getCETag(CE) == CONTRACTION_TAG) {
732       if(rightIndex == bufLen-1) {
733           *status = U_INTERNAL_PROGRAM_ERROR;
734           return;
735       }
736       while(*UCharOffset != 0xFFFF) {
737           newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
738           buffer[rightIndex] = *UCharOffset;
739           if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) {
740               addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status);
741           } else {
742             if(contractions) {
743               uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex);
744             }
745             if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) {
746               uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex);
747             }
748           }
749           UCharOffset++;
750       }
751     }
752 
753 }
754 
755 U_CDECL_BEGIN
756 static UBool U_CALLCONV
_processSpecials(const void * context,UChar32 start,UChar32 limit,uint32_t CE)757 _processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE)
758 {
759     UErrorCode *status = ((contContext *)context)->status;
760     USet *expansions = ((contContext *)context)->expansions;
761     USet *removed = ((contContext *)context)->removedContractions;
762     UBool addPrefixes = ((contContext *)context)->addPrefixes;
763     UChar contraction[internalBufferSize];
764     if(isSpecial(CE)) {
765       if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) {
766         while(start < limit && U_SUCCESS(*status)) {
767             // if there are suppressed contractions, we don't
768             // want to add them.
769             if(removed && uset_contains(removed, start)) {
770                 start++;
771                 continue;
772             }
773             // we start our contraction from middle, since we don't know if it
774             // will grow toward right or left
775             contraction[internalBufferSize/2] = (UChar)start;
776             addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status);
777             start++;
778         }
779       } else if(expansions && getCETag(CE) == EXPANSION_TAG) {
780         while(start < limit && U_SUCCESS(*status)) {
781           uset_add(expansions, start++);
782         }
783       }
784     }
785     if(U_FAILURE(*status)) {
786         return FALSE;
787     } else {
788         return TRUE;
789     }
790 }
791 
792 U_CDECL_END
793 
794 
795 
796 /**
797  * Get a set containing the contractions defined by the collator. The set includes
798  * both the UCA contractions and the contractions defined by the collator
799  * @param coll collator
800  * @param conts the set to hold the result
801  * @param status to hold the error code
802  * @return the size of the contraction set
803  */
804 U_CAPI int32_t U_EXPORT2
ucol_getContractions(const UCollator * coll,USet * contractions,UErrorCode * status)805 ucol_getContractions( const UCollator *coll,
806                   USet *contractions,
807                   UErrorCode *status)
808 {
809   ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
810   return uset_getItemCount(contractions);
811 }
812 
813 /**
814  * Get a set containing the expansions defined by the collator. The set includes
815  * both the UCA expansions and the expansions defined by the tailoring
816  * @param coll collator
817  * @param conts the set to hold the result
818  * @param addPrefixes add the prefix contextual elements to contractions
819  * @param status to hold the error code
820  *
821  * @draft ICU 3.4
822  */
823 U_CAPI void U_EXPORT2
ucol_getContractionsAndExpansions(const UCollator * coll,USet * contractions,USet * expansions,UBool addPrefixes,UErrorCode * status)824 ucol_getContractionsAndExpansions( const UCollator *coll,
825                   USet *contractions,
826                   USet *expansions,
827                   UBool addPrefixes,
828                   UErrorCode *status)
829 {
830     if(U_FAILURE(*status)) {
831         return;
832     }
833     if(coll == NULL) {
834         *status = U_ILLEGAL_ARGUMENT_ERROR;
835         return;
836     }
837 
838     if(contractions) {
839       uset_clear(contractions);
840     }
841     if(expansions) {
842       uset_clear(expansions);
843     }
844     int32_t rulesLen = 0;
845     const UChar* rules = ucol_getRules(coll, &rulesLen);
846     UColTokenParser src;
847     ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA,
848                            ucol_tok_getRulesFromBundle, NULL, status);
849 
850     contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status };
851 
852     // Add the UCA contractions
853     c.coll = coll->UCA;
854     utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c);
855 
856     // This is collator specific. Add contractions from a collator
857     c.coll = coll;
858     c.removedContractions =  NULL;
859     utrie_enum(&coll->mapping, NULL, _processSpecials, &c);
860     ucol_tok_closeTokenList(&src);
861 }
862 
863 U_CAPI int32_t U_EXPORT2
ucol_getUnsafeSet(const UCollator * coll,USet * unsafe,UErrorCode * status)864 ucol_getUnsafeSet( const UCollator *coll,
865                   USet *unsafe,
866                   UErrorCode *status)
867 {
868     UChar buffer[internalBufferSize];
869     int32_t len = 0;
870 
871     uset_clear(unsafe);
872 
873     // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
874     static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
875                                     0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
876 
877     // add chars that fail the fcd check
878     uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
879 
880     // add Thai/Lao prevowels
881     uset_addRange(unsafe, 0xe40, 0xe44);
882     uset_addRange(unsafe, 0xec0, 0xec4);
883     // add lead/trail surrogates
884     uset_addRange(unsafe, 0xd800, 0xdfff);
885 
886     USet *contractions = uset_open(0,0);
887 
888     int32_t i = 0, j = 0;
889     int32_t contsSize = ucol_getContractions(coll, contractions, status);
890     UChar32 c = 0;
891     // Contraction set consists only of strings
892     // to get unsafe code points, we need to
893     // break the strings apart and add them to the unsafe set
894     for(i = 0; i < contsSize; i++) {
895         len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
896         if(len > 0) {
897             j = 0;
898             while(j < len) {
899                 U16_NEXT(buffer, j, len, c);
900                 if(j < len) {
901                     uset_add(unsafe, c);
902                 }
903             }
904         }
905     }
906 
907     uset_close(contractions);
908 
909     return uset_size(unsafe);
910 }
911 #endif
912