• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 * Author: Alan Liu
9 * Created: October 30 2002
10 * Since: ICU 2.4
11 * 2010nov19 Markus Scherer  Rewrite for formatVersion 2.
12 **********************************************************************
13 */
14 #include "propname.h"
15 #include "unicode/uchar.h"
16 #include "unicode/udata.h"
17 #include "unicode/uscript.h"
18 #include "umutex.h"
19 #include "cmemory.h"
20 #include "cstring.h"
21 #include "uarrsort.h"
22 #include "uinvchar.h"
23 
24 #define INCLUDED_FROM_PROPNAME_CPP
25 #include "propname_data.h"
26 
27 U_CDECL_BEGIN
28 
29 /**
30  * Get the next non-ignorable ASCII character from a property name
31  * and lowercases it.
32  * @return ((advance count for the name)<<8)|character
33  */
34 static inline int32_t
getASCIIPropertyNameChar(const char * name)35 getASCIIPropertyNameChar(const char *name) {
36     int32_t i;
37     char c;
38 
39     /* Ignore delimiters '-', '_', and ASCII White_Space */
40     for(i=0;
41         (c=name[i++])==0x2d || c==0x5f ||
42         c==0x20 || (0x09<=c && c<=0x0d);
43     ) {}
44 
45     if(c!=0) {
46         return (i<<8)|(uint8_t)uprv_asciitolower((char)c);
47     } else {
48         return i<<8;
49     }
50 }
51 
52 /**
53  * Get the next non-ignorable EBCDIC character from a property name
54  * and lowercases it.
55  * @return ((advance count for the name)<<8)|character
56  */
57 static inline int32_t
getEBCDICPropertyNameChar(const char * name)58 getEBCDICPropertyNameChar(const char *name) {
59     int32_t i;
60     char c;
61 
62     /* Ignore delimiters '-', '_', and EBCDIC White_Space */
63     for(i=0;
64         (c=name[i++])==0x60 || c==0x6d ||
65         c==0x40 || c==0x05 || c==0x15 || c==0x25 || c==0x0b || c==0x0c || c==0x0d;
66     ) {}
67 
68     if(c!=0) {
69         return (i<<8)|(uint8_t)uprv_ebcdictolower((char)c);
70     } else {
71         return i<<8;
72     }
73 }
74 
75 /**
76  * Unicode property names and property value names are compared "loosely".
77  *
78  * UCD.html 4.0.1 says:
79  *   For all property names, property value names, and for property values for
80  *   Enumerated, Binary, or Catalog properties, use the following
81  *   loose matching rule:
82  *
83  *   LM3. Ignore case, whitespace, underscore ('_'), and hyphens.
84  *
85  * This function does just that, for (char *) name strings.
86  * It is almost identical to ucnv_compareNames() but also ignores
87  * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
88  *
89  * @internal
90  */
91 
92 U_CAPI int32_t U_EXPORT2
uprv_compareASCIIPropertyNames(const char * name1,const char * name2)93 uprv_compareASCIIPropertyNames(const char *name1, const char *name2) {
94     int32_t rc, r1, r2;
95 
96     for(;;) {
97         r1=getASCIIPropertyNameChar(name1);
98         r2=getASCIIPropertyNameChar(name2);
99 
100         /* If we reach the ends of both strings then they match */
101         if(((r1|r2)&0xff)==0) {
102             return 0;
103         }
104 
105         /* Compare the lowercased characters */
106         if(r1!=r2) {
107             rc=(r1&0xff)-(r2&0xff);
108             if(rc!=0) {
109                 return rc;
110             }
111         }
112 
113         name1+=r1>>8;
114         name2+=r2>>8;
115     }
116 }
117 
118 U_CAPI int32_t U_EXPORT2
uprv_compareEBCDICPropertyNames(const char * name1,const char * name2)119 uprv_compareEBCDICPropertyNames(const char *name1, const char *name2) {
120     int32_t rc, r1, r2;
121 
122     for(;;) {
123         r1=getEBCDICPropertyNameChar(name1);
124         r2=getEBCDICPropertyNameChar(name2);
125 
126         /* If we reach the ends of both strings then they match */
127         if(((r1|r2)&0xff)==0) {
128             return 0;
129         }
130 
131         /* Compare the lowercased characters */
132         if(r1!=r2) {
133             rc=(r1&0xff)-(r2&0xff);
134             if(rc!=0) {
135                 return rc;
136             }
137         }
138 
139         name1+=r1>>8;
140         name2+=r2>>8;
141     }
142 }
143 
144 U_CDECL_END
145 
146 U_NAMESPACE_BEGIN
147 
findProperty(int32_t property)148 int32_t PropNameData::findProperty(int32_t property) {
149     int32_t i=1;  // valueMaps index, initially after numRanges
150     for(int32_t numRanges=valueMaps[0]; numRanges>0; --numRanges) {
151         // Read and skip the start and limit of this range.
152         int32_t start=valueMaps[i];
153         int32_t limit=valueMaps[i+1];
154         i+=2;
155         if(property<start) {
156             break;
157         }
158         if(property<limit) {
159             return i+(property-start)*2;
160         }
161         i+=(limit-start)*2;  // Skip all entries for this range.
162     }
163     return 0;
164 }
165 
findPropertyValueNameGroup(int32_t valueMapIndex,int32_t value)166 int32_t PropNameData::findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value) {
167     if(valueMapIndex==0) {
168         return 0;  // The property does not have named values.
169     }
170     ++valueMapIndex;  // Skip the BytesTrie offset.
171     int32_t numRanges=valueMaps[valueMapIndex++];
172     if(numRanges<0x10) {
173         // Ranges of values.
174         for(; numRanges>0; --numRanges) {
175             // Read and skip the start and limit of this range.
176             int32_t start=valueMaps[valueMapIndex];
177             int32_t limit=valueMaps[valueMapIndex+1];
178             valueMapIndex+=2;
179             if(value<start) {
180                 break;
181             }
182             if(value<limit) {
183                 return valueMaps[valueMapIndex+value-start];
184             }
185             valueMapIndex+=limit-start;  // Skip all entries for this range.
186         }
187     } else {
188         // List of values.
189         int32_t valuesStart=valueMapIndex;
190         int32_t nameGroupOffsetsStart=valueMapIndex+numRanges-0x10;
191         do {
192             int32_t v=valueMaps[valueMapIndex];
193             if(value<v) {
194                 break;
195             }
196             if(value==v) {
197                 return valueMaps[nameGroupOffsetsStart+valueMapIndex-valuesStart];
198             }
199         } while(++valueMapIndex<nameGroupOffsetsStart);
200     }
201     return 0;
202 }
203 
getName(const char * nameGroup,int32_t nameIndex)204 const char *PropNameData::getName(const char *nameGroup, int32_t nameIndex) {
205     int32_t numNames=*nameGroup++;
206     if(nameIndex<0 || numNames<=nameIndex) {
207         return nullptr;
208     }
209     // Skip nameIndex names.
210     for(; nameIndex>0; --nameIndex) {
211         nameGroup=uprv_strchr(nameGroup, 0)+1;
212     }
213     if(*nameGroup==0) {
214         return nullptr;  // no name (Property[Value]Aliases.txt has "n/a")
215     }
216     return nameGroup;
217 }
218 
containsName(BytesTrie & trie,const char * name)219 UBool PropNameData::containsName(BytesTrie &trie, const char *name) {
220     if(name==nullptr) {
221         return false;
222     }
223     UStringTrieResult result=USTRINGTRIE_NO_VALUE;
224     char c;
225     while((c=*name++)!=0) {
226         c=uprv_invCharToLowercaseAscii(c);
227         // Ignore delimiters '-', '_', and ASCII White_Space.
228         if(c==0x2d || c==0x5f || c==0x20 || (0x09<=c && c<=0x0d)) {
229             continue;
230         }
231         if(!USTRINGTRIE_HAS_NEXT(result)) {
232             return false;
233         }
234         result=trie.next((uint8_t)c);
235     }
236     return USTRINGTRIE_HAS_VALUE(result);
237 }
238 
getPropertyName(int32_t property,int32_t nameChoice)239 const char *PropNameData::getPropertyName(int32_t property, int32_t nameChoice) {
240     int32_t valueMapIndex=findProperty(property);
241     if(valueMapIndex==0) {
242         return nullptr;  // Not a known property.
243     }
244     return getName(nameGroups+valueMaps[valueMapIndex], nameChoice);
245 }
246 
getPropertyValueName(int32_t property,int32_t value,int32_t nameChoice)247 const char *PropNameData::getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice) {
248     int32_t valueMapIndex=findProperty(property);
249     if(valueMapIndex==0) {
250         return nullptr;  // Not a known property.
251     }
252     int32_t nameGroupOffset=findPropertyValueNameGroup(valueMaps[valueMapIndex+1], value);
253     if(nameGroupOffset==0) {
254         return nullptr;
255     }
256     return getName(nameGroups+nameGroupOffset, nameChoice);
257 }
258 
getPropertyOrValueEnum(int32_t bytesTrieOffset,const char * alias)259 int32_t PropNameData::getPropertyOrValueEnum(int32_t bytesTrieOffset, const char *alias) {
260     BytesTrie trie(bytesTries+bytesTrieOffset);
261     if(containsName(trie, alias)) {
262         return trie.getValue();
263     } else {
264         return UCHAR_INVALID_CODE;
265     }
266 }
267 
getPropertyEnum(const char * alias)268 int32_t PropNameData::getPropertyEnum(const char *alias) {
269     return getPropertyOrValueEnum(0, alias);
270 }
271 
getPropertyValueEnum(int32_t property,const char * alias)272 int32_t PropNameData::getPropertyValueEnum(int32_t property, const char *alias) {
273     int32_t valueMapIndex=findProperty(property);
274     if(valueMapIndex==0) {
275         return UCHAR_INVALID_CODE;  // Not a known property.
276     }
277     valueMapIndex=valueMaps[valueMapIndex+1];
278     if(valueMapIndex==0) {
279         return UCHAR_INVALID_CODE;  // The property does not have named values.
280     }
281     // valueMapIndex is the start of the property's valueMap,
282     // where the first word is the BytesTrie offset.
283     return getPropertyOrValueEnum(valueMaps[valueMapIndex], alias);
284 }
285 U_NAMESPACE_END
286 
287 //----------------------------------------------------------------------
288 // Public API implementation
289 
290 U_CAPI const char* U_EXPORT2
u_getPropertyName(UProperty property,UPropertyNameChoice nameChoice)291 u_getPropertyName(UProperty property,
292                   UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
293     // The nameChoice is really an integer with a couple of named constants.
294     // Unicode allows for names other than short and long ones.
295     // If present, these will be returned for U_LONG_PROPERTY_NAME + i, where i=1, 2,...
296     U_NAMESPACE_USE
297     return PropNameData::getPropertyName(property, nameChoice);
298 }
299 
300 U_CAPI UProperty U_EXPORT2
u_getPropertyEnum(const char * alias)301 u_getPropertyEnum(const char* alias) {
302     U_NAMESPACE_USE
303     return (UProperty)PropNameData::getPropertyEnum(alias);
304 }
305 
306 U_CAPI const char* U_EXPORT2
u_getPropertyValueName(UProperty property,int32_t value,UPropertyNameChoice nameChoice)307 u_getPropertyValueName(UProperty property,
308                        int32_t value,
309                        UPropertyNameChoice nameChoice) UPRV_NO_SANITIZE_UNDEFINED {
310     // The nameChoice is really an integer with a couple of named constants.
311     // Unicode allows for names other than short and long ones.
312     // If present, these will be returned for U_LONG_PROPERTY_NAME + i, where i=1, 2,...
313     U_NAMESPACE_USE
314     return PropNameData::getPropertyValueName(property, value, nameChoice);
315 }
316 
317 U_CAPI int32_t U_EXPORT2
u_getPropertyValueEnum(UProperty property,const char * alias)318 u_getPropertyValueEnum(UProperty property,
319                        const char* alias) {
320     U_NAMESPACE_USE
321     return PropNameData::getPropertyValueEnum(property, alias);
322 }
323 
324 U_CAPI const char*  U_EXPORT2
uscript_getName(UScriptCode scriptCode)325 uscript_getName(UScriptCode scriptCode){
326     return u_getPropertyValueName(UCHAR_SCRIPT, scriptCode,
327                                   U_LONG_PROPERTY_NAME);
328 }
329 
330 U_CAPI const char*  U_EXPORT2
uscript_getShortName(UScriptCode scriptCode)331 uscript_getShortName(UScriptCode scriptCode){
332     return u_getPropertyValueName(UCHAR_SCRIPT, scriptCode,
333                                   U_SHORT_PROPERTY_NAME);
334 }
335