• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 1999-2011, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  unames.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999oct04
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
21 #include "unicode/utf.h"
22 #include "unicode/utf16.h"
23 #include "ustr_imp.h"
24 #include "umutex.h"
25 #include "cmemory.h"
26 #include "cstring.h"
27 #include "ucln_cmn.h"
28 #include "udataswp.h"
29 #include "uprops.h"
30 
31 /* prototypes ------------------------------------------------------------- */
32 
/* Number of elements of a true array (not a pointer), as an int32_t. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Name and type passed to udata_openChoice() when loading the names data. */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of 32 lines:
 * code>>GROUP_SHIFT identifies the group, code&GROUP_MASK the line in it.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
41 
42 /*
43  * This struct was replaced by explicitly accessing equivalent
44  * fields from triples of uint16_t.
45  * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
46  * which broke the assumption that sizeof(Group)==6 and that the ++ operator
47  * would advance by 6 bytes (3 uint16_t).
48  *
49  * We can't just change the data structure because it's loaded from a data file,
50  * and we don't want to make it less compact, so we changed the access code.
51  *
52  * For details see ICU tickets 6331 and 6008.
53 typedef struct {
54     uint16_t groupMSB,
55              offsetHigh, offsetLow; / * avoid padding * /
56 } Group;
57  */
/* Indexes of the uint16_t fields inside one group triple. */
enum {
    GROUP_MSB,          /* code point >> GROUP_SHIFT of the lines in this group */
    GROUP_OFFSET_HIGH,  /* upper 16 bits of the group's 32-bit offset */
    GROUP_OFFSET_LOW,   /* lower 16 bits of the group's 32-bit offset */
    GROUP_LENGTH        /* number of uint16_t units per group */
};

/*
 * Get the 32-bit group offset.
 * The high half is widened to uint32_t before the shift so that a value
 * of 0x8000 or more is never shifted into the sign bit of a signed type.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))

/* Step a group pointer forward or backward by one whole triple. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
74 
/*
 * Header of one range of algorithmically named code points.
 * For type 0, getAlgName() reads the NUL-terminated name prefix from the
 * bytes that directly follow this struct.
 */
typedef struct {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range (start<=code<=end) */
    uint8_t  type;      /* selects the algorithm; 0: name = prefix hex-digits */
    uint8_t  variant;   /* for type 0: number of hex digits written */
    uint16_t size;      /* presumably the byte size of this range record - TODO confirm */
} AlgorithmicRange;
80 
/*
 * Header of the names data.
 * tokenStringOffset, groupsOffset and groupStringOffset are used in this
 * file as byte offsets counted from the start of this struct.
 * The token table (one uint16_t count followed by the tokens) starts
 * 16 bytes in, i.e. directly after these four fields.
 */
typedef struct {
    uint32_t tokenStringOffset;  /* NUL-terminated token words */
    uint32_t groupsOffset;       /* groups table, see GET_GROUPS() */
    uint32_t groupStringOffset;  /* base for GET_GROUP_OFFSET() values */
    uint32_t algNamesOffset;     /* presumably the algorithmic ranges - TODO confirm */
} UCharNames;

/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that any
 * pointer expression (for example &array[0] or p+1) can be passed safely.
 * The argument is evaluated twice.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
95 
96 typedef struct {
97     const char *otherName;
98     UChar32 code;
99 } FindName;
100 
101 #define DO_FIND_NAME NULL
102 
103 static UDataMemory *uCharNamesData=NULL;
104 static UCharNames *uCharNames=NULL;
105 static UErrorCode gLoadErrorCode=U_ZERO_ERROR;
106 
107 /*
108  * Maximum length of character names (regular & 1.0).
109  */
110 static int32_t gMaxNameLength=0;
111 
112 /*
113  * Set of chars used in character names (regular & 1.0).
114  * Chars are platform-dependent (can be EBCDIC).
115  */
116 static uint32_t gNameSet[8]={ 0 };
117 
/*
 * Extra pseudo-categories used only for extended names; they continue the
 * numbering right after the regular categories.
 * Each expansion is parenthesized so that it behaves as a single value
 * inside any larger expression.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
123 
124 static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
125     "unassigned",
126     "uppercase letter",
127     "lowercase letter",
128     "titlecase letter",
129     "modifier letter",
130     "other letter",
131     "non spacing mark",
132     "enclosing mark",
133     "combining spacing mark",
134     "decimal digit number",
135     "letter number",
136     "other number",
137     "space separator",
138     "line separator",
139     "paragraph separator",
140     "control",
141     "format",
142     "private use area",
143     "surrogate",
144     "dash punctuation",
145     "start punctuation",
146     "end punctuation",
147     "connector punctuation",
148     "other punctuation",
149     "math symbol",
150     "currency symbol",
151     "modifier symbol",
152     "other symbol",
153     "initial punctuation",
154     "final punctuation",
155     "noncharacter",
156     "lead surrogate",
157     "trail surrogate"
158 };
159 
160 /* implementation ----------------------------------------------------------- */
161 
unames_cleanup(void)162 static UBool U_CALLCONV unames_cleanup(void)
163 {
164     if(uCharNamesData) {
165         udata_close(uCharNamesData);
166         uCharNamesData = NULL;
167     }
168     if(uCharNames) {
169         uCharNames = NULL;
170     }
171     gMaxNameLength=0;
172     return TRUE;
173 }
174 
175 static UBool U_CALLCONV
isAcceptable(void *,const char *,const char *,const UDataInfo * pInfo)176 isAcceptable(void * /*context*/,
177              const char * /*type*/, const char * /*name*/,
178              const UDataInfo *pInfo) {
179     return (UBool)(
180         pInfo->size>=20 &&
181         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
182         pInfo->charsetFamily==U_CHARSET_FAMILY &&
183         pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
184         pInfo->dataFormat[1]==0x6e &&
185         pInfo->dataFormat[2]==0x61 &&
186         pInfo->dataFormat[3]==0x6d &&
187         pInfo->formatVersion[0]==1);
188 }
189 
190 static UBool
isDataLoaded(UErrorCode * pErrorCode)191 isDataLoaded(UErrorCode *pErrorCode) {
192     /* load UCharNames from file if necessary */
193     UBool isCached;
194 
195     /* do this because double-checked locking is broken */
196     UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
197 
198     if(!isCached) {
199         UCharNames *names;
200         UDataMemory *data;
201 
202         /* check error code from previous attempt */
203         if(U_FAILURE(gLoadErrorCode)) {
204             *pErrorCode=gLoadErrorCode;
205             return FALSE;
206         }
207 
208         /* open the data outside the mutex block */
209         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
210         if(U_FAILURE(*pErrorCode)) {
211             gLoadErrorCode=*pErrorCode;
212             return FALSE;
213         }
214 
215         names=(UCharNames *)udata_getMemory(data);
216 
217         /* in the mutex block, set the data for this process */
218         {
219             umtx_lock(NULL);
220             if(uCharNames==NULL) {
221                 uCharNamesData=data;
222                 uCharNames=names;
223                 data=NULL;
224                 names=NULL;
225                 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
226             }
227             umtx_unlock(NULL);
228         }
229 
230         /* if a different thread set it first, then close the extra data */
231         if(data!=NULL) {
232             udata_close(data); /* NULL if it was set correctly */
233         }
234     }
235     return TRUE;
236 }
237 
/*
 * Append one char to a bounded output buffer and count it.
 * The char is stored (buffer advanced, bufferLength decremented) only
 * while bufferLength>0; bufferPos is incremented in every case, so it
 * ends up as the full length that an unlimited buffer would have needed.
 *
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 * c is evaluated at most once, and before bufferPos is incremented.
 * The body is wrapped in do { } while(0) so that "WRITE_CHAR(...);" is
 * exactly one statement, also as the body of an unbraced if/else.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/*
 * Internal name choice for ISO comments, numbered right after the regular
 * UCharNameChoice values; expandName() and compareName() map it to the
 * third semicolon-separated field (index 2).
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
247 
248 /*
249  * Important: expandName() and compareName() are almost the same -
250  * apply fixes to both.
251  *
252  * UnicodeData.txt uses ';' as a field separator, so no
253  * field can contain ';' as part of its contents.
254  * In unames.dat, it is marked as token[';']==-1 only if the
255  * semicolon is used in the data file - which is iff we
256  * have Unicode 1.0 names or ISO comments or aliases.
257  * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
258  * although we know that it will never be part of a name.
259  */
260 static uint16_t
expandName(UCharNames * names,const uint8_t * name,uint16_t nameLength,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)261 expandName(UCharNames *names,
262            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
263            char *buffer, uint16_t bufferLength) {
264     uint16_t *tokens=(uint16_t *)names+8;
265     uint16_t token, tokenCount=*tokens++, bufferPos=0;
266     uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
267     uint8_t c;
268 
269     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
270         /*
271          * skip the modern name if it is not requested _and_
272          * if the semicolon byte value is a character, not a token number
273          */
274         if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
275             int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
276             do {
277                 while(nameLength>0) {
278                     --nameLength;
279                     if(*name++==';') {
280                         break;
281                     }
282                 }
283             } while(--fieldIndex>0);
284         } else {
285             /*
286              * the semicolon byte value is a token number, therefore
287              * only modern names are stored in unames.dat and there is no
288              * such requested alternate name here
289              */
290             nameLength=0;
291         }
292     }
293 
294     /* write each letter directly, and write a token word per token */
295     while(nameLength>0) {
296         --nameLength;
297         c=*name++;
298 
299         if(c>=tokenCount) {
300             if(c!=';') {
301                 /* implicit letter */
302                 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
303             } else {
304                 /* finished */
305                 break;
306             }
307         } else {
308             token=tokens[c];
309             if(token==(uint16_t)(-2)) {
310                 /* this is a lead byte for a double-byte token */
311                 token=tokens[c<<8|*name++];
312                 --nameLength;
313             }
314             if(token==(uint16_t)(-1)) {
315                 if(c!=';') {
316                     /* explicit letter */
317                     WRITE_CHAR(buffer, bufferLength, bufferPos, c);
318                 } else {
319                     /* stop, but skip the semicolon if we are seeking
320                        extended names and there was no 2.0 name but there
321                        is a 1.0 name. */
322                     if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
323                         if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
324                             continue;
325                         }
326                     }
327                     /* finished */
328                     break;
329                 }
330             } else {
331                 /* write token word */
332                 uint8_t *tokenString=tokenStrings+token;
333                 while((c=*tokenString++)!=0) {
334                     WRITE_CHAR(buffer, bufferLength, bufferPos, c);
335                 }
336             }
337         }
338     }
339 
340     /* zero-terminate */
341     if(bufferLength>0) {
342         *buffer=0;
343     }
344 
345     return bufferPos;
346 }
347 
348 /*
349  * compareName() is almost the same as expandName() except that it compares
350  * the currently expanded name to an input name.
351  * It returns the match/no match result as soon as possible.
352  */
353 static UBool
compareName(UCharNames * names,const uint8_t * name,uint16_t nameLength,UCharNameChoice nameChoice,const char * otherName)354 compareName(UCharNames *names,
355             const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
356             const char *otherName) {
357     uint16_t *tokens=(uint16_t *)names+8;
358     uint16_t token, tokenCount=*tokens++;
359     uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
360     uint8_t c;
361     const char *origOtherName = otherName;
362 
363     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
364         /*
365          * skip the modern name if it is not requested _and_
366          * if the semicolon byte value is a character, not a token number
367          */
368         if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
369             int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
370             do {
371                 while(nameLength>0) {
372                     --nameLength;
373                     if(*name++==';') {
374                         break;
375                     }
376                 }
377             } while(--fieldIndex>0);
378         } else {
379             /*
380              * the semicolon byte value is a token number, therefore
381              * only modern names are stored in unames.dat and there is no
382              * such requested alternate name here
383              */
384             nameLength=0;
385         }
386     }
387 
388     /* compare each letter directly, and compare a token word per token */
389     while(nameLength>0) {
390         --nameLength;
391         c=*name++;
392 
393         if(c>=tokenCount) {
394             if(c!=';') {
395                 /* implicit letter */
396                 if((char)c!=*otherName++) {
397                     return FALSE;
398                 }
399             } else {
400                 /* finished */
401                 break;
402             }
403         } else {
404             token=tokens[c];
405             if(token==(uint16_t)(-2)) {
406                 /* this is a lead byte for a double-byte token */
407                 token=tokens[c<<8|*name++];
408                 --nameLength;
409             }
410             if(token==(uint16_t)(-1)) {
411                 if(c!=';') {
412                     /* explicit letter */
413                     if((char)c!=*otherName++) {
414                         return FALSE;
415                     }
416                 } else {
417                     /* stop, but skip the semicolon if we are seeking
418                        extended names and there was no 2.0 name but there
419                        is a 1.0 name. */
420                     if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
421                         if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
422                             continue;
423                         }
424                     }
425                     /* finished */
426                     break;
427                 }
428             } else {
429                 /* write token word */
430                 uint8_t *tokenString=tokenStrings+token;
431                 while((c=*tokenString++)!=0) {
432                     if((char)c!=*otherName++) {
433                         return FALSE;
434                     }
435                 }
436             }
437         }
438     }
439 
440     /* complete match? */
441     return (UBool)(*otherName==0);
442 }
443 
/*
 * Get the category of cp, extended by three pseudo-categories:
 * noncharacters are reported as U_NONCHARACTER_CODE_POINT, and surrogates
 * are split into U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
 */
static uint8_t getCharCat(UChar32 cp) {
    uint8_t cat;

    if (U_IS_UNICODE_NONCHAR(cp)) {
        return U_NONCHARACTER_CODE_POINT;
    }

    cat = u_charType(cp);
    if (cat != U_SURROGATE) {
        return cat;
    }
    return U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
}
457 
/*
 * Get the category word for cp as used in extended names.
 * Returns "unknown" for a category beyond the end of charCatNames,
 * i.e. if that table is not up to date.
 */
static const char *getCharCatName(UChar32 cp) {
    uint8_t cat = getCharCat(cp);

    return cat < LENGTHOF(charCatNames) ? charCatNames[cat] : "unknown";
}
470 
getExtName(uint32_t code,char * buffer,uint16_t bufferLength)471 static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
472     const char *catname = getCharCatName(code);
473     uint16_t length = 0;
474 
475     UChar32 cp;
476     int ndigits, i;
477 
478     WRITE_CHAR(buffer, bufferLength, length, '<');
479     while (catname[length - 1]) {
480         WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
481     }
482     WRITE_CHAR(buffer, bufferLength, length, '-');
483     for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
484         ;
485     if (ndigits < 4)
486         ndigits = 4;
487     for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
488         uint8_t v = (uint8_t)(cp & 0xf);
489         buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
490     }
491     buffer += ndigits;
492     length += ndigits;
493     WRITE_CHAR(buffer, bufferLength, length, '>');
494 
495     return length;
496 }
497 
498 /*
499  * getGroup() does a binary search for the group that contains the
500  * Unicode code point "code".
501  * The return value is always a valid Group* that may contain "code"
502  * or else is the highest group before "code".
503  * If the lowest group is after "code", then that one is returned.
504  */
505 static const uint16_t *
getGroup(UCharNames * names,uint32_t code)506 getGroup(UCharNames *names, uint32_t code) {
507     const uint16_t *groups=GET_GROUPS(names);
508     uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
509              start=0,
510              limit=*groups++,
511              number;
512 
513     /* binary search for the group of names that contains the one for code */
514     while(start<limit-1) {
515         number=(uint16_t)((start+limit)/2);
516         if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
517             limit=number;
518         } else {
519             start=number;
520         }
521     }
522 
523     /* return this regardless of whether it is an exact match */
524     return groups+start*GROUP_LENGTH;
525 }
526 
527 /*
528  * expandGroupLengths() reads a block of compressed lengths of 32 strings and
529  * expands them into offsets and lengths for each string.
530  * Lengths are stored with a variable-width encoding in consecutive nibbles:
531  * If a nibble<0xc, then it is the length itself (0=empty string).
532  * If a nibble>=0xc, then it forms a length value with the following nibble.
533  * Calculation see below.
534  * The offsets and lengths arrays must be at least 33 (one more) long because
535  * there is no check here at the end if the last nibble is still used.
536  */
537 static const uint8_t *
expandGroupLengths(const uint8_t * s,uint16_t offsets[LINES_PER_GROUP+1],uint16_t lengths[LINES_PER_GROUP+1])538 expandGroupLengths(const uint8_t *s,
539                    uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
540     /* read the lengths of the 32 strings in this group and get each string's offset */
541     uint16_t i=0, offset=0, length=0;
542     uint8_t lengthByte;
543 
544     /* all 32 lengths must be read to get the offset of the first group string */
545     while(i<LINES_PER_GROUP) {
546         lengthByte=*s++;
547 
548         /* read even nibble - MSBs of lengthByte */
549         if(length>=12) {
550             /* double-nibble length spread across two bytes */
551             length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
552             lengthByte&=0xf;
553         } else if((lengthByte /* &0xf0 */)>=0xc0) {
554             /* double-nibble length spread across this one byte */
555             length=(uint16_t)((lengthByte&0x3f)+12);
556         } else {
557             /* single-nibble length in MSBs */
558             length=(uint16_t)(lengthByte>>4);
559             lengthByte&=0xf;
560         }
561 
562         *offsets++=offset;
563         *lengths++=length;
564 
565         offset+=length;
566         ++i;
567 
568         /* read odd nibble - LSBs of lengthByte */
569         if((lengthByte&0xf0)==0) {
570             /* this nibble was not consumed for a double-nibble length above */
571             length=lengthByte;
572             if(length<12) {
573                 /* single-nibble length in LSBs */
574                 *offsets++=offset;
575                 *lengths++=length;
576 
577                 offset+=length;
578                 ++i;
579             }
580         } else {
581             length=0;   /* prevent double-nibble detection in the next iteration */
582         }
583     }
584 
585     /* now, s is at the first group string */
586     return s;
587 }
588 
589 static uint16_t
expandGroupName(UCharNames * names,const uint16_t * group,uint16_t lineNumber,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)590 expandGroupName(UCharNames *names, const uint16_t *group,
591                 uint16_t lineNumber, UCharNameChoice nameChoice,
592                 char *buffer, uint16_t bufferLength) {
593     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
594     const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
595     s=expandGroupLengths(s, offsets, lengths);
596     return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
597                       buffer, bufferLength);
598 }
599 
600 static uint16_t
getName(UCharNames * names,uint32_t code,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)601 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
602         char *buffer, uint16_t bufferLength) {
603     const uint16_t *group=getGroup(names, code);
604     if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
605         return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
606                                buffer, bufferLength);
607     } else {
608         /* group not found */
609         /* zero-terminate */
610         if(bufferLength>0) {
611             *buffer=0;
612         }
613         return 0;
614     }
615 }
616 
617 /*
618  * enumGroupNames() enumerates all the names in a 32-group
619  * and either calls the enumerator function or finds a given input name.
620  */
621 static UBool
enumGroupNames(UCharNames * names,const uint16_t * group,UChar32 start,UChar32 end,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice)622 enumGroupNames(UCharNames *names, const uint16_t *group,
623                UChar32 start, UChar32 end,
624                UEnumCharNamesFn *fn, void *context,
625                UCharNameChoice nameChoice) {
626     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
627     const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
628 
629     s=expandGroupLengths(s, offsets, lengths);
630     if(fn!=DO_FIND_NAME) {
631         char buffer[200];
632         uint16_t length;
633 
634         while(start<=end) {
635             length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
636             if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
637                 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
638             }
639             /* here, we assume that the buffer is large enough */
640             if(length>0) {
641                 if(!fn(context, start, nameChoice, buffer, length)) {
642                     return FALSE;
643                 }
644             }
645             ++start;
646         }
647     } else {
648         const char *otherName=((FindName *)context)->otherName;
649         while(start<=end) {
650             if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
651                 ((FindName *)context)->code=start;
652                 return FALSE;
653             }
654             ++start;
655         }
656     }
657     return TRUE;
658 }
659 
660 /*
661  * enumExtNames enumerate extended names.
662  * It only needs to do it if it is called with a real function and not
663  * with the dummy DO_FIND_NAME, because u_charFromName() does a check
664  * for extended names by itself.
665  */
666 static UBool
enumExtNames(UChar32 start,UChar32 end,UEnumCharNamesFn * fn,void * context)667 enumExtNames(UChar32 start, UChar32 end,
668              UEnumCharNamesFn *fn, void *context)
669 {
670     if(fn!=DO_FIND_NAME) {
671         char buffer[200];
672         uint16_t length;
673 
674         while(start<=end) {
675             buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
676             /* here, we assume that the buffer is large enough */
677             if(length>0) {
678                 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
679                     return FALSE;
680                 }
681             }
682             ++start;
683         }
684     }
685 
686     return TRUE;
687 }
688 
689 static UBool
enumNames(UCharNames * names,UChar32 start,UChar32 limit,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice)690 enumNames(UCharNames *names,
691           UChar32 start, UChar32 limit,
692           UEnumCharNamesFn *fn, void *context,
693           UCharNameChoice nameChoice) {
694     uint16_t startGroupMSB, endGroupMSB, groupCount;
695     const uint16_t *group, *groupLimit;
696 
697     startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
698     endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
699 
700     /* find the group that contains start, or the highest before it */
701     group=getGroup(names, start);
702 
703     if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
704         /* enumerate synthetic names between start and the group start */
705         UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
706         if(extLimit>limit) {
707             extLimit=limit;
708         }
709         if(!enumExtNames(start, extLimit-1, fn, context)) {
710             return FALSE;
711         }
712         start=extLimit;
713     }
714 
715     if(startGroupMSB==endGroupMSB) {
716         if(startGroupMSB==group[GROUP_MSB]) {
717             /* if start and limit-1 are in the same group, then enumerate only in that one */
718             return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
719         }
720     } else {
721         const uint16_t *groups=GET_GROUPS(names);
722         groupCount=*groups++;
723         groupLimit=groups+groupCount*GROUP_LENGTH;
724 
725         if(startGroupMSB==group[GROUP_MSB]) {
726             /* enumerate characters in the partial start group */
727             if((start&GROUP_MASK)!=0) {
728                 if(!enumGroupNames(names, group,
729                                    start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
730                                    fn, context, nameChoice)) {
731                     return FALSE;
732                 }
733                 group=NEXT_GROUP(group); /* continue with the next group */
734             }
735         } else if(startGroupMSB>group[GROUP_MSB]) {
736             /* make sure that we start enumerating with the first group after start */
737             const uint16_t *nextGroup=NEXT_GROUP(group);
738             if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
739                 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
740                 if (end > limit) {
741                     end = limit;
742                 }
743                 if (!enumExtNames(start, end - 1, fn, context)) {
744                     return FALSE;
745                 }
746             }
747             group=nextGroup;
748         }
749 
750         /* enumerate entire groups between the start- and end-groups */
751         while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
752             const uint16_t *nextGroup;
753             start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
754             if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
755                 return FALSE;
756             }
757             nextGroup=NEXT_GROUP(group);
758             if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
759                 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
760                 if (end > limit) {
761                     end = limit;
762                 }
763                 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
764                     return FALSE;
765                 }
766             }
767             group=nextGroup;
768         }
769 
770         /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
771         if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
772             return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
773         } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
774             UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
775             if (next > start) {
776                 start = next;
777             }
778         } else {
779             return TRUE;
780         }
781     }
782 
783     /* we have not found a group, which means everything is made of
784        extended names. */
785     if (nameChoice == U_EXTENDED_CHAR_NAME) {
786         if (limit > UCHAR_MAX_VALUE + 1) {
787             limit = UCHAR_MAX_VALUE + 1;
788         }
789         return enumExtNames(start, limit - 1, fn, context);
790     }
791 
792     return TRUE;
793 }
794 
/*
 * Write the suffix of a factorized algorithmic name.
 * code is split into one index per factor (mixed-radix digits, the last
 * factor varying fastest); for each factor, s holds factors[i]
 * NUL-terminated element strings in a row, and the element selected by
 * the index is appended to buffer.
 *
 * @param factors      the factors of this algorithm
 * @param count        number of factors
 * @param s            suffix elements
 * @param code         value to factorize; the caller guarantees that the
 *                     first index does not need a modulo operation
 * @param indexes      out: the index computed for each factor
 * @param elementBases out, may be NULL: for each factor, the start of
 *                     its first element string
 * @param elements     out, may be NULL: for each factor, the start of
 *                     the selected element string
 * @param buffer       output, NUL-terminated if there is room
 * @param bufferLength capacity of buffer
 * @return full length of the suffix, even if it was truncated
 */
static uint16_t
writeFactorSuffix(const uint16_t *factors, uint16_t count,
                  const char *s, /* suffix elements */
                  uint32_t code,
                  uint16_t indexes[8], /* output fields from here */
                  const char *elementBases[8], const char *elements[8],
                  char *buffer, uint16_t bufferLength) {
    uint16_t i, skip, bufferPos=0;
    uint16_t last=(uint16_t)(count-1);  /* index of the last factor */
    char c;

    /* determine the indexes by modulo arithmetic, last factor first */
    for(i=last; i>0; --i) {
        uint16_t factor=factors[i];
        indexes[i]=(uint16_t)(code%factor);
        code/=factor;
    }
    /* what is left over is the first index; no modulus is needed for it */
    indexes[0]=(uint16_t)code;

    /* write each element */
    for(i=0;; ++i) {
        if(elementBases!=NULL) {
            *elementBases++=s;
        }

        /* skip indexes[i] strings */
        for(skip=indexes[i]; skip>0; --skip) {
            while(*s++!=0) {}
        }
        if(elements!=NULL) {
            *elements++=s;
        }

        /* write element */
        while((c=*s++)!=0) {
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
        }

        /* nothing needs to be skipped after the last factor's element */
        if(i>=last) {
            break;
        }

        /* skip the rest of the strings for this factors[i] */
        for(skip=(uint16_t)(factors[i]-indexes[i]-1); skip>0; --skip) {
            while(*s++!=0) {}
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}
868 
869 /*
870  * Important:
871  * Parts of findAlgName() are almost the same as some of getAlgName().
872  * Fixes must be applied to both.
873  */
874 static uint16_t
getAlgName(AlgorithmicRange * range,uint32_t code,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)875 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
876         char *buffer, uint16_t bufferLength) {
877     uint16_t bufferPos=0;
878 
879     /* Only the normative character name can be algorithmic. */
880     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
881         /* zero-terminate */
882         if(bufferLength>0) {
883             *buffer=0;
884         }
885         return 0;
886     }
887 
888     switch(range->type) {
889     case 0: {
890         /* name = prefix hex-digits */
891         const char *s=(const char *)(range+1);
892         char c;
893 
894         uint16_t i, count;
895 
896         /* copy prefix */
897         while((c=*s++)!=0) {
898             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
899         }
900 
901         /* write hexadecimal code point value */
902         count=range->variant;
903 
904         /* zero-terminate */
905         if(count<bufferLength) {
906             buffer[count]=0;
907         }
908 
909         for(i=count; i>0;) {
910             if(--i<bufferLength) {
911                 c=(char)(code&0xf);
912                 if(c<10) {
913                     c+='0';
914                 } else {
915                     c+='A'-10;
916                 }
917                 buffer[i]=c;
918             }
919             code>>=4;
920         }
921 
922         bufferPos+=count;
923         break;
924     }
925     case 1: {
926         /* name = prefix factorized-elements */
927         uint16_t indexes[8];
928         const uint16_t *factors=(const uint16_t *)(range+1);
929         uint16_t count=range->variant;
930         const char *s=(const char *)(factors+count);
931         char c;
932 
933         /* copy prefix */
934         while((c=*s++)!=0) {
935             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
936         }
937 
938         bufferPos+=writeFactorSuffix(factors, count,
939                                      s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
940         break;
941     }
942     default:
943         /* undefined type */
944         /* zero-terminate */
945         if(bufferLength>0) {
946             *buffer=0;
947         }
948         break;
949     }
950 
951     return bufferPos;
952 }
953 
954 /*
955  * Important: enumAlgNames() and findAlgName() are almost the same.
956  * Any fix must be applied to both.
957  */
958 static UBool
enumAlgNames(AlgorithmicRange * range,UChar32 start,UChar32 limit,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice)959 enumAlgNames(AlgorithmicRange *range,
960              UChar32 start, UChar32 limit,
961              UEnumCharNamesFn *fn, void *context,
962              UCharNameChoice nameChoice) {
963     char buffer[200];
964     uint16_t length;
965 
966     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
967         return TRUE;
968     }
969 
970     switch(range->type) {
971     case 0: {
972         char *s, *end;
973         char c;
974 
975         /* get the full name of the start character */
976         length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
977         if(length<=0) {
978             return TRUE;
979         }
980 
981         /* call the enumerator function with this first character */
982         if(!fn(context, start, nameChoice, buffer, length)) {
983             return FALSE;
984         }
985 
986         /* go to the end of the name; all these names have the same length */
987         end=buffer;
988         while(*end!=0) {
989             ++end;
990         }
991 
992         /* enumerate the rest of the names */
993         while(++start<limit) {
994             /* increment the hexadecimal number on a character-basis */
995             s=end;
996             for (;;) {
997                 c=*--s;
998                 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
999                     *s=(char)(c+1);
1000                     break;
1001                 } else if(c=='9') {
1002                     *s='A';
1003                     break;
1004                 } else if(c=='F') {
1005                     *s='0';
1006                 }
1007             }
1008 
1009             if(!fn(context, start, nameChoice, buffer, length)) {
1010                 return FALSE;
1011             }
1012         }
1013         break;
1014     }
1015     case 1: {
1016         uint16_t indexes[8];
1017         const char *elementBases[8], *elements[8];
1018         const uint16_t *factors=(const uint16_t *)(range+1);
1019         uint16_t count=range->variant;
1020         const char *s=(const char *)(factors+count);
1021         char *suffix, *t;
1022         uint16_t prefixLength, i, idx;
1023 
1024         char c;
1025 
1026         /* name = prefix factorized-elements */
1027 
1028         /* copy prefix */
1029         suffix=buffer;
1030         prefixLength=0;
1031         while((c=*s++)!=0) {
1032             *suffix++=c;
1033             ++prefixLength;
1034         }
1035 
1036         /* append the suffix of the start character */
1037         length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1038                                               s, (uint32_t)start-range->start,
1039                                               indexes, elementBases, elements,
1040                                               suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1041 
1042         /* call the enumerator function with this first character */
1043         if(!fn(context, start, nameChoice, buffer, length)) {
1044             return FALSE;
1045         }
1046 
1047         /* enumerate the rest of the names */
1048         while(++start<limit) {
1049             /* increment the indexes in lexical order bound by the factors */
1050             i=count;
1051             for (;;) {
1052                 idx=(uint16_t)(indexes[--i]+1);
1053                 if(idx<factors[i]) {
1054                     /* skip one index and its element string */
1055                     indexes[i]=idx;
1056                     s=elements[i];
1057                     while(*s++!=0) {
1058                     }
1059                     elements[i]=s;
1060                     break;
1061                 } else {
1062                     /* reset this index to 0 and its element string to the first one */
1063                     indexes[i]=0;
1064                     elements[i]=elementBases[i];
1065                 }
1066             }
1067 
1068             /* to make matters a little easier, just append all elements to the suffix */
1069             t=suffix;
1070             length=prefixLength;
1071             for(i=0; i<count; ++i) {
1072                 s=elements[i];
1073                 while((c=*s++)!=0) {
1074                     *t++=c;
1075                     ++length;
1076                 }
1077             }
1078             /* zero-terminate */
1079             *t=0;
1080 
1081             if(!fn(context, start, nameChoice, buffer, length)) {
1082                 return FALSE;
1083             }
1084         }
1085         break;
1086     }
1087     default:
1088         /* undefined type */
1089         break;
1090     }
1091 
1092     return TRUE;
1093 }
1094 
1095 /*
1096  * findAlgName() is almost the same as enumAlgNames() except that it
1097  * returns the code point for a name if it fits into the range.
1098  * It returns 0xffff otherwise.
1099  */
1100 static UChar32
findAlgName(AlgorithmicRange * range,UCharNameChoice nameChoice,const char * otherName)1101 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1102     UChar32 code;
1103 
1104     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1105         return 0xffff;
1106     }
1107 
1108     switch(range->type) {
1109     case 0: {
1110         /* name = prefix hex-digits */
1111         const char *s=(const char *)(range+1);
1112         char c;
1113 
1114         uint16_t i, count;
1115 
1116         /* compare prefix */
1117         while((c=*s++)!=0) {
1118             if((char)c!=*otherName++) {
1119                 return 0xffff;
1120             }
1121         }
1122 
1123         /* read hexadecimal code point value */
1124         count=range->variant;
1125         code=0;
1126         for(i=0; i<count; ++i) {
1127             c=*otherName++;
1128             if('0'<=c && c<='9') {
1129                 code=(code<<4)|(c-'0');
1130             } else if('A'<=c && c<='F') {
1131                 code=(code<<4)|(c-'A'+10);
1132             } else {
1133                 return 0xffff;
1134             }
1135         }
1136 
1137         /* does it fit into the range? */
1138         if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1139             return code;
1140         }
1141         break;
1142     }
1143     case 1: {
1144         char buffer[64];
1145         uint16_t indexes[8];
1146         const char *elementBases[8], *elements[8];
1147         const uint16_t *factors=(const uint16_t *)(range+1);
1148         uint16_t count=range->variant;
1149         const char *s=(const char *)(factors+count), *t;
1150         UChar32 start, limit;
1151         uint16_t i, idx;
1152 
1153         char c;
1154 
1155         /* name = prefix factorized-elements */
1156 
1157         /* compare prefix */
1158         while((c=*s++)!=0) {
1159             if((char)c!=*otherName++) {
1160                 return 0xffff;
1161             }
1162         }
1163 
1164         start=(UChar32)range->start;
1165         limit=(UChar32)(range->end+1);
1166 
1167         /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1168         writeFactorSuffix(factors, count, s, 0,
1169                           indexes, elementBases, elements, buffer, sizeof(buffer));
1170 
1171         /* compare the first suffix */
1172         if(0==uprv_strcmp(otherName, buffer)) {
1173             return start;
1174         }
1175 
1176         /* enumerate and compare the rest of the suffixes */
1177         while(++start<limit) {
1178             /* increment the indexes in lexical order bound by the factors */
1179             i=count;
1180             for (;;) {
1181                 idx=(uint16_t)(indexes[--i]+1);
1182                 if(idx<factors[i]) {
1183                     /* skip one index and its element string */
1184                     indexes[i]=idx;
1185                     s=elements[i];
1186                     while(*s++!=0) {}
1187                     elements[i]=s;
1188                     break;
1189                 } else {
1190                     /* reset this index to 0 and its element string to the first one */
1191                     indexes[i]=0;
1192                     elements[i]=elementBases[i];
1193                 }
1194             }
1195 
1196             /* to make matters a little easier, just compare all elements of the suffix */
1197             t=otherName;
1198             for(i=0; i<count; ++i) {
1199                 s=elements[i];
1200                 while((c=*s++)!=0) {
1201                     if(c!=*t++) {
1202                         s=""; /* does not match */
1203                         i=99;
1204                     }
1205                 }
1206             }
1207             if(i<99 && *t==0) {
1208                 return start;
1209             }
1210         }
1211         break;
1212     }
1213     default:
1214         /* undefined type */
1215         break;
1216     }
1217 
1218     return 0xffff;
1219 }
1220 
1221 /* sets of name characters, maximum name lengths ---------------------------- */
1222 
/*
 * 256-bit character sets stored as uint32_t[8]: a character value, reduced to
 * uint8_t, selects word (value>>5) and bit (value&0x1f).
 * The argument c is parenthesized so that the uint8_t cast applies to the
 * whole argument expression and not only to its first operand.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1225 
/*
 * Adds every character of the NUL-terminated string s to set
 * and returns the number of characters in s.
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *p;

    for(p=s; *p!=0; ++p) {
        SET_ADD(set, *p);
    }
    return (int32_t)(p-s);
}
1237 
1238 static int32_t
calcAlgNameSetsLengths(int32_t maxNameLength)1239 calcAlgNameSetsLengths(int32_t maxNameLength) {
1240     AlgorithmicRange *range;
1241     uint32_t *p;
1242     uint32_t rangeCount;
1243     int32_t length;
1244 
1245     /* enumerate algorithmic ranges */
1246     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1247     rangeCount=*p;
1248     range=(AlgorithmicRange *)(p+1);
1249     while(rangeCount>0) {
1250         switch(range->type) {
1251         case 0:
1252             /* name = prefix + (range->variant times) hex-digits */
1253             /* prefix */
1254             length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1255             if(length>maxNameLength) {
1256                 maxNameLength=length;
1257             }
1258             break;
1259         case 1: {
1260             /* name = prefix factorized-elements */
1261             const uint16_t *factors=(const uint16_t *)(range+1);
1262             const char *s;
1263             int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1264 
1265             /* prefix length */
1266             s=(const char *)(factors+count);
1267             length=calcStringSetLength(gNameSet, s);
1268             s+=length+1; /* start of factor suffixes */
1269 
1270             /* get the set and maximum factor suffix length for each factor */
1271             for(i=0; i<count; ++i) {
1272                 maxFactorLength=0;
1273                 for(factor=factors[i]; factor>0; --factor) {
1274                     factorLength=calcStringSetLength(gNameSet, s);
1275                     s+=factorLength+1;
1276                     if(factorLength>maxFactorLength) {
1277                         maxFactorLength=factorLength;
1278                     }
1279                 }
1280                 length+=maxFactorLength;
1281             }
1282 
1283             if(length>maxNameLength) {
1284                 maxNameLength=length;
1285             }
1286             break;
1287         }
1288         default:
1289             /* unknown type */
1290             break;
1291         }
1292 
1293         range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1294         --rangeCount;
1295     }
1296     return maxNameLength;
1297 }
1298 
1299 static int32_t
calcExtNameSetsLengths(int32_t maxNameLength)1300 calcExtNameSetsLengths(int32_t maxNameLength) {
1301     int32_t i, length;
1302 
1303     for(i=0; i<LENGTHOF(charCatNames); ++i) {
1304         /*
1305          * for each category, count the length of the category name
1306          * plus 9=
1307          * 2 for <>
1308          * 1 for -
1309          * 6 for most hex digits per code point
1310          */
1311         length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1312         if(length>maxNameLength) {
1313             maxNameLength=length;
1314         }
1315     }
1316     return maxNameLength;
1317 }
1318 
/*
 * Processes one name field of a compressed line: reads bytes from *pLine up to
 * lineLimit or up to and including a ';' separator, adds all characters of the
 * expanded name to set, and returns the expanded length.
 *
 * Bytes >= tokenCount and tokens marked -1 stand for themselves; a token
 * marked -2 is the lead byte of a double-byte token; any other token value is
 * an offset into tokenStrings.
 * tokenLengths (optional, may be NULL) caches token word lengths, with 0
 * meaning "not computed yet".
 * On return *pLine points behind the bytes that were consumed.
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *cursor=*pLine;
    int32_t total=0;

    for(;;) {
        uint16_t unit, token;
        int32_t wordLength;

        if(cursor==lineLimit) {
            break;
        }
        unit=*cursor++;
        if(unit==(uint8_t)';') {
            break;
        }

        if(unit>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, unit);
            ++total;
            continue;
        }

        token=tokens[unit];
        if(token==(uint16_t)(-2)) {
            /* lead byte: combine it with the following trail byte */
            unit=(uint16_t)(unit<<8|*cursor++);
            token=tokens[unit];
        }
        if(token==(uint16_t)(-1)) {
            /* explicit letter */
            SET_ADD(set, unit);
            ++total;
            continue;
        }

        /* token word: take its length from the cache if there is one */
        if(tokenLengths==NULL) {
            wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
        } else {
            wordLength=tokenLengths[unit];
            if(wordLength==0) {
                wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                tokenLengths[unit]=(int8_t)wordLength;
            }
        }
        total+=wordLength;
    }

    *pLine=cursor;
    return total;
}
1363 
1364 static void
calcGroupNameSetsLengths(int32_t maxNameLength)1365 calcGroupNameSetsLengths(int32_t maxNameLength) {
1366     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1367 
1368     uint16_t *tokens=(uint16_t *)uCharNames+8;
1369     uint16_t tokenCount=*tokens++;
1370     uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1371 
1372     int8_t *tokenLengths;
1373 
1374     const uint16_t *group;
1375     const uint8_t *s, *line, *lineLimit;
1376 
1377     int32_t groupCount, lineNumber, length;
1378 
1379     tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1380     if(tokenLengths!=NULL) {
1381         uprv_memset(tokenLengths, 0, tokenCount);
1382     }
1383 
1384     group=GET_GROUPS(uCharNames);
1385     groupCount=*group++;
1386 
1387     /* enumerate all groups */
1388     while(groupCount>0) {
1389         s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1390         s=expandGroupLengths(s, offsets, lengths);
1391 
1392         /* enumerate all lines in each group */
1393         for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1394             line=s+offsets[lineNumber];
1395             length=lengths[lineNumber];
1396             if(length==0) {
1397                 continue;
1398             }
1399 
1400             lineLimit=line+length;
1401 
1402             /* read regular name */
1403             length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1404             if(length>maxNameLength) {
1405                 maxNameLength=length;
1406             }
1407             if(line==lineLimit) {
1408                 continue;
1409             }
1410 
1411             /* read Unicode 1.0 name */
1412             length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1413             if(length>maxNameLength) {
1414                 maxNameLength=length;
1415             }
1416             if(line==lineLimit) {
1417                 continue;
1418             }
1419 
1420             /* read ISO comment */
1421             /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1422         }
1423 
1424         group=NEXT_GROUP(group);
1425         --groupCount;
1426     }
1427 
1428     if(tokenLengths!=NULL) {
1429         uprv_free(tokenLengths);
1430     }
1431 
1432     /* set gMax... - name length last for threading */
1433     gMaxNameLength=maxNameLength;
1434 }
1435 
1436 static UBool
calcNameSetsLengths(UErrorCode * pErrorCode)1437 calcNameSetsLengths(UErrorCode *pErrorCode) {
1438     static const char extChars[]="0123456789ABCDEF<>-";
1439     int32_t i, maxNameLength;
1440 
1441     if(gMaxNameLength!=0) {
1442         return TRUE;
1443     }
1444 
1445     if(!isDataLoaded(pErrorCode)) {
1446         return FALSE;
1447     }
1448 
1449     /* set hex digits, used in various names, and <>-, used in extended names */
1450     for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1451         SET_ADD(gNameSet, extChars[i]);
1452     }
1453 
1454     /* set sets and lengths from algorithmic names */
1455     maxNameLength=calcAlgNameSetsLengths(0);
1456 
1457     /* set sets and lengths from extended names */
1458     maxNameLength=calcExtNameSetsLengths(maxNameLength);
1459 
1460     /* set sets and lengths from group names, set global maximum values */
1461     calcGroupNameSetsLengths(maxNameLength);
1462 
1463     return TRUE;
1464 }
1465 
1466 /* public API --------------------------------------------------------------- */
1467 
1468 U_CAPI int32_t U_EXPORT2
u_charName(UChar32 code,UCharNameChoice nameChoice,char * buffer,int32_t bufferLength,UErrorCode * pErrorCode)1469 u_charName(UChar32 code, UCharNameChoice nameChoice,
1470            char *buffer, int32_t bufferLength,
1471            UErrorCode *pErrorCode) {
1472     AlgorithmicRange *algRange;
1473     uint32_t *p;
1474     uint32_t i;
1475     int32_t length;
1476 
1477     /* check the argument values */
1478     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1479         return 0;
1480     } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1481               bufferLength<0 || (bufferLength>0 && buffer==NULL)
1482     ) {
1483         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1484         return 0;
1485     }
1486 
1487     if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1488         return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1489     }
1490 
1491     length=0;
1492 
1493     /* try algorithmic names first */
1494     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1495     i=*p;
1496     algRange=(AlgorithmicRange *)(p+1);
1497     while(i>0) {
1498         if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1499             length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1500             break;
1501         }
1502         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1503         --i;
1504     }
1505 
1506     if(i==0) {
1507         if (nameChoice == U_EXTENDED_CHAR_NAME) {
1508             length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1509             if (!length) {
1510                 /* extended character name */
1511                 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1512             }
1513         } else {
1514             /* normal character name */
1515             length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1516         }
1517     }
1518 
1519     return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1520 }
1521 
1522 U_CAPI int32_t U_EXPORT2
u_getISOComment(UChar32,char * dest,int32_t destCapacity,UErrorCode * pErrorCode)1523 u_getISOComment(UChar32 /*c*/,
1524                 char *dest, int32_t destCapacity,
1525                 UErrorCode *pErrorCode) {
1526     /* check the argument values */
1527     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1528         return 0;
1529     } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1530         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1531         return 0;
1532     }
1533 
1534     return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1535 }
1536 
1537 U_CAPI UChar32 U_EXPORT2
u_charFromName(UCharNameChoice nameChoice,const char * name,UErrorCode * pErrorCode)1538 u_charFromName(UCharNameChoice nameChoice,
1539                const char *name,
1540                UErrorCode *pErrorCode) {
1541     char upper[120], lower[120];
1542     FindName findName;
1543     AlgorithmicRange *algRange;
1544     uint32_t *p;
1545     uint32_t i;
1546     UChar32 cp = 0;
1547     char c0;
1548     UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1549 
1550     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1551         return error;
1552     }
1553 
1554     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1555         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1556         return error;
1557     }
1558 
1559     if(!isDataLoaded(pErrorCode)) {
1560         return error;
1561     }
1562 
1563     /* construct the uppercase and lowercase of the name first */
1564     for(i=0; i<sizeof(upper); ++i) {
1565         if((c0=*name++)!=0) {
1566             upper[i]=uprv_toupper(c0);
1567             lower[i]=uprv_tolower(c0);
1568         } else {
1569             upper[i]=lower[i]=0;
1570             break;
1571         }
1572     }
1573     if(i==sizeof(upper)) {
1574         /* name too long, there is no such character */
1575         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1576         return error;
1577     }
1578 
1579     /* try extended names first */
1580     if (lower[0] == '<') {
1581         if (nameChoice == U_EXTENDED_CHAR_NAME) {
1582             if (lower[--i] == '>') {
1583                 for (--i; lower[i] && lower[i] != '-'; --i) {
1584                 }
1585 
1586                 if (lower[i] == '-') { /* We've got a category. */
1587                     uint32_t cIdx;
1588 
1589                     lower[i] = 0;
1590 
1591                     for (++i; lower[i] != '>'; ++i) {
1592                         if (lower[i] >= '0' && lower[i] <= '9') {
1593                             cp = (cp << 4) + lower[i] - '0';
1594                         } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1595                             cp = (cp << 4) + lower[i] - 'a' + 10;
1596                         } else {
1597                             *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1598                             return error;
1599                         }
1600                     }
1601 
1602                     /* Now validate the category name.
1603                        We could use a binary search, or a trie, if
1604                        we really wanted to. */
1605 
1606                     for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1607 
1608                         if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1609                             if (getCharCat(cp) == cIdx) {
1610                                 return cp;
1611                             }
1612                             break;
1613                         }
1614                     }
1615                 }
1616             }
1617         }
1618 
1619         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1620         return error;
1621     }
1622 
1623     /* try algorithmic names now */
1624     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1625     i=*p;
1626     algRange=(AlgorithmicRange *)(p+1);
1627     while(i>0) {
1628         if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1629             return cp;
1630         }
1631         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1632         --i;
1633     }
1634 
1635     /* normal character name */
1636     findName.otherName=upper;
1637     findName.code=error;
1638     enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1639     if (findName.code == error) {
1640          *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1641     }
1642     return findName.code;
1643 }
1644 
1645 U_CAPI void U_EXPORT2
u_enumCharNames(UChar32 start,UChar32 limit,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice,UErrorCode * pErrorCode)1646 u_enumCharNames(UChar32 start, UChar32 limit,
1647                 UEnumCharNamesFn *fn,
1648                 void *context,
1649                 UCharNameChoice nameChoice,
1650                 UErrorCode *pErrorCode) {
1651     AlgorithmicRange *algRange;
1652     uint32_t *p;
1653     uint32_t i;
1654 
1655     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1656         return;
1657     }
1658 
1659     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1660         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1661         return;
1662     }
1663 
1664     if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1665         limit = UCHAR_MAX_VALUE + 1;
1666     }
1667     if((uint32_t)start>=(uint32_t)limit) {
1668         return;
1669     }
1670 
1671     if(!isDataLoaded(pErrorCode)) {
1672         return;
1673     }
1674 
1675     /* interleave the data-driven ones with the algorithmic ones */
1676     /* iterate over all algorithmic ranges; assume that they are in ascending order */
1677     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1678     i=*p;
1679     algRange=(AlgorithmicRange *)(p+1);
1680     while(i>0) {
1681         /* enumerate the character names before the current algorithmic range */
1682         /* here: start<limit */
1683         if((uint32_t)start<algRange->start) {
1684             if((uint32_t)limit<=algRange->start) {
1685                 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1686                 return;
1687             }
1688             if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1689                 return;
1690             }
1691             start=(UChar32)algRange->start;
1692         }
1693         /* enumerate the character names in the current algorithmic range */
1694         /* here: algRange->start<=start<limit */
1695         if((uint32_t)start<=algRange->end) {
1696             if((uint32_t)limit<=(algRange->end+1)) {
1697                 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1698                 return;
1699             }
1700             if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1701                 return;
1702             }
1703             start=(UChar32)algRange->end+1;
1704         }
1705         /* continue to the next algorithmic range (here: start<limit) */
1706         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1707         --i;
1708     }
1709     /* enumerate the character names after the last algorithmic range */
1710     enumNames(uCharNames, start, limit, fn, context, nameChoice);
1711 }
1712 
1713 U_CAPI int32_t U_EXPORT2
uprv_getMaxCharNameLength()1714 uprv_getMaxCharNameLength() {
1715     UErrorCode errorCode=U_ZERO_ERROR;
1716     if(calcNameSetsLengths(&errorCode)) {
1717         return gMaxNameLength;
1718     } else {
1719         return 0;
1720     }
1721 }
1722 
1723 /**
1724  * Converts the char set cset into a Unicode set uset.
1725  * @param cset Set of 256 bit flags corresponding to a set of chars.
1726  * @param uset USet to receive characters. Existing contents are deleted.
1727  */
1728 static void
charSetToUSet(uint32_t cset[8],const USetAdder * sa)1729 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1730     UChar us[256];
1731     char cs[256];
1732 
1733     int32_t i, length;
1734     UErrorCode errorCode;
1735 
1736     errorCode=U_ZERO_ERROR;
1737 
1738     if(!calcNameSetsLengths(&errorCode)) {
1739         return;
1740     }
1741 
1742     /* build a char string with all chars that are used in character names */
1743     length=0;
1744     for(i=0; i<256; ++i) {
1745         if(SET_CONTAINS(cset, i)) {
1746             cs[length++]=(char)i;
1747         }
1748     }
1749 
1750     /* convert the char string to a UChar string */
1751     u_charsToUChars(cs, us, length);
1752 
1753     /* add each UChar to the USet */
1754     for(i=0; i<length; ++i) {
1755         if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1756             sa->add(sa->set, us[i]);
1757         }
1758     }
1759 }
1760 
1761 /**
1762  * Fills set with characters that are used in Unicode character names.
1763  * @param set USet to receive characters.
1764  */
1765 U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(const USetAdder * sa)1766 uprv_getCharNameCharacters(const USetAdder *sa) {
1767     charSetToUSet(gNameSet, sa);
1768 }
1769 
1770 /* data swapping ------------------------------------------------------------ */
1771 
1772 /*
1773  * The token table contains non-negative entries for token bytes,
1774  * and -1 for bytes that represent themselves in the data file's charset.
1775  * -2 entries are used for lead bytes.
1776  *
1777  * Direct bytes (-1 entries) must be translated from the input charset family
1778  * to the output charset family.
1779  * makeTokenMap() writes a permutation mapping for this.
1780  * Use it once for single-/lead-byte tokens and once more for all trail byte
1781  * tokens. (';' is an unused trail byte marked with -1.)
1782  */
1783 static void
makeTokenMap(const UDataSwapper * ds,int16_t tokens[],uint16_t tokenCount,uint8_t map[256],UErrorCode * pErrorCode)1784 makeTokenMap(const UDataSwapper *ds,
1785              int16_t tokens[], uint16_t tokenCount,
1786              uint8_t map[256],
1787              UErrorCode *pErrorCode) {
1788     UBool usedOutChar[256];
1789     uint16_t i, j;
1790     uint8_t c1, c2;
1791 
1792     if(U_FAILURE(*pErrorCode)) {
1793         return;
1794     }
1795 
1796     if(ds->inCharset==ds->outCharset) {
1797         /* Same charset family: identity permutation */
1798         for(i=0; i<256; ++i) {
1799             map[i]=(uint8_t)i;
1800         }
1801     } else {
1802         uprv_memset(map, 0, 256);
1803         uprv_memset(usedOutChar, 0, 256);
1804 
1805         if(tokenCount>256) {
1806             tokenCount=256;
1807         }
1808 
1809         /* set the direct bytes (byte 0 always maps to itself) */
1810         for(i=1; i<tokenCount; ++i) {
1811             if(tokens[i]==-1) {
1812                 /* convert the direct byte character */
1813                 c1=(uint8_t)i;
1814                 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1815                 if(U_FAILURE(*pErrorCode)) {
1816                     udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1817                                      i, ds->inCharset);
1818                     return;
1819                 }
1820 
1821                 /* enter the converted character into the map and mark it used */
1822                 map[c1]=c2;
1823                 usedOutChar[c2]=TRUE;
1824             }
1825         }
1826 
1827         /* set the mappings for the rest of the permutation */
1828         for(i=j=1; i<tokenCount; ++i) {
1829             /* set mappings that were not set for direct bytes */
1830             if(map[i]==0) {
1831                 /* set an output byte value that was not used as an output byte above */
1832                 while(usedOutChar[j]) {
1833                     ++j;
1834                 }
1835                 map[i]=(uint8_t)j++;
1836             }
1837         }
1838 
1839         /*
1840          * leave mappings at tokenCount and above unset if tokenCount<256
1841          * because they won't be used
1842          */
1843     }
1844 }
1845 
/**
 * Swaps a unames.icu data block from the input format described by ds
 * to its output format, or, when length<0, only preflights its size.
 *
 * Steps when swapping (length>=0):
 *  - swap the data header via udata_swapDataHeader() and verify
 *    dataFormat "unam", formatVersion[0]==1
 *  - swap the 4 initial offsets
 *  - swap the token table and permute it with the maps from makeTokenMap()
 *  - swap the token strings
 *  - swap the group table
 *  - if the charset families differ, remap the bytes of the group strings
 *  - swap the algorithmic ranges
 *
 * @param ds         swapper that provides the read/swap functions and the
 *                   input/output charset families
 * @param inData     input data, starting with the data header
 * @param length     number of input bytes including the header;
 *                   a negative value selects preflighting
 * @param outData    output buffer; the code supports inData==outData
 *                   (see the temporary token array below)
 * @param pErrorCode in/out error code; set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized format or range type,
 *                   U_INDEX_OUTOFBOUNDS_ERROR for too few bytes, and
 *                   U_MEMORY_ALLOCATION_ERROR if the temporary token array
 *                   cannot be allocated
 * @return headerSize plus the offset just past the last algorithmic range;
 *         0 for each error that is detected in this function
 */
1846 U_CAPI int32_t U_EXPORT2
uchar_swapNames(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1847 uchar_swapNames(const UDataSwapper *ds,
1848                 const void *inData, int32_t length, void *outData,
1849                 UErrorCode *pErrorCode) {
1850     const UDataInfo *pInfo;
1851     int32_t headerSize;
1852 
1853     const uint8_t *inBytes;
1854     uint8_t *outBytes;
1855 
1856     uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1857              offset, i, count, stringsCount;
1858 
1859     const AlgorithmicRange *inRange;
1860     AlgorithmicRange *outRange;
1861 
1862     /* udata_swapDataHeader checks the arguments */
1863     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1864     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1865         return 0;
1866     }
1867 
1868     /* check data format and format version */
         /* the UDataInfo is read from 4 bytes past the start of inData */
1869     pInfo=(const UDataInfo *)((const char *)inData+4);
1870     if(!(
1871         pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
1872         pInfo->dataFormat[1]==0x6e &&
1873         pInfo->dataFormat[2]==0x61 &&
1874         pInfo->dataFormat[3]==0x6d &&
1875         pInfo->formatVersion[0]==1
1876     )) {
1877         udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1878                          pInfo->dataFormat[0], pInfo->dataFormat[1],
1879                          pInfo->dataFormat[2], pInfo->dataFormat[3],
1880                          pInfo->formatVersion[0]);
1881         *pErrorCode=U_UNSUPPORTED_ERROR;
1882         return 0;
1883     }
1884 
         /* from here on, all offsets are relative to the first byte after the header */
1885     inBytes=(const uint8_t *)inData+headerSize;
1886     outBytes=(uint8_t *)outData+headerSize;
         /*
          * The fourth of the initial uint32_t offsets ([3]) is algNamesOffset.
          * When preflighting it is read without any length check;
          * otherwise at least 20 bytes must follow the header and
          * algNamesOffset must not lie beyond them.
          */
1887     if(length<0) {
1888         algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1889     } else {
1890         length-=headerSize;
1891         if( length<20 ||
1892             (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1893         ) {
1894             udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1895                              length);
1896             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1897             return 0;
1898         }
1899     }
1900 
1901     if(length<0) {
1902         /* preflighting: iterate through algorithmic ranges */
             /* nothing is written; offset ends up just past the last range */
1903         offset=algNamesOffset;
1904         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1905         offset+=4;
1906 
1907         for(i=0; i<count; ++i) {
1908             inRange=(const AlgorithmicRange *)(inBytes+offset);
1909             offset+=ds->readUInt16(inRange->size);
1910         }
1911     } else {
1912         /* swap data */
1913         const uint16_t *p;
1914         uint16_t *q, *temp;
1915 
1916         int16_t tokens[512];
1917         uint16_t tokenCount;
1918 
1919         uint8_t map[256], trailMap[256];
1920 
1921         /* copy the data for inaccessible bytes */
1922         if(inBytes!=outBytes) {
1923             uprv_memcpy(outBytes, inBytes, length);
1924         }
1925 
1926         /* the initial 4 offsets first */
             /* the offsets are read from the input before the 16 bytes are swapped */
1927         tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1928         groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1929         groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1930         ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1931 
1932         /*
1933          * now the tokens table
1934          * it needs to be permutated along with the compressed name strings
1935          */
             /* the token table starts right after the 16 bytes of offsets */
1936         p=(const uint16_t *)(inBytes+16);
1937         q=(uint16_t *)(outBytes+16);
1938 
1939         /* read and swap the tokenCount */
1940         tokenCount=ds->readUInt16(*p);
1941         ds->swapArray16(ds, p, 2, q, pErrorCode);
1942         ++p;
1943         ++q;
1944 
1945         /* read the first 512 tokens and make the token maps */
1946         if(tokenCount<=512) {
1947             count=tokenCount;
1948         } else {
1949             count=512;
1950         }
1951         for(i=0; i<count; ++i) {
1952             tokens[i]=udata_readInt16(ds, p[i]);
1953         }
1954         for(; i<512; ++i) {
1955             tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1956         }
             /*
              * map[] is built from tokens[0..255] (single-/lead-byte tokens),
              * trailMap[] from tokens[256..511] (trail-byte tokens)
              */
1957         makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1958         makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1959         if(U_FAILURE(*pErrorCode)) {
1960             return 0;
1961         }
1962 
1963         /*
1964          * swap and permutate the tokens
1965          * go through a temporary array to support in-place swapping
1966          */
1967         temp=(uint16_t *)uprv_malloc(tokenCount*2);
1968         if(temp==NULL) {
1969             udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1970                              tokenCount);
1971             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1972             return 0;
1973         }
1974 
1975         /* swap and permutate single-/lead-byte tokens */
             /* token i is written to position map[i] of the temporary array */
1976         for(i=0; i<tokenCount && i<256; ++i) {
1977             ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1978         }
1979 
1980         /* swap and permutate trail-byte tokens */
             /*
              * i&0xffffff00 keeps the 256-entry block of the token index,
              * trailMap[] permutes the position within that block
              */
1981         for(; i<tokenCount; ++i) {
1982             ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1983         }
1984 
1985         /* copy the result into the output and free the temporary array */
1986         uprv_memcpy(q, temp, tokenCount*2);
1987         uprv_free(temp);
1988 
1989         /*
1990          * swap the token strings but not a possible padding byte after
1991          * the terminating NUL of the last string
1992          */
             /* the token strings occupy the bytes from tokenStringOffset up to groupsOffset */
1993         udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1994                                     outBytes+tokenStringOffset, pErrorCode);
1995         if(U_FAILURE(*pErrorCode)) {
1996             udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1997             return 0;
1998         }
1999 
2000         /* swap the group table */
             /* one uint16_t count followed by 3 uint16_t per group */
2001         count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
2002         ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
2003                            outBytes+groupsOffset, pErrorCode);
2004 
2005         /*
2006          * swap the group strings
2007          * swap the string bytes but not the nibble-encoded string lengths
2008          */
             /* with equal charset families the bytes copied above are already correct */
2009         if(ds->inCharset!=ds->outCharset) {
2010             uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
2011 
2012             const uint8_t *inStrings, *nextInStrings;
2013             uint8_t *outStrings;
2014 
2015             uint8_t c;
2016 
2017             inStrings=inBytes+groupStringOffset;
2018             outStrings=outBytes+groupStringOffset;
2019 
                 /* the group strings extend from groupStringOffset up to algNamesOffset */
2020             stringsCount=algNamesOffset-groupStringOffset;
2021 
2022             /* iterate through string groups until only a few padding bytes are left */
2023             while(stringsCount>32) {
2024                 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2025 
2026                 /* move past the length bytes */
2027                 stringsCount-=(uint32_t)(nextInStrings-inStrings);
2028                 outStrings+=nextInStrings-inStrings;
2029                 inStrings=nextInStrings;
2030 
2031                 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2032                 stringsCount-=count;
2033 
2034                 /* swap the string bytes using map[] and trailMap[] */
                     /*
                      * NOTE(review): count is unsigned; if a lead byte were the last
                      * remaining byte (count==1), count-=2 would wrap around.
                      * Presumably well-formed data never ends a group that way - confirm.
                      */
2035                 while(count>0) {
2036                     c=*inStrings++;
2037                     *outStrings++=map[c];
2038                     if(tokens[c]!=-2) {
2039                         --count;
2040                     } else {
2041                         /* token lead byte: swap the trail byte, too */
2042                         *outStrings++=trailMap[*inStrings++];
2043                         count-=2;
2044                     }
2045                 }
2046             }
2047         }
2048 
2049         /* swap the algorithmic ranges */
             /* a uint32_t range count comes first, then the variable-size ranges */
2050         offset=algNamesOffset;
2051         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2052         ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2053         offset+=4;
2054 
2055         for(i=0; i<count; ++i) {
                 /*
                  * NOTE(review): this only rejects offset>length; a range that
                  * starts at or shortly before length is still dereferenced below.
                  * Confirm whether a stricter check is wanted.
                  */
2056             if(offset>(uint32_t)length) {
2057                 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2058                                  length, i);
2059                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2060                 return 0;
2061             }
2062 
2063             inRange=(const AlgorithmicRange *)(inBytes+offset);
2064             outRange=(AlgorithmicRange *)(outBytes+offset);
                 /* size is read in input byte order before it gets swapped below */
2065             offset+=ds->readUInt16(inRange->size);
2066 
                 /*
                  * swap the first 8 bytes of the range as 32-bit units and the
                  * size field as one 16-bit unit
                  * (the 8 bytes are presumably the start and end fields -
                  * confirm against the AlgorithmicRange declaration)
                  */
2067             ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2068             ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
                 /*
                  * NOTE(review): type and variant are read from inRange after the
                  * swaps above and are not swapped themselves; with in-place
                  * swapping this relies on them being single-byte fields - confirm.
                  */
2069             switch(inRange->type) {
2070             case 0:
2071                 /* swap prefix string */
                     /* the NUL-terminated prefix directly follows the range struct */
2072                 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2073                                     outRange+1, pErrorCode);
2074                 if(U_FAILURE(*pErrorCode)) {
2075                     udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2076                                      i);
2077                     return 0;
2078                 }
2079                 break;
2080             case 1:
2081                 {
2082                     /* swap factors and the prefix and factor strings */
2083                     uint32_t factorsCount;
2084 
                         /* variant holds the number of uint16_t factors that follow the struct */
2085                     factorsCount=inRange->variant;
2086                     p=(const uint16_t *)(inRange+1);
2087                     q=(uint16_t *)(outRange+1);
2088                     ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2089 
2090                     /* swap the strings, up to the last terminating NUL */
                         /*
                          * offset already points to the end of this range, so the
                          * strings run from behind the factors to inBytes+offset;
                          * trailing non-NUL bytes are left out of the swap
                          */
2091                     p+=factorsCount;
2092                     q+=factorsCount;
2093                     stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2094                     while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2095                         --stringsCount;
2096                     }
2097                     ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2098                 }
2099                 break;
2100             default:
2101                 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2102                                  inRange->type, i);
2103                 *pErrorCode=U_UNSUPPORTED_ERROR;
2104                 return 0;
2105             }
2106         }
2107     }
2108 
         /* total size: header plus everything up to the end of the last algorithmic range */
2109     return headerSize+(int32_t)offset;
2110 }
2111 
2112 /*
2113  * Hey, Emacs, please set the following:
2114  *
2115  * Local Variables:
2116  * indent-tabs-mode: nil
2117  * End:
2118  *
2119  */
2120