• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 1999-2009, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  unames.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999oct04
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
21 #include "ustr_imp.h"
22 #include "umutex.h"
23 #include "cmemory.h"
24 #include "cstring.h"
25 #include "ucln_cmn.h"
26 #include "udataswp.h"
27 #include "uprops.h"
28 
29 /* prototypes ------------------------------------------------------------- */
30 
/* Number of elements of a true array (must not be used on a pointer). */
#define LENGTHOF(array) (sizeof(array)/sizeof(*(array)))

/* Name and type with which the names data is opened through udata. */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of LINES_PER_GROUP consecutive code points:
 * code>>GROUP_SHIFT selects the group, code&GROUP_MASK the line inside it.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
39 
40 /*
41  * This struct was replaced by explicitly accessing equivalent
42  * fields from triples of uint16_t.
43  * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
44  * which broke the assumption that sizeof(Group)==6 and that the ++ operator
45  * would advance by 6 bytes (3 uint16_t).
46  *
47  * We can't just change the data structure because it's loaded from a data file,
48  * and we don't want to make it less compact, so we changed the access code.
49  *
50  * For details see ICU tickets 6331 and 6008.
51 typedef struct {
52     uint16_t groupMSB,
53              offsetHigh, offsetLow; / * avoid padding * /
54 } Group;
55  */
/* Indexes of the uint16_t fields inside one group triple. */
enum {
    GROUP_MSB = 0,          /* code point >> GROUP_SHIFT of the group */
    GROUP_OFFSET_HIGH = 1,  /* upper 16 bits of the group offset */
    GROUP_OFFSET_LOW = 2,   /* lower 16 bits of the group offset */
    GROUP_LENGTH = 3        /* number of uint16_t per group */
};
62 
/*
 * Get the 32-bit group offset.
 * The two 16-bit halves are combined in unsigned arithmetic so that a high
 * half with its top bit set is never left-shifted into the sign bit of a
 * signed integer (which would be undefined behavior).
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16)|(group)[GROUP_OFFSET_LOW]))

/* Step one group triple (GROUP_LENGTH uint16_t) forward or backward. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
72 
/*
 * Header of one range of algorithmically named code points.
 * The data that belongs to the range follows the header directly
 * (the code reads it at range+1).
 */
typedef struct {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range */
    uint8_t type;       /* selects the naming algorithm */
    uint8_t variant;    /* algorithm parameter, e.g. the number of hex digits */
    uint16_t size;      /* NOTE(review): presumably the size of the whole range record - confirm */
} AlgorithmicRange;
78 
/*
 * Indexes at the start of the loaded names data.
 * Each offset is added to the address of this struct itself.
 */
typedef struct {
    uint32_t tokenStringOffset;
    uint32_t groupsOffset;
    uint32_t groupStringOffset;
    uint32_t algNamesOffset;
} UCharNames;
82 
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be used with any pointer expression and inside larger expressions,
 * for example GET_GROUPS(names)[0].
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
93 
94 typedef struct {
95     const char *otherName;
96     UChar32 code;
97 } FindName;
98 
99 #define DO_FIND_NAME NULL
100 
101 static UDataMemory *uCharNamesData=NULL;
102 static UCharNames *uCharNames=NULL;
103 static UErrorCode gLoadErrorCode=U_ZERO_ERROR;
104 
105 /*
106  * Maximum length of character names (regular & 1.0).
107  */
108 static int32_t gMaxNameLength=0;
109 
110 /*
111  * Set of chars used in character names (regular & 1.0).
112  * Chars are platform-dependent (can be EBCDIC).
113  */
114 static uint32_t gNameSet[8]={ 0 };
115 
/*
 * Additional pseudo-categories, used only for extended names, numbered
 * directly after the standard general categories.
 * Every expansion is parenthesized so that the macros stay correct inside
 * larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
121 
122 static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
123     "unassigned",
124     "uppercase letter",
125     "lowercase letter",
126     "titlecase letter",
127     "modifier letter",
128     "other letter",
129     "non spacing mark",
130     "enclosing mark",
131     "combining spacing mark",
132     "decimal digit number",
133     "letter number",
134     "other number",
135     "space separator",
136     "line separator",
137     "paragraph separator",
138     "control",
139     "format",
140     "private use area",
141     "surrogate",
142     "dash punctuation",
143     "start punctuation",
144     "end punctuation",
145     "connector punctuation",
146     "other punctuation",
147     "math symbol",
148     "currency symbol",
149     "modifier symbol",
150     "other symbol",
151     "initial punctuation",
152     "final punctuation",
153     "noncharacter",
154     "lead surrogate",
155     "trail surrogate"
156 };
157 
158 /* implementation ----------------------------------------------------------- */
159 
unames_cleanup(void)160 static UBool U_CALLCONV unames_cleanup(void)
161 {
162     if(uCharNamesData) {
163         udata_close(uCharNamesData);
164         uCharNamesData = NULL;
165     }
166     if(uCharNames) {
167         uCharNames = NULL;
168     }
169     gMaxNameLength=0;
170     return TRUE;
171 }
172 
173 static UBool U_CALLCONV
isAcceptable(void * context,const char * type,const char * name,const UDataInfo * pInfo)174 isAcceptable(void *context,
175              const char *type, const char *name,
176              const UDataInfo *pInfo) {
177     return (UBool)(
178         pInfo->size>=20 &&
179         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
180         pInfo->charsetFamily==U_CHARSET_FAMILY &&
181         pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
182         pInfo->dataFormat[1]==0x6e &&
183         pInfo->dataFormat[2]==0x61 &&
184         pInfo->dataFormat[3]==0x6d &&
185         pInfo->formatVersion[0]==1);
186 }
187 
188 static UBool
isDataLoaded(UErrorCode * pErrorCode)189 isDataLoaded(UErrorCode *pErrorCode) {
190     /* load UCharNames from file if necessary */
191     UBool isCached;
192 
193     /* do this because double-checked locking is broken */
194     UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
195 
196     if(!isCached) {
197         UCharNames *names;
198         UDataMemory *data;
199 
200         /* check error code from previous attempt */
201         if(U_FAILURE(gLoadErrorCode)) {
202             *pErrorCode=gLoadErrorCode;
203             return FALSE;
204         }
205 
206         /* open the data outside the mutex block */
207         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
208         if(U_FAILURE(*pErrorCode)) {
209             gLoadErrorCode=*pErrorCode;
210             return FALSE;
211         }
212 
213         names=(UCharNames *)udata_getMemory(data);
214 
215         /* in the mutex block, set the data for this process */
216         {
217             umtx_lock(NULL);
218             if(uCharNames==NULL) {
219                 uCharNamesData=data;
220                 uCharNames=names;
221                 data=NULL;
222                 names=NULL;
223                 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
224             }
225             umtx_unlock(NULL);
226         }
227 
228         /* if a different thread set it first, then close the extra data */
229         if(data!=NULL) {
230             udata_close(data); /* NULL if it was set correctly */
231         }
232     }
233     return TRUE;
234 }
235 
/*
 * Append one char to an output buffer while counting the full output length.
 * The char is stored only while bufferLength>0; bufferPos is incremented
 * in any case, so it ends up as the length that the complete output needs.
 * buffer, bufferLength and bufferPos are modified and must be lvalues.
 *
 * Wrapped in do { } while(0) so that the macro behaves like one statement,
 * also as the unbraced body of an if that has an else branch.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/* Internal name choice for ISO comments, numbered after the public choices. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
245 
246 /*
247  * Important: expandName() and compareName() are almost the same -
248  * apply fixes to both.
249  *
250  * UnicodeData.txt uses ';' as a field separator, so no
251  * field can contain ';' as part of its contents.
252  * In unames.dat, it is marked as token[';']==-1 only if the
253  * semicolon is used in the data file - which is iff we
254  * have Unicode 1.0 names or ISO comments or aliases.
255  * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
256  * although we know that it will never be part of a name.
257  */
258 static uint16_t
expandName(UCharNames * names,const uint8_t * name,uint16_t nameLength,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)259 expandName(UCharNames *names,
260            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
261            char *buffer, uint16_t bufferLength) {
262     uint16_t *tokens=(uint16_t *)names+8;
263     uint16_t token, tokenCount=*tokens++, bufferPos=0;
264     uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
265     uint8_t c;
266 
267     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
268         /*
269          * skip the modern name if it is not requested _and_
270          * if the semicolon byte value is a character, not a token number
271          */
272         if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
273             int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
274             do {
275                 while(nameLength>0) {
276                     --nameLength;
277                     if(*name++==';') {
278                         break;
279                     }
280                 }
281             } while(--fieldIndex>0);
282         } else {
283             /*
284              * the semicolon byte value is a token number, therefore
285              * only modern names are stored in unames.dat and there is no
286              * such requested alternate name here
287              */
288             nameLength=0;
289         }
290     }
291 
292     /* write each letter directly, and write a token word per token */
293     while(nameLength>0) {
294         --nameLength;
295         c=*name++;
296 
297         if(c>=tokenCount) {
298             if(c!=';') {
299                 /* implicit letter */
300                 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
301             } else {
302                 /* finished */
303                 break;
304             }
305         } else {
306             token=tokens[c];
307             if(token==(uint16_t)(-2)) {
308                 /* this is a lead byte for a double-byte token */
309                 token=tokens[c<<8|*name++];
310                 --nameLength;
311             }
312             if(token==(uint16_t)(-1)) {
313                 if(c!=';') {
314                     /* explicit letter */
315                     WRITE_CHAR(buffer, bufferLength, bufferPos, c);
316                 } else {
317                     /* stop, but skip the semicolon if we are seeking
318                        extended names and there was no 2.0 name but there
319                        is a 1.0 name. */
320                     if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
321                         if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
322                             continue;
323                         }
324                     }
325                     /* finished */
326                     break;
327                 }
328             } else {
329                 /* write token word */
330                 uint8_t *tokenString=tokenStrings+token;
331                 while((c=*tokenString++)!=0) {
332                     WRITE_CHAR(buffer, bufferLength, bufferPos, c);
333                 }
334             }
335         }
336     }
337 
338     /* zero-terminate */
339     if(bufferLength>0) {
340         *buffer=0;
341     }
342 
343     return bufferPos;
344 }
345 
346 /*
347  * compareName() is almost the same as expandName() except that it compares
348  * the currently expanded name to an input name.
349  * It returns the match/no match result as soon as possible.
350  */
351 static UBool
compareName(UCharNames * names,const uint8_t * name,uint16_t nameLength,UCharNameChoice nameChoice,const char * otherName)352 compareName(UCharNames *names,
353             const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
354             const char *otherName) {
355     uint16_t *tokens=(uint16_t *)names+8;
356     uint16_t token, tokenCount=*tokens++;
357     uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
358     uint8_t c;
359     const char *origOtherName = otherName;
360 
361     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
362         /*
363          * skip the modern name if it is not requested _and_
364          * if the semicolon byte value is a character, not a token number
365          */
366         if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
367             int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
368             do {
369                 while(nameLength>0) {
370                     --nameLength;
371                     if(*name++==';') {
372                         break;
373                     }
374                 }
375             } while(--fieldIndex>0);
376         } else {
377             /*
378              * the semicolon byte value is a token number, therefore
379              * only modern names are stored in unames.dat and there is no
380              * such requested alternate name here
381              */
382             nameLength=0;
383         }
384     }
385 
386     /* compare each letter directly, and compare a token word per token */
387     while(nameLength>0) {
388         --nameLength;
389         c=*name++;
390 
391         if(c>=tokenCount) {
392             if(c!=';') {
393                 /* implicit letter */
394                 if((char)c!=*otherName++) {
395                     return FALSE;
396                 }
397             } else {
398                 /* finished */
399                 break;
400             }
401         } else {
402             token=tokens[c];
403             if(token==(uint16_t)(-2)) {
404                 /* this is a lead byte for a double-byte token */
405                 token=tokens[c<<8|*name++];
406                 --nameLength;
407             }
408             if(token==(uint16_t)(-1)) {
409                 if(c!=';') {
410                     /* explicit letter */
411                     if((char)c!=*otherName++) {
412                         return FALSE;
413                     }
414                 } else {
415                     /* stop, but skip the semicolon if we are seeking
416                        extended names and there was no 2.0 name but there
417                        is a 1.0 name. */
418                     if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
419                         if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
420                             continue;
421                         }
422                     }
423                     /* finished */
424                     break;
425                 }
426             } else {
427                 /* write token word */
428                 uint8_t *tokenString=tokenStrings+token;
429                 while((c=*tokenString++)!=0) {
430                     if((char)c!=*otherName++) {
431                         return FALSE;
432                     }
433                 }
434             }
435         }
436     }
437 
438     /* complete match? */
439     return (UBool)(*otherName==0);
440 }
441 
/*
 * Get the category of cp for extended names: the u_charType() value,
 * except that noncharacters and lead/trail surrogates get their own
 * pseudo-category values.
 */
static uint8_t getCharCat(UChar32 cp) {
    uint8_t category;

    if(UTF_IS_UNICODE_NONCHAR(cp)) {
        return U_NONCHARACTER_CODE_POINT;
    }

    category=u_charType(cp);
    if(category!=U_SURROGATE) {
        return category;
    }

    /* distinguish the two kinds of surrogates */
    if(UTF_IS_LEAD(cp)) {
        return U_LEAD_SURROGATE;
    }
    return U_TRAIL_SURROGATE;
}
455 
/*
 * Get the category word of cp for extended names.
 * Returns "unknown" if the charCatNames table is not up to date,
 * that is, if it has no entry for the category of cp.
 */
static const char *getCharCatName(UChar32 cp) {
    uint8_t category=getCharCat(cp);

    return category<LENGTHOF(charCatNames) ? charCatNames[category] : "unknown";
}
468 
/*
 * Write the extended name "<category-XXXX>" of a code point, where category
 * is the word from getCharCatName() and XXXX is the code in uppercase
 * hexadecimal with at least 4 digits.
 *
 * Every output char goes through WRITE_CHAR, so nothing is ever stored
 * beyond bufferLength chars; a buffer that is too small receives the
 * leading part of the name. The digits are first rendered into a local
 * scratch array because they are produced from the least significant one.
 *
 * The output is not zero-terminated here; callers terminate it themselves.
 *
 * @param code the code point
 * @param buffer output buffer
 * @param bufferLength capacity of buffer
 * @return the length of the complete name, even if it did not fit
 */
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    const char *catname = getCharCatName(code);
    uint16_t length = 0;
    char digits[8];     /* a uint32_t has at most 8 hexadecimal digits */
    uint32_t cp;        /* unsigned so that the shifts below always reach 0 */
    int ndigits, i;

    WRITE_CHAR(buffer, bufferLength, length, '<');
    while (*catname) {
        WRITE_CHAR(buffer, bufferLength, length, *catname);
        ++catname;
    }
    WRITE_CHAR(buffer, bufferLength, length, '-');

    /* count the significant hexadecimal digits, but use at least 4 */
    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) {
    }
    if (ndigits < 4) {
        ndigits = 4;
    }

    /* render the digits from the least significant one backwards */
    for (cp = code, i = ndigits; i > 0; cp >>= 4) {
        uint8_t v = (uint8_t)(cp & 0xf);
        digits[--i] = (char)(v < 10 ? '0' + v : 'A' + v - 10);
    }

    /* copy as many digits as fit, and count all of them */
    for (i = 0; i < ndigits; ++i) {
        WRITE_CHAR(buffer, bufferLength, length, digits[i]);
    }
    WRITE_CHAR(buffer, bufferLength, length, '>');

    return length;
}
495 
496 /*
497  * getGroup() does a binary search for the group that contains the
498  * Unicode code point "code".
499  * The return value is always a valid Group* that may contain "code"
500  * or else is the highest group before "code".
501  * If the lowest group is after "code", then that one is returned.
502  */
503 static const uint16_t *
getGroup(UCharNames * names,uint32_t code)504 getGroup(UCharNames *names, uint32_t code) {
505     const uint16_t *groups=GET_GROUPS(names);
506     uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
507              start=0,
508              limit=*groups++,
509              number;
510 
511     /* binary search for the group of names that contains the one for code */
512     while(start<limit-1) {
513         number=(uint16_t)((start+limit)/2);
514         if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
515             limit=number;
516         } else {
517             start=number;
518         }
519     }
520 
521     /* return this regardless of whether it is an exact match */
522     return groups+start*GROUP_LENGTH;
523 }
524 
525 /*
526  * expandGroupLengths() reads a block of compressed lengths of 32 strings and
527  * expands them into offsets and lengths for each string.
528  * Lengths are stored with a variable-width encoding in consecutive nibbles:
529  * If a nibble<0xc, then it is the length itself (0=empty string).
530  * If a nibble>=0xc, then it forms a length value with the following nibble.
531  * Calculation see below.
532  * The offsets and lengths arrays must be at least 33 (one more) long because
533  * there is no check here at the end if the last nibble is still used.
534  */
535 static const uint8_t *
expandGroupLengths(const uint8_t * s,uint16_t offsets[LINES_PER_GROUP+1],uint16_t lengths[LINES_PER_GROUP+1])536 expandGroupLengths(const uint8_t *s,
537                    uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
538     /* read the lengths of the 32 strings in this group and get each string's offset */
539     uint16_t i=0, offset=0, length=0;
540     uint8_t lengthByte;
541 
542     /* all 32 lengths must be read to get the offset of the first group string */
543     while(i<LINES_PER_GROUP) {
544         lengthByte=*s++;
545 
546         /* read even nibble - MSBs of lengthByte */
547         if(length>=12) {
548             /* double-nibble length spread across two bytes */
549             length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
550             lengthByte&=0xf;
551         } else if((lengthByte /* &0xf0 */)>=0xc0) {
552             /* double-nibble length spread across this one byte */
553             length=(uint16_t)((lengthByte&0x3f)+12);
554         } else {
555             /* single-nibble length in MSBs */
556             length=(uint16_t)(lengthByte>>4);
557             lengthByte&=0xf;
558         }
559 
560         *offsets++=offset;
561         *lengths++=length;
562 
563         offset+=length;
564         ++i;
565 
566         /* read odd nibble - LSBs of lengthByte */
567         if((lengthByte&0xf0)==0) {
568             /* this nibble was not consumed for a double-nibble length above */
569             length=lengthByte;
570             if(length<12) {
571                 /* single-nibble length in LSBs */
572                 *offsets++=offset;
573                 *lengths++=length;
574 
575                 offset+=length;
576                 ++i;
577             }
578         } else {
579             length=0;   /* prevent double-nibble detection in the next iteration */
580         }
581     }
582 
583     /* now, s is at the first group string */
584     return s;
585 }
586 
587 static uint16_t
expandGroupName(UCharNames * names,const uint16_t * group,uint16_t lineNumber,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)588 expandGroupName(UCharNames *names, const uint16_t *group,
589                 uint16_t lineNumber, UCharNameChoice nameChoice,
590                 char *buffer, uint16_t bufferLength) {
591     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
592     const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
593     s=expandGroupLengths(s, offsets, lengths);
594     return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
595                       buffer, bufferLength);
596 }
597 
598 static uint16_t
getName(UCharNames * names,uint32_t code,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)599 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
600         char *buffer, uint16_t bufferLength) {
601     const uint16_t *group=getGroup(names, code);
602     if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
603         return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
604                                buffer, bufferLength);
605     } else {
606         /* group not found */
607         /* zero-terminate */
608         if(bufferLength>0) {
609             *buffer=0;
610         }
611         return 0;
612     }
613 }
614 
615 /*
616  * enumGroupNames() enumerates all the names in a 32-group
617  * and either calls the enumerator function or finds a given input name.
618  */
619 static UBool
enumGroupNames(UCharNames * names,const uint16_t * group,UChar32 start,UChar32 end,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice)620 enumGroupNames(UCharNames *names, const uint16_t *group,
621                UChar32 start, UChar32 end,
622                UEnumCharNamesFn *fn, void *context,
623                UCharNameChoice nameChoice) {
624     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
625     const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
626 
627     s=expandGroupLengths(s, offsets, lengths);
628     if(fn!=DO_FIND_NAME) {
629         char buffer[200];
630         uint16_t length;
631 
632         while(start<=end) {
633             length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
634             if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
635                 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
636             }
637             /* here, we assume that the buffer is large enough */
638             if(length>0) {
639                 if(!fn(context, start, nameChoice, buffer, length)) {
640                     return FALSE;
641                 }
642             }
643             ++start;
644         }
645     } else {
646         const char *otherName=((FindName *)context)->otherName;
647         while(start<=end) {
648             if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
649                 ((FindName *)context)->code=start;
650                 return FALSE;
651             }
652             ++start;
653         }
654     }
655     return TRUE;
656 }
657 
658 /*
659  * enumExtNames enumerate extended names.
660  * It only needs to do it if it is called with a real function and not
661  * with the dummy DO_FIND_NAME, because u_charFromName() does a check
662  * for extended names by itself.
663  */
664 static UBool
enumExtNames(UChar32 start,UChar32 end,UEnumCharNamesFn * fn,void * context)665 enumExtNames(UChar32 start, UChar32 end,
666              UEnumCharNamesFn *fn, void *context)
667 {
668     if(fn!=DO_FIND_NAME) {
669         char buffer[200];
670         uint16_t length;
671 
672         while(start<=end) {
673             buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
674             /* here, we assume that the buffer is large enough */
675             if(length>0) {
676                 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
677                     return FALSE;
678                 }
679             }
680             ++start;
681         }
682     }
683 
684     return TRUE;
685 }
686 
687 static UBool
enumNames(UCharNames * names,UChar32 start,UChar32 limit,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice)688 enumNames(UCharNames *names,
689           UChar32 start, UChar32 limit,
690           UEnumCharNamesFn *fn, void *context,
691           UCharNameChoice nameChoice) {
692     uint16_t startGroupMSB, endGroupMSB, groupCount;
693     const uint16_t *group, *groupLimit;
694 
695     startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
696     endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
697 
698     /* find the group that contains start, or the highest before it */
699     group=getGroup(names, start);
700 
701     if(startGroupMSB==endGroupMSB) {
702         if(startGroupMSB==group[GROUP_MSB]) {
703             /* if start and limit-1 are in the same group, then enumerate only in that one */
704             return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
705         }
706     } else {
707         const uint16_t *groups=GET_GROUPS(names);
708         groupCount=*groups++;
709         groupLimit=groups+groupCount*GROUP_LENGTH;
710 
711         if(startGroupMSB==group[GROUP_MSB]) {
712             /* enumerate characters in the partial start group */
713             if((start&GROUP_MASK)!=0) {
714                 if(!enumGroupNames(names, group,
715                                    start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
716                                    fn, context, nameChoice)) {
717                     return FALSE;
718                 }
719                 group=NEXT_GROUP(group); /* continue with the next group */
720             }
721         } else if(startGroupMSB>group[GROUP_MSB]) {
722             /* make sure that we start enumerating with the first group after start */
723             const uint16_t *nextGroup=NEXT_GROUP(group);
724             if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
725                 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
726                 if (end > limit) {
727                     end = limit;
728                 }
729                 if (!enumExtNames(start, end - 1, fn, context)) {
730                     return FALSE;
731                 }
732             }
733             group=nextGroup;
734         }
735 
736         /* enumerate entire groups between the start- and end-groups */
737         while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
738             const uint16_t *nextGroup;
739             start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
740             if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
741                 return FALSE;
742             }
743             nextGroup=NEXT_GROUP(group);
744             if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
745                 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
746                 if (end > limit) {
747                     end = limit;
748                 }
749                 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
750                     return FALSE;
751                 }
752             }
753             group=nextGroup;
754         }
755 
756         /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
757         if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
758             return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
759         } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
760             UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
761             if (next > start) {
762                 start = next;
763             }
764         } else {
765             return TRUE;
766         }
767     }
768 
769     /* we have not found a group, which means everything is made of
770        extended names. */
771     if (nameChoice == U_EXTENDED_CHAR_NAME) {
772         if (limit > UCHAR_MAX_VALUE + 1) {
773             limit = UCHAR_MAX_VALUE + 1;
774         }
775         return enumExtNames(start, limit - 1, fn, context);
776     }
777 
778     return TRUE;
779 }
780 
/*
 * Write the factorized suffix of an algorithmic name.
 * code is split into one index per factor by modulo arithmetic, and for
 * each factor the element string with that index is appended to buffer.
 * The element strings are NUL-terminated and stored consecutively in s,
 * factors[i] of them for factor i.
 *
 * @param factors the count factors
 * @param count number of factors
 * @param s suffix elements
 * @param code value to be factorized; the caller guarantees that the
 *             quotient left for indexes[0] is in range
 * @param indexes output: the element index for each factor
 * @param elementBases output, may be NULL: for each factor, the start of
 *                     its element strings
 * @param elements output, may be NULL: for each factor, the selected
 *                 element string
 * @return the length of the complete suffix, even if it did not fit into
 *         buffer; the output is zero-terminated only if there is room
 */
static uint16_t
writeFactorSuffix(const uint16_t *factors, uint16_t count,
                  const char *s, /* suffix elements */
                  uint32_t code,
                  uint16_t indexes[8], /* output fields from here */
                  const char *elementBases[8], const char *elements[8],
                  char *buffer, uint16_t bufferLength) {
    uint16_t i, skip, written=0;
    char c;

    /*
     * the factorized elements are determined by modulo arithmetic
     * with the factors of this algorithm
     *
     * note that for fewer operations, count is decremented here
     * and is the index of the last factor from now on
     */
    --count;
    for(i=count; i>0; --i) {
        uint16_t factor=factors[i];
        indexes[i]=(uint16_t)(code%factor);
        code/=factor;
    }
    /*
     * we don't need to calculate the last modulus because start<=code<=end
     * guarantees here that code<=factors[0]
     */
    indexes[0]=(uint16_t)code;

    /* write each element; i is 0 here */
    for(i=0;; ++i) {
        if(elementBases!=NULL) {
            elementBases[i]=s;
        }

        /* skip indexes[i] strings */
        for(skip=indexes[i]; skip>0; --skip) {
            while(*s++!=0) {}
        }
        if(elements!=NULL) {
            elements[i]=s;
        }

        /* write element */
        while((c=*s++)!=0) {
            WRITE_CHAR(buffer, bufferLength, written, c);
        }

        /* the strings after the last element need not be skipped */
        if(i>=count) {
            break;
        }

        /* skip the rest of the strings for this factors[i] */
        for(skip=(uint16_t)(factors[i]-indexes[i]-1); skip>0; --skip) {
            while(*s++!=0) {}
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return written;
}
854 
855 /*
856  * Important:
857  * Parts of findAlgName() are almost the same as some of getAlgName().
858  * Fixes must be applied to both.
859  */
860 static uint16_t
getAlgName(AlgorithmicRange * range,uint32_t code,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)861 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
862         char *buffer, uint16_t bufferLength) {
863     uint16_t bufferPos=0;
864 
865     /* Only the normative character name can be algorithmic. */
866     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
867         /* zero-terminate */
868         if(bufferLength>0) {
869             *buffer=0;
870         }
871         return 0;
872     }
873 
874     switch(range->type) {
875     case 0: {
876         /* name = prefix hex-digits */
877         const char *s=(const char *)(range+1);
878         char c;
879 
880         uint16_t i, count;
881 
882         /* copy prefix */
883         while((c=*s++)!=0) {
884             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
885         }
886 
887         /* write hexadecimal code point value */
888         count=range->variant;
889 
890         /* zero-terminate */
891         if(count<bufferLength) {
892             buffer[count]=0;
893         }
894 
895         for(i=count; i>0;) {
896             if(--i<bufferLength) {
897                 c=(char)(code&0xf);
898                 if(c<10) {
899                     c+='0';
900                 } else {
901                     c+='A'-10;
902                 }
903                 buffer[i]=c;
904             }
905             code>>=4;
906         }
907 
908         bufferPos+=count;
909         break;
910     }
911     case 1: {
912         /* name = prefix factorized-elements */
913         uint16_t indexes[8];
914         const uint16_t *factors=(const uint16_t *)(range+1);
915         uint16_t count=range->variant;
916         const char *s=(const char *)(factors+count);
917         char c;
918 
919         /* copy prefix */
920         while((c=*s++)!=0) {
921             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
922         }
923 
924         bufferPos+=writeFactorSuffix(factors, count,
925                                      s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
926         break;
927     }
928     default:
929         /* undefined type */
930         /* zero-terminate */
931         if(bufferLength>0) {
932             *buffer=0;
933         }
934         break;
935     }
936 
937     return bufferPos;
938 }
939 
940 /*
941  * Important: enumAlgNames() and findAlgName() are almost the same.
942  * Any fix must be applied to both.
943  */
944 static UBool
enumAlgNames(AlgorithmicRange * range,UChar32 start,UChar32 limit,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice)945 enumAlgNames(AlgorithmicRange *range,
946              UChar32 start, UChar32 limit,
947              UEnumCharNamesFn *fn, void *context,
948              UCharNameChoice nameChoice) {
949     char buffer[200];
950     uint16_t length;
951 
952     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
953         return TRUE;
954     }
955 
956     switch(range->type) {
957     case 0: {
958         char *s, *end;
959         char c;
960 
961         /* get the full name of the start character */
962         length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
963         if(length<=0) {
964             return TRUE;
965         }
966 
967         /* call the enumerator function with this first character */
968         if(!fn(context, start, nameChoice, buffer, length)) {
969             return FALSE;
970         }
971 
972         /* go to the end of the name; all these names have the same length */
973         end=buffer;
974         while(*end!=0) {
975             ++end;
976         }
977 
978         /* enumerate the rest of the names */
979         while(++start<limit) {
980             /* increment the hexadecimal number on a character-basis */
981             s=end;
982             for (;;) {
983                 c=*--s;
984                 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
985                     *s=(char)(c+1);
986                     break;
987                 } else if(c=='9') {
988                     *s='A';
989                     break;
990                 } else if(c=='F') {
991                     *s='0';
992                 }
993             }
994 
995             if(!fn(context, start, nameChoice, buffer, length)) {
996                 return FALSE;
997             }
998         }
999         break;
1000     }
1001     case 1: {
1002         uint16_t indexes[8];
1003         const char *elementBases[8], *elements[8];
1004         const uint16_t *factors=(const uint16_t *)(range+1);
1005         uint16_t count=range->variant;
1006         const char *s=(const char *)(factors+count);
1007         char *suffix, *t;
1008         uint16_t prefixLength, i, idx;
1009 
1010         char c;
1011 
1012         /* name = prefix factorized-elements */
1013 
1014         /* copy prefix */
1015         suffix=buffer;
1016         prefixLength=0;
1017         while((c=*s++)!=0) {
1018             *suffix++=c;
1019             ++prefixLength;
1020         }
1021 
1022         /* append the suffix of the start character */
1023         length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1024                                               s, (uint32_t)start-range->start,
1025                                               indexes, elementBases, elements,
1026                                               suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1027 
1028         /* call the enumerator function with this first character */
1029         if(!fn(context, start, nameChoice, buffer, length)) {
1030             return FALSE;
1031         }
1032 
1033         /* enumerate the rest of the names */
1034         while(++start<limit) {
1035             /* increment the indexes in lexical order bound by the factors */
1036             i=count;
1037             for (;;) {
1038                 idx=(uint16_t)(indexes[--i]+1);
1039                 if(idx<factors[i]) {
1040                     /* skip one index and its element string */
1041                     indexes[i]=idx;
1042                     s=elements[i];
1043                     while(*s++!=0) {
1044                     }
1045                     elements[i]=s;
1046                     break;
1047                 } else {
1048                     /* reset this index to 0 and its element string to the first one */
1049                     indexes[i]=0;
1050                     elements[i]=elementBases[i];
1051                 }
1052             }
1053 
1054             /* to make matters a little easier, just append all elements to the suffix */
1055             t=suffix;
1056             length=prefixLength;
1057             for(i=0; i<count; ++i) {
1058                 s=elements[i];
1059                 while((c=*s++)!=0) {
1060                     *t++=c;
1061                     ++length;
1062                 }
1063             }
1064             /* zero-terminate */
1065             *t=0;
1066 
1067             if(!fn(context, start, nameChoice, buffer, length)) {
1068                 return FALSE;
1069             }
1070         }
1071         break;
1072     }
1073     default:
1074         /* undefined type */
1075         break;
1076     }
1077 
1078     return TRUE;
1079 }
1080 
1081 /*
1082  * findAlgName() is almost the same as enumAlgNames() except that it
1083  * returns the code point for a name if it fits into the range.
1084  * It returns 0xffff otherwise.
1085  */
1086 static UChar32
findAlgName(AlgorithmicRange * range,UCharNameChoice nameChoice,const char * otherName)1087 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1088     UChar32 code;
1089 
1090     if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1091         return 0xffff;
1092     }
1093 
1094     switch(range->type) {
1095     case 0: {
1096         /* name = prefix hex-digits */
1097         const char *s=(const char *)(range+1);
1098         char c;
1099 
1100         uint16_t i, count;
1101 
1102         /* compare prefix */
1103         while((c=*s++)!=0) {
1104             if((char)c!=*otherName++) {
1105                 return 0xffff;
1106             }
1107         }
1108 
1109         /* read hexadecimal code point value */
1110         count=range->variant;
1111         code=0;
1112         for(i=0; i<count; ++i) {
1113             c=*otherName++;
1114             if('0'<=c && c<='9') {
1115                 code=(code<<4)|(c-'0');
1116             } else if('A'<=c && c<='F') {
1117                 code=(code<<4)|(c-'A'+10);
1118             } else {
1119                 return 0xffff;
1120             }
1121         }
1122 
1123         /* does it fit into the range? */
1124         if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1125             return code;
1126         }
1127         break;
1128     }
1129     case 1: {
1130         char buffer[64];
1131         uint16_t indexes[8];
1132         const char *elementBases[8], *elements[8];
1133         const uint16_t *factors=(const uint16_t *)(range+1);
1134         uint16_t count=range->variant;
1135         const char *s=(const char *)(factors+count), *t;
1136         UChar32 start, limit;
1137         uint16_t i, idx;
1138 
1139         char c;
1140 
1141         /* name = prefix factorized-elements */
1142 
1143         /* compare prefix */
1144         while((c=*s++)!=0) {
1145             if((char)c!=*otherName++) {
1146                 return 0xffff;
1147             }
1148         }
1149 
1150         start=(UChar32)range->start;
1151         limit=(UChar32)(range->end+1);
1152 
1153         /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1154         writeFactorSuffix(factors, count, s, 0,
1155                           indexes, elementBases, elements, buffer, sizeof(buffer));
1156 
1157         /* compare the first suffix */
1158         if(0==uprv_strcmp(otherName, buffer)) {
1159             return start;
1160         }
1161 
1162         /* enumerate and compare the rest of the suffixes */
1163         while(++start<limit) {
1164             /* increment the indexes in lexical order bound by the factors */
1165             i=count;
1166             for (;;) {
1167                 idx=(uint16_t)(indexes[--i]+1);
1168                 if(idx<factors[i]) {
1169                     /* skip one index and its element string */
1170                     indexes[i]=idx;
1171                     s=elements[i];
1172                     while(*s++!=0) {}
1173                     elements[i]=s;
1174                     break;
1175                 } else {
1176                     /* reset this index to 0 and its element string to the first one */
1177                     indexes[i]=0;
1178                     elements[i]=elementBases[i];
1179                 }
1180             }
1181 
1182             /* to make matters a little easier, just compare all elements of the suffix */
1183             t=otherName;
1184             for(i=0; i<count; ++i) {
1185                 s=elements[i];
1186                 while((c=*s++)!=0) {
1187                     if(c!=*t++) {
1188                         s=""; /* does not match */
1189                         i=99;
1190                     }
1191                 }
1192             }
1193             if(i<99 && *t==0) {
1194                 return start;
1195             }
1196         }
1197         break;
1198     }
1199     default:
1200         /* undefined type */
1201         break;
1202     }
1203 
1204     return 0xffff;
1205 }
1206 
1207 /* sets of name characters, maximum name lengths ---------------------------- */
1208 
/*
 * A character set is a bitmap of 256 bits held in eight uint32_t words:
 * the byte value of c selects word (c>>5) and bit (c&0x1f).
 * c is converted to uint8_t as a whole (the argument is parenthesized so that
 * expressions such as x|1 are not split by the cast) and is evaluated twice,
 * so it must not have side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1211 
/*
 * Adds every character of the zero-terminated string s to set and
 * returns the length of s (not counting the terminator).
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *p;
    char ch;

    for(p=s; (ch=*p)!=0; ++p) {
        SET_ADD(set, ch);
    }
    return (int32_t)(p-s);
}
1223 
1224 static int32_t
calcAlgNameSetsLengths(int32_t maxNameLength)1225 calcAlgNameSetsLengths(int32_t maxNameLength) {
1226     AlgorithmicRange *range;
1227     uint32_t *p;
1228     uint32_t rangeCount;
1229     int32_t length;
1230 
1231     /* enumerate algorithmic ranges */
1232     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1233     rangeCount=*p;
1234     range=(AlgorithmicRange *)(p+1);
1235     while(rangeCount>0) {
1236         switch(range->type) {
1237         case 0:
1238             /* name = prefix + (range->variant times) hex-digits */
1239             /* prefix */
1240             length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1241             if(length>maxNameLength) {
1242                 maxNameLength=length;
1243             }
1244             break;
1245         case 1: {
1246             /* name = prefix factorized-elements */
1247             const uint16_t *factors=(const uint16_t *)(range+1);
1248             const char *s;
1249             int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1250 
1251             /* prefix length */
1252             s=(const char *)(factors+count);
1253             length=calcStringSetLength(gNameSet, s);
1254             s+=length+1; /* start of factor suffixes */
1255 
1256             /* get the set and maximum factor suffix length for each factor */
1257             for(i=0; i<count; ++i) {
1258                 maxFactorLength=0;
1259                 for(factor=factors[i]; factor>0; --factor) {
1260                     factorLength=calcStringSetLength(gNameSet, s);
1261                     s+=factorLength+1;
1262                     if(factorLength>maxFactorLength) {
1263                         maxFactorLength=factorLength;
1264                     }
1265                 }
1266                 length+=maxFactorLength;
1267             }
1268 
1269             if(length>maxNameLength) {
1270                 maxNameLength=length;
1271             }
1272             break;
1273         }
1274         default:
1275             /* unknown type */
1276             break;
1277         }
1278 
1279         range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1280         --rangeCount;
1281     }
1282     return maxNameLength;
1283 }
1284 
1285 static int32_t
calcExtNameSetsLengths(int32_t maxNameLength)1286 calcExtNameSetsLengths(int32_t maxNameLength) {
1287     int32_t i, length;
1288 
1289     for(i=0; i<LENGTHOF(charCatNames); ++i) {
1290         /*
1291          * for each category, count the length of the category name
1292          * plus 9=
1293          * 2 for <>
1294          * 1 for -
1295          * 6 for most hex digits per code point
1296          */
1297         length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1298         if(length>maxNameLength) {
1299             maxNameLength=length;
1300         }
1301     }
1302     return maxNameLength;
1303 }
1304 
/*
 * Scans one ';'-terminated field of a compressed name line, starting at
 * *pLine and stopping at lineLimit at the latest. Adds all characters of the
 * expanded field to set and returns the expanded length.
 * On return, *pLine points behind the consumed bytes (behind the ';' if one
 * was found).
 *
 * tokenLengths may be NULL; otherwise it caches the lengths of token words,
 * indexed by token code, with 0 meaning "not computed yet".
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *line=*pLine;
    int32_t length=0, tokenLength;
    uint16_t c, token;

    for(;;) {
        if(line==lineLimit) {
            break;
        }
        c=*line++;
        if(c==(uint8_t)';') {
            break;
        }

        if(c>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, c);
            ++length;
            continue;
        }

        token=tokens[c];
        if(token==(uint16_t)(-2)) {
            /* this is a lead byte for a double-byte token */
            c=(uint16_t)(c<<8|*line++);
            token=tokens[c];
        }
        if(token==(uint16_t)(-1)) {
            /* explicit letter */
            SET_ADD(set, c);
            ++length;
            continue;
        }

        /* token word */
        if(tokenLengths==NULL) {
            /* no cache available: scan the word every time */
            length+=calcStringSetLength(set, (const char *)tokenStrings+token);
            continue;
        }
        tokenLength=tokenLengths[c];
        if(tokenLength==0) {
            /* first use of this token: scan the word and remember its length */
            tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
            tokenLengths[c]=(int8_t)tokenLength;
        }
        length+=tokenLength;
    }

    *pLine=line;
    return length;
}
1349 
1350 static void
calcGroupNameSetsLengths(int32_t maxNameLength)1351 calcGroupNameSetsLengths(int32_t maxNameLength) {
1352     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1353 
1354     uint16_t *tokens=(uint16_t *)uCharNames+8;
1355     uint16_t tokenCount=*tokens++;
1356     uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1357 
1358     int8_t *tokenLengths;
1359 
1360     const uint16_t *group;
1361     const uint8_t *s, *line, *lineLimit;
1362 
1363     int32_t groupCount, lineNumber, length;
1364 
1365     tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1366     if(tokenLengths!=NULL) {
1367         uprv_memset(tokenLengths, 0, tokenCount);
1368     }
1369 
1370     group=GET_GROUPS(uCharNames);
1371     groupCount=*group++;
1372 
1373     /* enumerate all groups */
1374     while(groupCount>0) {
1375         s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1376         s=expandGroupLengths(s, offsets, lengths);
1377 
1378         /* enumerate all lines in each group */
1379         for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1380             line=s+offsets[lineNumber];
1381             length=lengths[lineNumber];
1382             if(length==0) {
1383                 continue;
1384             }
1385 
1386             lineLimit=line+length;
1387 
1388             /* read regular name */
1389             length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390             if(length>maxNameLength) {
1391                 maxNameLength=length;
1392             }
1393             if(line==lineLimit) {
1394                 continue;
1395             }
1396 
1397             /* read Unicode 1.0 name */
1398             length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1399             if(length>maxNameLength) {
1400                 maxNameLength=length;
1401             }
1402             if(line==lineLimit) {
1403                 continue;
1404             }
1405 
1406             /* read ISO comment */
1407             /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1408         }
1409 
1410         group=NEXT_GROUP(group);
1411         --groupCount;
1412     }
1413 
1414     if(tokenLengths!=NULL) {
1415         uprv_free(tokenLengths);
1416     }
1417 
1418     /* set gMax... - name length last for threading */
1419     gMaxNameLength=maxNameLength;
1420 }
1421 
1422 static UBool
calcNameSetsLengths(UErrorCode * pErrorCode)1423 calcNameSetsLengths(UErrorCode *pErrorCode) {
1424     static const char extChars[]="0123456789ABCDEF<>-";
1425     int32_t i, maxNameLength;
1426 
1427     if(gMaxNameLength!=0) {
1428         return TRUE;
1429     }
1430 
1431     if(!isDataLoaded(pErrorCode)) {
1432         return FALSE;
1433     }
1434 
1435     /* set hex digits, used in various names, and <>-, used in extended names */
1436     for(i=0; i<sizeof(extChars)-1; ++i) {
1437         SET_ADD(gNameSet, extChars[i]);
1438     }
1439 
1440     /* set sets and lengths from algorithmic names */
1441     maxNameLength=calcAlgNameSetsLengths(0);
1442 
1443     /* set sets and lengths from extended names */
1444     maxNameLength=calcExtNameSetsLengths(maxNameLength);
1445 
1446     /* set sets and lengths from group names, set global maximum values */
1447     calcGroupNameSetsLengths(maxNameLength);
1448 
1449     return TRUE;
1450 }
1451 
1452 /* public API --------------------------------------------------------------- */
1453 
1454 U_CAPI int32_t U_EXPORT2
u_charName(UChar32 code,UCharNameChoice nameChoice,char * buffer,int32_t bufferLength,UErrorCode * pErrorCode)1455 u_charName(UChar32 code, UCharNameChoice nameChoice,
1456            char *buffer, int32_t bufferLength,
1457            UErrorCode *pErrorCode) {
1458     AlgorithmicRange *algRange;
1459     uint32_t *p;
1460     uint32_t i;
1461     int32_t length;
1462 
1463     /* check the argument values */
1464     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1465         return 0;
1466     } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1467               bufferLength<0 || (bufferLength>0 && buffer==NULL)
1468     ) {
1469         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1470         return 0;
1471     }
1472 
1473     if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1474         return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1475     }
1476 
1477     length=0;
1478 
1479     /* try algorithmic names first */
1480     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1481     i=*p;
1482     algRange=(AlgorithmicRange *)(p+1);
1483     while(i>0) {
1484         if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1485             length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1486             break;
1487         }
1488         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1489         --i;
1490     }
1491 
1492     if(i==0) {
1493         if (nameChoice == U_EXTENDED_CHAR_NAME) {
1494             length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1495             if (!length) {
1496                 /* extended character name */
1497                 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1498             }
1499         } else {
1500             /* normal character name */
1501             length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1502         }
1503     }
1504 
1505     return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1506 }
1507 
1508 U_CAPI int32_t U_EXPORT2
u_getISOComment(UChar32 c,char * dest,int32_t destCapacity,UErrorCode * pErrorCode)1509 u_getISOComment(UChar32 c,
1510                 char *dest, int32_t destCapacity,
1511                 UErrorCode *pErrorCode) {
1512     int32_t length;
1513 
1514     /* check the argument values */
1515     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1516         return 0;
1517     } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1518         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1519         return 0;
1520     }
1521 
1522     if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1523         return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1524     }
1525 
1526     /* the ISO comment is stored like a normal character name */
1527     length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);
1528     return u_terminateChars(dest, destCapacity, length, pErrorCode);
1529 }
1530 
1531 U_CAPI UChar32 U_EXPORT2
u_charFromName(UCharNameChoice nameChoice,const char * name,UErrorCode * pErrorCode)1532 u_charFromName(UCharNameChoice nameChoice,
1533                const char *name,
1534                UErrorCode *pErrorCode) {
1535     char upper[120], lower[120];
1536     FindName findName;
1537     AlgorithmicRange *algRange;
1538     uint32_t *p;
1539     uint32_t i;
1540     UChar32 cp = 0;
1541     char c0;
1542     UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1543 
1544     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1545         return error;
1546     }
1547 
1548     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1549         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1550         return error;
1551     }
1552 
1553     if(!isDataLoaded(pErrorCode)) {
1554         return error;
1555     }
1556 
1557     /* construct the uppercase and lowercase of the name first */
1558     for(i=0; i<sizeof(upper); ++i) {
1559         if((c0=*name++)!=0) {
1560             upper[i]=uprv_toupper(c0);
1561             lower[i]=uprv_tolower(c0);
1562         } else {
1563             upper[i]=lower[i]=0;
1564             break;
1565         }
1566     }
1567     if(i==sizeof(upper)) {
1568         /* name too long, there is no such character */
1569         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1570         return error;
1571     }
1572 
1573     /* try extended names first */
1574     if (lower[0] == '<') {
1575         if (nameChoice == U_EXTENDED_CHAR_NAME) {
1576             if (lower[--i] == '>') {
1577                 for (--i; lower[i] && lower[i] != '-'; --i) {
1578                 }
1579 
1580                 if (lower[i] == '-') { /* We've got a category. */
1581                     uint32_t cIdx;
1582 
1583                     lower[i] = 0;
1584 
1585                     for (++i; lower[i] != '>'; ++i) {
1586                         if (lower[i] >= '0' && lower[i] <= '9') {
1587                             cp = (cp << 4) + lower[i] - '0';
1588                         } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1589                             cp = (cp << 4) + lower[i] - 'a' + 10;
1590                         } else {
1591                             *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1592                             return error;
1593                         }
1594                     }
1595 
1596                     /* Now validate the category name.
1597                        We could use a binary search, or a trie, if
1598                        we really wanted to. */
1599 
1600                     for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1601 
1602                         if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1603                             if (getCharCat(cp) == cIdx) {
1604                                 return cp;
1605                             }
1606                             break;
1607                         }
1608                     }
1609                 }
1610             }
1611         }
1612 
1613         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1614         return error;
1615     }
1616 
1617     /* try algorithmic names now */
1618     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1619     i=*p;
1620     algRange=(AlgorithmicRange *)(p+1);
1621     while(i>0) {
1622         if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1623             return cp;
1624         }
1625         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1626         --i;
1627     }
1628 
1629     /* normal character name */
1630     findName.otherName=upper;
1631     findName.code=error;
1632     enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1633     if (findName.code == error) {
1634          *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1635     }
1636     return findName.code;
1637 }
1638 
1639 U_CAPI void U_EXPORT2
u_enumCharNames(UChar32 start,UChar32 limit,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice,UErrorCode * pErrorCode)1640 u_enumCharNames(UChar32 start, UChar32 limit,
1641                 UEnumCharNamesFn *fn,
1642                 void *context,
1643                 UCharNameChoice nameChoice,
1644                 UErrorCode *pErrorCode) {
1645     AlgorithmicRange *algRange;
1646     uint32_t *p;
1647     uint32_t i;
1648 
1649     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1650         return;
1651     }
1652 
1653     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1654         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1655         return;
1656     }
1657 
1658     if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1659         limit = UCHAR_MAX_VALUE + 1;
1660     }
1661     if((uint32_t)start>=(uint32_t)limit) {
1662         return;
1663     }
1664 
1665     if(!isDataLoaded(pErrorCode)) {
1666         return;
1667     }
1668 
1669     /* interleave the data-driven ones with the algorithmic ones */
1670     /* iterate over all algorithmic ranges; assume that they are in ascending order */
1671     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1672     i=*p;
1673     algRange=(AlgorithmicRange *)(p+1);
1674     while(i>0) {
1675         /* enumerate the character names before the current algorithmic range */
1676         /* here: start<limit */
1677         if((uint32_t)start<algRange->start) {
1678             if((uint32_t)limit<=algRange->start) {
1679                 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1680                 return;
1681             }
1682             if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1683                 return;
1684             }
1685             start=(UChar32)algRange->start;
1686         }
1687         /* enumerate the character names in the current algorithmic range */
1688         /* here: algRange->start<=start<limit */
1689         if((uint32_t)start<=algRange->end) {
1690             if((uint32_t)limit<=(algRange->end+1)) {
1691                 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1692                 return;
1693             }
1694             if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1695                 return;
1696             }
1697             start=(UChar32)algRange->end+1;
1698         }
1699         /* continue to the next algorithmic range (here: start<limit) */
1700         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1701         --i;
1702     }
1703     /* enumerate the character names after the last algorithmic range */
1704     enumNames(uCharNames, start, limit, fn, context, nameChoice);
1705 }
1706 
1707 U_CAPI int32_t U_EXPORT2
uprv_getMaxCharNameLength()1708 uprv_getMaxCharNameLength() {
1709     UErrorCode errorCode=U_ZERO_ERROR;
1710     if(calcNameSetsLengths(&errorCode)) {
1711         return gMaxNameLength;
1712     } else {
1713         return 0;
1714     }
1715 }
1716 
1717 /**
1718  * Converts the char set cset into a Unicode set uset.
1719  * @param cset Set of 256 bit flags corresponding to a set of chars.
1720  * @param uset USet to receive characters. Existing contents are deleted.
1721  */
1722 static void
charSetToUSet(uint32_t cset[8],const USetAdder * sa)1723 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1724     UChar us[256];
1725     char cs[256];
1726 
1727     int32_t i, length;
1728     UErrorCode errorCode;
1729 
1730     errorCode=U_ZERO_ERROR;
1731 
1732     if(!calcNameSetsLengths(&errorCode)) {
1733         return;
1734     }
1735 
1736     /* build a char string with all chars that are used in character names */
1737     length=0;
1738     for(i=0; i<256; ++i) {
1739         if(SET_CONTAINS(cset, i)) {
1740             cs[length++]=(char)i;
1741         }
1742     }
1743 
1744     /* convert the char string to a UChar string */
1745     u_charsToUChars(cs, us, length);
1746 
1747     /* add each UChar to the USet */
1748     for(i=0; i<length; ++i) {
1749         if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1750             sa->add(sa->set, us[i]);
1751         }
1752     }
1753 }
1754 
1755 /**
1756  * Fills set with characters that are used in Unicode character names.
1757  * @param set USet to receive characters.
1758  */
1759 U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(const USetAdder * sa)1760 uprv_getCharNameCharacters(const USetAdder *sa) {
1761     charSetToUSet(gNameSet, sa);
1762 }
1763 
1764 /* data swapping ------------------------------------------------------------ */
1765 
1766 /*
1767  * The token table contains non-negative entries for token bytes,
1768  * and -1 for bytes that represent themselves in the data file's charset.
1769  * -2 entries are used for lead bytes.
1770  *
1771  * Direct bytes (-1 entries) must be translated from the input charset family
1772  * to the output charset family.
1773  * makeTokenMap() writes a permutation mapping for this.
1774  * Use it once for single-/lead-byte tokens and once more for all trail byte
1775  * tokens. (';' is an unused trail byte marked with -1.)
1776  */
1777 static void
makeTokenMap(const UDataSwapper * ds,int16_t tokens[],uint16_t tokenCount,uint8_t map[256],UErrorCode * pErrorCode)1778 makeTokenMap(const UDataSwapper *ds,
1779              int16_t tokens[], uint16_t tokenCount,
1780              uint8_t map[256],
1781              UErrorCode *pErrorCode) {
1782     UBool usedOutChar[256];
1783     uint16_t i, j;
1784     uint8_t c1, c2;
1785 
1786     if(U_FAILURE(*pErrorCode)) {
1787         return;
1788     }
1789 
1790     if(ds->inCharset==ds->outCharset) {
1791         /* Same charset family: identity permutation */
1792         for(i=0; i<256; ++i) {
1793             map[i]=(uint8_t)i;
1794         }
1795     } else {
1796         uprv_memset(map, 0, 256);
1797         uprv_memset(usedOutChar, 0, 256);
1798 
1799         if(tokenCount>256) {
1800             tokenCount=256;
1801         }
1802 
1803         /* set the direct bytes (byte 0 always maps to itself) */
1804         for(i=1; i<tokenCount; ++i) {
1805             if(tokens[i]==-1) {
1806                 /* convert the direct byte character */
1807                 c1=(uint8_t)i;
1808                 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1809                 if(U_FAILURE(*pErrorCode)) {
1810                     udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1811                                      i, ds->inCharset);
1812                     return;
1813                 }
1814 
1815                 /* enter the converted character into the map and mark it used */
1816                 map[c1]=c2;
1817                 usedOutChar[c2]=TRUE;
1818             }
1819         }
1820 
1821         /* set the mappings for the rest of the permutation */
1822         for(i=j=1; i<tokenCount; ++i) {
1823             /* set mappings that were not set for direct bytes */
1824             if(map[i]==0) {
1825                 /* set an output byte value that was not used as an output byte above */
1826                 while(usedOutChar[j]) {
1827                     ++j;
1828                 }
1829                 map[i]=(uint8_t)j++;
1830             }
1831         }
1832 
1833         /*
1834          * leave mappings at tokenCount and above unset if tokenCount<256
1835          * because they won't be used
1836          */
1837     }
1838 }
1839 
1840 U_CAPI int32_t U_EXPORT2
uchar_swapNames(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1841 uchar_swapNames(const UDataSwapper *ds,
1842                 const void *inData, int32_t length, void *outData,
1843                 UErrorCode *pErrorCode) {
1844     const UDataInfo *pInfo;
1845     int32_t headerSize;
1846 
1847     const uint8_t *inBytes;
1848     uint8_t *outBytes;
1849 
1850     uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1851              offset, i, count, stringsCount;
1852 
1853     const AlgorithmicRange *inRange;
1854     AlgorithmicRange *outRange;
1855 
1856     /* udata_swapDataHeader checks the arguments */
1857     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1858     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1859         return 0;
1860     }
1861 
1862     /* check data format and format version */
1863     pInfo=(const UDataInfo *)((const char *)inData+4);
1864     if(!(
1865         pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
1866         pInfo->dataFormat[1]==0x6e &&
1867         pInfo->dataFormat[2]==0x61 &&
1868         pInfo->dataFormat[3]==0x6d &&
1869         pInfo->formatVersion[0]==1
1870     )) {
1871         udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1872                          pInfo->dataFormat[0], pInfo->dataFormat[1],
1873                          pInfo->dataFormat[2], pInfo->dataFormat[3],
1874                          pInfo->formatVersion[0]);
1875         *pErrorCode=U_UNSUPPORTED_ERROR;
1876         return 0;
1877     }
1878 
1879     inBytes=(const uint8_t *)inData+headerSize;
1880     outBytes=(uint8_t *)outData+headerSize;
1881     if(length<0) {
1882         algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1883     } else {
1884         length-=headerSize;
1885         if( length<20 ||
1886             (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1887         ) {
1888             udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1889                              length);
1890             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1891             return 0;
1892         }
1893     }
1894 
1895     if(length<0) {
1896         /* preflighting: iterate through algorithmic ranges */
1897         offset=algNamesOffset;
1898         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1899         offset+=4;
1900 
1901         for(i=0; i<count; ++i) {
1902             inRange=(const AlgorithmicRange *)(inBytes+offset);
1903             offset+=ds->readUInt16(inRange->size);
1904         }
1905     } else {
1906         /* swap data */
1907         const uint16_t *p;
1908         uint16_t *q, *temp;
1909 
1910         int16_t tokens[512];
1911         uint16_t tokenCount;
1912 
1913         uint8_t map[256], trailMap[256];
1914 
1915         /* copy the data for inaccessible bytes */
1916         if(inBytes!=outBytes) {
1917             uprv_memcpy(outBytes, inBytes, length);
1918         }
1919 
1920         /* the initial 4 offsets first */
1921         tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1922         groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1923         groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1924         ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1925 
1926         /*
1927          * now the tokens table
1928          * it needs to be permutated along with the compressed name strings
1929          */
1930         p=(const uint16_t *)(inBytes+16);
1931         q=(uint16_t *)(outBytes+16);
1932 
1933         /* read and swap the tokenCount */
1934         tokenCount=ds->readUInt16(*p);
1935         ds->swapArray16(ds, p, 2, q, pErrorCode);
1936         ++p;
1937         ++q;
1938 
1939         /* read the first 512 tokens and make the token maps */
1940         if(tokenCount<=512) {
1941             count=tokenCount;
1942         } else {
1943             count=512;
1944         }
1945         for(i=0; i<count; ++i) {
1946             tokens[i]=udata_readInt16(ds, p[i]);
1947         }
1948         for(; i<512; ++i) {
1949             tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1950         }
1951         makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1952         makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1953         if(U_FAILURE(*pErrorCode)) {
1954             return 0;
1955         }
1956 
1957         /*
1958          * swap and permutate the tokens
1959          * go through a temporary array to support in-place swapping
1960          */
1961         temp=(uint16_t *)uprv_malloc(tokenCount*2);
1962         if(temp==NULL) {
1963             udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1964                              tokenCount);
1965             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1966             return 0;
1967         }
1968 
1969         /* swap and permutate single-/lead-byte tokens */
1970         for(i=0; i<tokenCount && i<256; ++i) {
1971             ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1972         }
1973 
1974         /* swap and permutate trail-byte tokens */
1975         for(; i<tokenCount; ++i) {
1976             ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1977         }
1978 
1979         /* copy the result into the output and free the temporary array */
1980         uprv_memcpy(q, temp, tokenCount*2);
1981         uprv_free(temp);
1982 
1983         /*
1984          * swap the token strings but not a possible padding byte after
1985          * the terminating NUL of the last string
1986          */
1987         udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1988                                     outBytes+tokenStringOffset, pErrorCode);
1989         if(U_FAILURE(*pErrorCode)) {
1990             udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1991             return 0;
1992         }
1993 
1994         /* swap the group table */
1995         count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1996         ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1997                            outBytes+groupsOffset, pErrorCode);
1998 
1999         /*
2000          * swap the group strings
2001          * swap the string bytes but not the nibble-encoded string lengths
2002          */
2003         if(ds->inCharset!=ds->outCharset) {
2004             uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
2005 
2006             const uint8_t *inStrings, *nextInStrings;
2007             uint8_t *outStrings;
2008 
2009             uint8_t c;
2010 
2011             inStrings=inBytes+groupStringOffset;
2012             outStrings=outBytes+groupStringOffset;
2013 
2014             stringsCount=algNamesOffset-groupStringOffset;
2015 
2016             /* iterate through string groups until only a few padding bytes are left */
2017             while(stringsCount>32) {
2018                 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2019 
2020                 /* move past the length bytes */
2021                 stringsCount-=(uint32_t)(nextInStrings-inStrings);
2022                 outStrings+=nextInStrings-inStrings;
2023                 inStrings=nextInStrings;
2024 
2025                 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2026                 stringsCount-=count;
2027 
2028                 /* swap the string bytes using map[] and trailMap[] */
2029                 while(count>0) {
2030                     c=*inStrings++;
2031                     *outStrings++=map[c];
2032                     if(tokens[c]!=-2) {
2033                         --count;
2034                     } else {
2035                         /* token lead byte: swap the trail byte, too */
2036                         *outStrings++=trailMap[*inStrings++];
2037                         count-=2;
2038                     }
2039                 }
2040             }
2041         }
2042 
2043         /* swap the algorithmic ranges */
2044         offset=algNamesOffset;
2045         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2046         ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2047         offset+=4;
2048 
2049         for(i=0; i<count; ++i) {
2050             if(offset>(uint32_t)length) {
2051                 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2052                                  length, i);
2053                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2054                 return 0;
2055             }
2056 
2057             inRange=(const AlgorithmicRange *)(inBytes+offset);
2058             outRange=(AlgorithmicRange *)(outBytes+offset);
2059             offset+=ds->readUInt16(inRange->size);
2060 
2061             ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2062             ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2063             switch(inRange->type) {
2064             case 0:
2065                 /* swap prefix string */
2066                 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2067                                     outRange+1, pErrorCode);
2068                 if(U_FAILURE(*pErrorCode)) {
2069                     udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2070                                      i);
2071                     return 0;
2072                 }
2073                 break;
2074             case 1:
2075                 {
2076                     /* swap factors and the prefix and factor strings */
2077                     uint32_t factorsCount;
2078 
2079                     factorsCount=inRange->variant;
2080                     p=(const uint16_t *)(inRange+1);
2081                     q=(uint16_t *)(outRange+1);
2082                     ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2083 
2084                     /* swap the strings, up to the last terminating NUL */
2085                     p+=factorsCount;
2086                     q+=factorsCount;
2087                     stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2088                     while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2089                         --stringsCount;
2090                     }
2091                     ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2092                 }
2093                 break;
2094             default:
2095                 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2096                                  inRange->type, i);
2097                 *pErrorCode=U_UNSUPPORTED_ERROR;
2098                 return 0;
2099             }
2100         }
2101     }
2102 
2103     return headerSize+(int32_t)offset;
2104 }
2105 
2106 /*
2107  * Hey, Emacs, please set the following:
2108  *
2109  * Local Variables:
2110  * indent-tabs-mode: nil
2111  * End:
2112  *
2113  */
2114