1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 1999-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: unames.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
21 #include "ustr_imp.h"
22 #include "umutex.h"
23 #include "cmemory.h"
24 #include "cstring.h"
25 #include "ucln_cmn.h"
26 #include "udataswp.h"
27 #include "uprops.h"
28
29 /* prototypes ------------------------------------------------------------- */
30
/* Number of elements of a true array (not meaningful for pointers or array parameters). */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* Item name and type passed to udata_openChoice() when the names data is loaded. */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of 32 consecutive code points:
 * code>>GROUP_SHIFT identifies the group, code&GROUP_MASK the line inside it.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
39
40 /*
41 * This struct was replaced by explicitly accessing equivalent
42 * fields from triples of uint16_t.
43 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
44 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
45 * would advance by 6 bytes (3 uint16_t).
46 *
47 * We can't just change the data structure because it's loaded from a data file,
48 * and we don't want to make it less compact, so we changed the access code.
49 *
50 * For details see ICU tickets 6331 and 6008.
51 typedef struct {
52 uint16_t groupMSB,
53 offsetHigh, offsetLow; / * avoid padding * /
54 } Group;
55 */
/* Indexes of the uint16_t fields inside one group triple. */
enum {
    GROUP_MSB,          /* code point >> GROUP_SHIFT of the code points in this group */
    GROUP_OFFSET_HIGH,  /* upper 16 bits of the group's offset, see GET_GROUP_OFFSET() */
    GROUP_OFFSET_LOW,   /* lower 16 bits of the group's offset, see GET_GROUP_OFFSET() */
    GROUP_LENGTH        /* number of uint16_t per group */
};
62
63 /*
64 * Get the 32-bit group offset.
65 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
66 * @return group offset (int32_t)
67 */
68 #define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
69
/*
 * Step a (const uint16_t *) group pointer forward or backward by one
 * group, that is by GROUP_LENGTH uint16_t units. No bounds are checked.
 */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
72
/*
 * Header of one range of algorithmically named code points.
 * The data that follows the struct in memory (range+1) depends on type:
 *   type 0: a zero-terminated name prefix; the name is the prefix followed
 *           by "variant" uppercase hexadecimal digits of the code point
 *   type 1: "variant" uint16_t factors, then a zero-terminated prefix, then
 *           the zero-terminated element strings for each factor
 */
typedef struct {
    uint32_t start, end;    /* first and last code point of the range (end is inclusive) */
    uint8_t type, variant;  /* algorithm type and its parameter, see above */
    uint16_t size;          /* not referenced in the visible code; presumably the size of this range record - confirm */
} AlgorithmicRange;

/*
 * Header at the start of the loaded names data.
 * The offsets are byte offsets from the start of this struct.
 * The token table (a uint16_t count followed by the tokens) is read from
 * directly behind these four fields.
 */
typedef struct {
    /*
     * tokenStringOffset: start of the token strings
     * groupsOffset:      start of the groups table, see GET_GROUPS()
     * groupStringOffset: base to which each group's 32-bit offset is added
     * algNamesOffset:    not referenced in the visible code; presumably the
     *                    start of the AlgorithmicRange data - confirm
     */
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;
82
83 /*
84 * Get the groups table from a UCharNames struct.
85 * The groups table consists of one uint16_t groupCount followed by
86 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
87 * and the comment for the old struct Group above.
88 *
89 * @param names (const UCharNames *) pointer to the UCharNames indexes
90 * @return (const uint16_t *) pointer to the groups table
91 */
/*
 * The argument and the whole expansion are parenthesized so that any
 * pointer expression can be passed and the result can be used as an operand.
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
93
/*
 * Context for looking up a code point by name: enumGroupNames() compares
 * each candidate name with otherName and stores the matching code point
 * in code.
 */
typedef struct {
    const char *otherName;
    UChar32 code;
} FindName;

/*
 * Special value for the enumeration callback parameter: selects
 * "find the name given in a FindName context" instead of calling a callback.
 */
#define DO_FIND_NAME NULL

/* Loaded names data and the UCharNames header inside it; both are set together in isDataLoaded(). */
static UDataMemory *uCharNamesData=NULL;
static UCharNames *uCharNames=NULL;
/* Error from a failed load attempt; isDataLoaded() reports it again instead of retrying. */
static UErrorCode gLoadErrorCode=U_ZERO_ERROR;
104
105 /*
106 * Maximum length of character names (regular & 1.0).
107 */
108 static int32_t gMaxNameLength=0;
109
110 /*
111 * Set of chars used in character names (regular & 1.0).
112 * Chars are platform-dependent (can be EBCDIC).
113 */
114 static uint32_t gNameSet[8]={ 0 };
115
/*
 * Additional category values used only for extended names; they continue
 * the numbering after U_CHAR_CATEGORY_COUNT.
 * Each replacement list is parenthesized so that the macros can be used
 * safely inside larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
121
/*
 * Category names used in extended names, indexed by the value that
 * getCharCat() returns. The last three entries belong to
 * U_NONCHARACTER_CODE_POINT, U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
 * NOTE(review): the entries before them are presumably in UCharCategory
 * order - confirm against uchar.h when categories are added.
 */
static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
    "unassigned",
    "uppercase letter",
    "lowercase letter",
    "titlecase letter",
    "modifier letter",
    "other letter",
    "non spacing mark",
    "enclosing mark",
    "combining spacing mark",
    "decimal digit number",
    "letter number",
    "other number",
    "space separator",
    "line separator",
    "paragraph separator",
    "control",
    "format",
    "private use area",
    "surrogate",
    "dash punctuation",
    "start punctuation",
    "end punctuation",
    "connector punctuation",
    "other punctuation",
    "math symbol",
    "currency symbol",
    "modifier symbol",
    "other symbol",
    "initial punctuation",
    "final punctuation",
    "noncharacter",
    "lead surrogate",
    "trail surrogate"
};
157
158 /* implementation ----------------------------------------------------------- */
159
unames_cleanup(void)160 static UBool U_CALLCONV unames_cleanup(void)
161 {
162 if(uCharNamesData) {
163 udata_close(uCharNamesData);
164 uCharNamesData = NULL;
165 }
166 if(uCharNames) {
167 uCharNames = NULL;
168 }
169 gMaxNameLength=0;
170 return TRUE;
171 }
172
173 static UBool U_CALLCONV
isAcceptable(void * context,const char * type,const char * name,const UDataInfo * pInfo)174 isAcceptable(void *context,
175 const char *type, const char *name,
176 const UDataInfo *pInfo) {
177 return (UBool)(
178 pInfo->size>=20 &&
179 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
180 pInfo->charsetFamily==U_CHARSET_FAMILY &&
181 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
182 pInfo->dataFormat[1]==0x6e &&
183 pInfo->dataFormat[2]==0x61 &&
184 pInfo->dataFormat[3]==0x6d &&
185 pInfo->formatVersion[0]==1);
186 }
187
188 static UBool
isDataLoaded(UErrorCode * pErrorCode)189 isDataLoaded(UErrorCode *pErrorCode) {
190 /* load UCharNames from file if necessary */
191 UBool isCached;
192
193 /* do this because double-checked locking is broken */
194 UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
195
196 if(!isCached) {
197 UCharNames *names;
198 UDataMemory *data;
199
200 /* check error code from previous attempt */
201 if(U_FAILURE(gLoadErrorCode)) {
202 *pErrorCode=gLoadErrorCode;
203 return FALSE;
204 }
205
206 /* open the data outside the mutex block */
207 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
208 if(U_FAILURE(*pErrorCode)) {
209 gLoadErrorCode=*pErrorCode;
210 return FALSE;
211 }
212
213 names=(UCharNames *)udata_getMemory(data);
214
215 /* in the mutex block, set the data for this process */
216 {
217 umtx_lock(NULL);
218 if(uCharNames==NULL) {
219 uCharNamesData=data;
220 uCharNames=names;
221 data=NULL;
222 names=NULL;
223 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
224 }
225 umtx_unlock(NULL);
226 }
227
228 /* if a different thread set it first, then close the extra data */
229 if(data!=NULL) {
230 udata_close(data); /* NULL if it was set correctly */
231 }
232 }
233 return TRUE;
234 }
235
/*
 * Append one char to an output buffer while counting the full output length.
 * c is stored, and buffer/bufferLength are advanced, only while
 * bufferLength>0; bufferPos is incremented in any case, so that it ends up
 * as the length that the complete output needs.
 * buffer, bufferLength and bufferPos must be modifiable lvalues; they are
 * evaluated more than once.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
    if((bufferLength)>0) { \
        *(buffer)++=c; \
        --(bufferLength); \
    } \
    ++(bufferPos); \
}

/*
 * Internal name choice, defined as U_CHAR_NAME_CHOICE_COUNT.
 * expandName() and compareName() map it to field index 2 of a name record.
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
245
246 /*
247 * Important: expandName() and compareName() are almost the same -
248 * apply fixes to both.
249 *
250 * UnicodeData.txt uses ';' as a field separator, so no
251 * field can contain ';' as part of its contents.
252 * In unames.dat, it is marked as token[';']==-1 only if the
253 * semicolon is used in the data file - which is iff we
254 * have Unicode 1.0 names or ISO comments or aliases.
255 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
256 * although we know that it will never be part of a name.
257 */
/*
 * Expand one compressed name record into a char string.
 *
 * @param names        names data; supplies the token table and the token strings
 * @param name         bytes of the record; fields are separated by ';'
 * @param nameLength   number of bytes in name
 * @param nameChoice   selects the field: for U_UNICODE_CHAR_NAME and
 *                     U_EXTENDED_CHAR_NAME the first one, otherwise leading
 *                     fields are skipped (2 for U_ISO_COMMENT, else nameChoice)
 * @param buffer       output buffer; at most bufferLength chars are written,
 *                     and a terminating 0 only if there is room left
 * @param bufferLength capacity of buffer
 * @return the length of the complete expanded name, which can be larger
 *         than bufferLength
 */
static uint16_t
expandName(UCharNames *names,
           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
           char *buffer, uint16_t bufferLength) {
    /* the token table follows the 16-byte UCharNames header: a count, then the tokens */
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++, bufferPos=0;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
            /* skip fieldIndex fields, each one up to and including its ';' */
            do {
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            } while(--fieldIndex>0);
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested alternate name here
             */
            nameLength=0;
        }
    }

    /* write each letter directly, and write a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            /* byte values beyond the token table always stand for themselves */
            if(c!=';') {
                /* implicit letter */
                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                /* token value -1 marks a byte that is a letter, not a token */
                if(c!=';') {
                    /* explicit letter */
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* write token word: token is the offset of a zero-terminated string */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                }
            }
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}
345
346 /*
347 * compareName() is almost the same as expandName() except that it compares
348 * the currently expanded name to an input name.
349 * It returns the match/no match result as soon as possible.
350 */
/*
 * @param names      names data; supplies the token table and the token strings
 * @param name       bytes of the compressed name record; fields are separated by ';'
 * @param nameLength number of bytes in name
 * @param nameChoice selects the field to compare, as in expandName()
 * @param otherName  zero-terminated name to compare with
 * @return TRUE if the expanded field is equal to otherName, FALSE at the
 *         first difference
 */
static UBool
compareName(UCharNames *names,
            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
            const char *otherName) {
    /* the token table follows the 16-byte UCharNames header: a count, then the tokens */
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;
    /* remembered in order to detect that nothing has been matched yet */
    const char *origOtherName = otherName;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
            /* skip fieldIndex fields, each one up to and including its ';' */
            do {
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            } while(--fieldIndex>0);
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested alternate name here
             */
            nameLength=0;
        }
    }

    /* compare each letter directly, and compare a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            /* byte values beyond the token table always stand for themselves */
            if(c!=';') {
                /* implicit letter */
                if((char)c!=*otherName++) {
                    return FALSE;
                }
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                /* token value -1 marks a byte that is a letter, not a token */
                if(c!=';') {
                    /* explicit letter */
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* compare token word: token is the offset of a zero-terminated string */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                }
            }
        }
    }

    /* complete match? */
    return (UBool)(*otherName==0);
}
441
/*
 * Return the category used for extended names: the u_charType() value,
 * except that noncharacters map to U_NONCHARACTER_CODE_POINT and
 * surrogates are split into U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
 */
static uint8_t getCharCat(UChar32 cp) {
    uint8_t category;

    /* noncharacters are recognized before the regular category is looked up */
    if (UTF_IS_UNICODE_NONCHAR(cp)) {
        return U_NONCHARACTER_CODE_POINT;
    }

    category = u_charType(cp);
    if (category != U_SURROGATE) {
        return category;
    }

    if (UTF_IS_LEAD(cp)) {
        return U_LEAD_SURROGATE;
    }
    return U_TRAIL_SURROGATE;
}
455
/*
 * Return the category name used in the extended name of cp.
 */
static const char *getCharCatName(UChar32 cp) {
    uint8_t index = getCharCat(cp);

    /* "unknown" is returned if the table of names is not up to date */
    return index < LENGTHOF(charCatNames) ? charCatNames[index] : "unknown";
}
468
/*
 * Write the extended name "<category-HHHH>" of a code point.
 *
 * @param code         code point; its value is written with at least 4
 *                     uppercase hexadecimal digits
 * @param buffer       output buffer; only the chars that fit are written,
 *                     and no terminating 0 is written here
 * @param bufferLength capacity of buffer
 * @return the length of the complete extended name, which can be larger
 *         than bufferLength
 */
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    const char *catname = getCharCatName(code);
    uint16_t length = 0;

    uint32_t cp;
    int ndigits, i;

    WRITE_CHAR(buffer, bufferLength, length, '<');
    /* length is 1 after the '<', so length-1 walks through catname */
    while (catname[length - 1]) {
        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
    }
    WRITE_CHAR(buffer, bufferLength, length, '-');

    /* count the hexadecimal digits, with a minimum of 4 */
    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
        ;
    if (ndigits < 4)
        ndigits = 4;

    /*
     * The digits are produced from the least significant one, that is
     * from the last position backwards. Store a digit only if its
     * position lies inside the remaining buffer, so that a short buffer
     * receives the leading digits and nothing is written beyond its end.
     */
    for (cp = code, i = ndigits; i > 0; cp >>= 4) {
        uint8_t v = (uint8_t)(cp & 0xf);
        --i;
        if (i < (int)bufferLength) {
            buffer[i] = (char)(v < 10 ? '0' + v : 'A' + v - 10);
        }
    }

    /* advance by the number of digits that were actually stored */
    if (ndigits <= (int)bufferLength) {
        buffer += ndigits;
        bufferLength = (uint16_t)(bufferLength - ndigits);
    } else {
        buffer += bufferLength;
        bufferLength = 0;
    }
    length = (uint16_t)(length + ndigits);
    WRITE_CHAR(buffer, bufferLength, length, '>');

    return length;
}
495
496 /*
497 * getGroup() does a binary search for the group that contains the
498 * Unicode code point "code".
499 * The return value is always a valid Group* that may contain "code"
500 * or else is the highest group before "code".
501 * If the lowest group is after "code", then that one is returned.
502 */
503 static const uint16_t *
getGroup(UCharNames * names,uint32_t code)504 getGroup(UCharNames *names, uint32_t code) {
505 const uint16_t *groups=GET_GROUPS(names);
506 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
507 start=0,
508 limit=*groups++,
509 number;
510
511 /* binary search for the group of names that contains the one for code */
512 while(start<limit-1) {
513 number=(uint16_t)((start+limit)/2);
514 if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
515 limit=number;
516 } else {
517 start=number;
518 }
519 }
520
521 /* return this regardless of whether it is an exact match */
522 return groups+start*GROUP_LENGTH;
523 }
524
/*
 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
 * expands them into offsets and lengths for each string.
 * Lengths are stored with a variable-width encoding in consecutive nibbles:
 * If a nibble<0xc, then it is the length itself (0=empty string).
 * If a nibble>=0xc, then it forms a length value with the following nibble.
 * Calculation see below.
 * The offsets and lengths arrays must be at least 33 (one more) long because
 * there is no check here at the end if the last nibble is still used.
 *
 * @param s       start of the compressed lengths of one group
 * @param offsets output: offset of each string, relative to the returned pointer
 * @param lengths output: length of each string
 * @return pointer to the first group string, directly behind the lengths
 */
static const uint8_t *
expandGroupLengths(const uint8_t *s,
                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
    /* read the lengths of the 32 strings in this group and get each string's offset */
    /* length>=12 at the top of the loop means: the previous byte ended with a lead nibble */
    uint16_t i=0, offset=0, length=0;
    uint8_t lengthByte;

    /* all 32 lengths must be read to get the offset of the first group string */
    while(i<LINES_PER_GROUP) {
        lengthByte=*s++;

        /* read even nibble - MSBs of lengthByte */
        if(length>=12) {
            /* double-nibble length spread across two bytes */
            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
            lengthByte&=0xf;
        } else if((lengthByte /* &0xf0 */)>=0xc0) {
            /* double-nibble length spread across this one byte */
            /* lengthByte keeps its upper bits, which marks the low nibble as consumed below */
            length=(uint16_t)((lengthByte&0x3f)+12);
        } else {
            /* single-nibble length in MSBs */
            length=(uint16_t)(lengthByte>>4);
            lengthByte&=0xf;
        }

        *offsets++=offset;
        *lengths++=length;

        offset+=length;
        ++i;

        /* read odd nibble - LSBs of lengthByte */
        if((lengthByte&0xf0)==0) {
            /* this nibble was not consumed for a double-nibble length above */
            length=lengthByte;
            if(length<12) {
                /* single-nibble length in LSBs */
                *offsets++=offset;
                *lengths++=length;

                offset+=length;
                ++i;
            }
            /* else: a lead nibble, kept in length for the next iteration */
        } else {
            length=0;   /* prevent double-nibble detection in the next iteration */
        }
    }

    /* now, s is at the first group string */
    return s;
}
586
587 static uint16_t
expandGroupName(UCharNames * names,const uint16_t * group,uint16_t lineNumber,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)588 expandGroupName(UCharNames *names, const uint16_t *group,
589 uint16_t lineNumber, UCharNameChoice nameChoice,
590 char *buffer, uint16_t bufferLength) {
591 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
592 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
593 s=expandGroupLengths(s, offsets, lengths);
594 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
595 buffer, bufferLength);
596 }
597
598 static uint16_t
getName(UCharNames * names,uint32_t code,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)599 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
600 char *buffer, uint16_t bufferLength) {
601 const uint16_t *group=getGroup(names, code);
602 if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
603 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
604 buffer, bufferLength);
605 } else {
606 /* group not found */
607 /* zero-terminate */
608 if(bufferLength>0) {
609 *buffer=0;
610 }
611 return 0;
612 }
613 }
614
615 /*
616 * enumGroupNames() enumerates all the names in a 32-group
617 * and either calls the enumerator function or finds a given input name.
618 */
619 static UBool
enumGroupNames(UCharNames * names,const uint16_t * group,UChar32 start,UChar32 end,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice)620 enumGroupNames(UCharNames *names, const uint16_t *group,
621 UChar32 start, UChar32 end,
622 UEnumCharNamesFn *fn, void *context,
623 UCharNameChoice nameChoice) {
624 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
625 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
626
627 s=expandGroupLengths(s, offsets, lengths);
628 if(fn!=DO_FIND_NAME) {
629 char buffer[200];
630 uint16_t length;
631
632 while(start<=end) {
633 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
634 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
635 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
636 }
637 /* here, we assume that the buffer is large enough */
638 if(length>0) {
639 if(!fn(context, start, nameChoice, buffer, length)) {
640 return FALSE;
641 }
642 }
643 ++start;
644 }
645 } else {
646 const char *otherName=((FindName *)context)->otherName;
647 while(start<=end) {
648 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
649 ((FindName *)context)->code=start;
650 return FALSE;
651 }
652 ++start;
653 }
654 }
655 return TRUE;
656 }
657
658 /*
659 * enumExtNames enumerate extended names.
660 * It only needs to do it if it is called with a real function and not
661 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
662 * for extended names by itself.
663 */
664 static UBool
enumExtNames(UChar32 start,UChar32 end,UEnumCharNamesFn * fn,void * context)665 enumExtNames(UChar32 start, UChar32 end,
666 UEnumCharNamesFn *fn, void *context)
667 {
668 if(fn!=DO_FIND_NAME) {
669 char buffer[200];
670 uint16_t length;
671
672 while(start<=end) {
673 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
674 /* here, we assume that the buffer is large enough */
675 if(length>0) {
676 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
677 return FALSE;
678 }
679 }
680 ++start;
681 }
682 }
683
684 return TRUE;
685 }
686
/*
 * Enumerate the names of the code points start..limit-1 using the groups
 * table: enumGroupNames() is called for each group that is touched and,
 * for U_EXTENDED_CHAR_NAME, enumExtNames() for code points that have no
 * group.
 *
 * @param fn      callback, or DO_FIND_NAME with a FindName in context
 * @return FALSE as soon as a called function returns FALSE, otherwise TRUE
 */
static UBool
enumNames(UCharNames *names,
          UChar32 start, UChar32 limit,
          UEnumCharNamesFn *fn, void *context,
          UCharNameChoice nameChoice) {
    uint16_t startGroupMSB, endGroupMSB, groupCount;
    const uint16_t *group, *groupLimit;

    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);

    /* find the group that contains start, or the highest before it */
    group=getGroup(names, start);

    if(startGroupMSB==endGroupMSB) {
        if(startGroupMSB==group[GROUP_MSB]) {
            /* if start and limit-1 are in the same group, then enumerate only in that one */
            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
        }
        /* otherwise there is no group for this range: fall through to the extended names below */
    } else {
        const uint16_t *groups=GET_GROUPS(names);
        groupCount=*groups++;
        groupLimit=groups+groupCount*GROUP_LENGTH;

        if(startGroupMSB==group[GROUP_MSB]) {
            /* enumerate characters in the partial start group */
            if((start&GROUP_MASK)!=0) {
                if(!enumGroupNames(names, group,
                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
                                   fn, context, nameChoice)) {
                    return FALSE;
                }
                group=NEXT_GROUP(group); /* continue with the next group */
            }
        } else if(startGroupMSB>group[GROUP_MSB]) {
            /* make sure that we start enumerating with the first group after start */
            const uint16_t *nextGroup=NEXT_GROUP(group);
            /* extended names for the code points from start up to the next group (or limit) */
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames(start, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate entire groups between the start- and end-groups */
        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
            const uint16_t *nextGroup;
            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
                return FALSE;
            }
            nextGroup=NEXT_GROUP(group);
            /* extended names for a gap between this group and the next one */
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
            /* all groups are used up: continue behind the last group with extended names below */
            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
            if (next > start) {
                start = next;
            }
        } else {
            return TRUE;
        }
    }

    /* we have not found a group, which means everything is made of
       extended names. */
    if (nameChoice == U_EXTENDED_CHAR_NAME) {
        if (limit > UCHAR_MAX_VALUE + 1) {
            limit = UCHAR_MAX_VALUE + 1;
        }
        return enumExtNames(start, limit - 1, fn, context);
    }

    return TRUE;
}
780
/*
 * Write the suffix of a factorized algorithmic name.
 * code is split into one index per factor (the last factor varies
 * fastest), and for each factor the element string selected by its index
 * is appended to the buffer.
 *
 * @param factors      the factors of the algorithm
 * @param count        number of factors
 * @param s            the element strings: for each factor in order,
 *                     factors[i] consecutive zero-terminated strings
 * @param code         value to factorize (the visible callers pass a
 *                     code point's offset from the range start, or 0)
 * @param indexes      output: the index computed for each factor
 * @param elementBases output, may be NULL: for each factor, the pointer to
 *                     its first element string
 * @param elements     output, may be NULL: for each factor, the pointer to
 *                     its selected element string
 * @param buffer       output buffer; at most bufferLength chars are written,
 *                     and a terminating 0 only if there is room left
 * @param bufferLength capacity of buffer
 * @return the length of the complete suffix, which can be larger than
 *         bufferLength
 */
static uint16_t
writeFactorSuffix(const uint16_t *factors, uint16_t count,
                  const char *s, /* suffix elements */
                  uint32_t code,
                  uint16_t indexes[8], /* output fields from here */
                  const char *elementBases[8], const char *elements[8],
                  char *buffer, uint16_t bufferLength) {
    uint16_t i, factor, bufferPos=0;
    char c;

    /* write elements according to the factors */

    /*
     * the factorized elements are determined by modulo arithmetic
     * with the factors of this algorithm
     *
     * note that for fewer operations, count is decremented here
     */
    --count;
    for(i=count; i>0; --i) {
        factor=factors[i];
        indexes[i]=(uint16_t)(code%factor);
        code/=factor;
    }
    /*
     * we don't need to calculate the last modulus because start<=code<=end
     * guarantees here that code<=factors[0]
     */
    indexes[0]=(uint16_t)code;

    /* write each element */
    /* i is 0 here after the loop above; count is the index of the last factor */
    for(;;) {
        if(elementBases!=NULL) {
            *elementBases++=s;
        }

        /* skip indexes[i] strings */
        factor=indexes[i];
        while(factor>0) {
            while(*s++!=0) {}
            --factor;
        }
        if(elements!=NULL) {
            *elements++=s;
        }

        /* write element */
        while((c=*s++)!=0) {
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
        }

        /* we do not need to perform the rest of this loop for i==count - break here */
        if(i>=count) {
            break;
        }

        /* skip the rest of the strings for this factors[i] */
        factor=(uint16_t)(factors[i]-indexes[i]-1);
        while(factor>0) {
            while(*s++!=0) {}
            --factor;
        }

        ++i;
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}
854
855 /*
856 * Important:
857 * Parts of findAlgName() are almost the same as some of getAlgName().
858 * Fixes must be applied to both.
859 */
860 static uint16_t
getAlgName(AlgorithmicRange * range,uint32_t code,UCharNameChoice nameChoice,char * buffer,uint16_t bufferLength)861 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
862 char *buffer, uint16_t bufferLength) {
863 uint16_t bufferPos=0;
864
865 /* Only the normative character name can be algorithmic. */
866 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
867 /* zero-terminate */
868 if(bufferLength>0) {
869 *buffer=0;
870 }
871 return 0;
872 }
873
874 switch(range->type) {
875 case 0: {
876 /* name = prefix hex-digits */
877 const char *s=(const char *)(range+1);
878 char c;
879
880 uint16_t i, count;
881
882 /* copy prefix */
883 while((c=*s++)!=0) {
884 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
885 }
886
887 /* write hexadecimal code point value */
888 count=range->variant;
889
890 /* zero-terminate */
891 if(count<bufferLength) {
892 buffer[count]=0;
893 }
894
895 for(i=count; i>0;) {
896 if(--i<bufferLength) {
897 c=(char)(code&0xf);
898 if(c<10) {
899 c+='0';
900 } else {
901 c+='A'-10;
902 }
903 buffer[i]=c;
904 }
905 code>>=4;
906 }
907
908 bufferPos+=count;
909 break;
910 }
911 case 1: {
912 /* name = prefix factorized-elements */
913 uint16_t indexes[8];
914 const uint16_t *factors=(const uint16_t *)(range+1);
915 uint16_t count=range->variant;
916 const char *s=(const char *)(factors+count);
917 char c;
918
919 /* copy prefix */
920 while((c=*s++)!=0) {
921 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
922 }
923
924 bufferPos+=writeFactorSuffix(factors, count,
925 s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
926 break;
927 }
928 default:
929 /* undefined type */
930 /* zero-terminate */
931 if(bufferLength>0) {
932 *buffer=0;
933 }
934 break;
935 }
936
937 return bufferPos;
938 }
939
940 /*
941 * Important: enumAlgNames() and findAlgName() are almost the same.
942 * Any fix must be applied to both.
943 */
/*
 * Enumerate the algorithmic names of the code points start..limit-1 of
 * one range by calling fn for each of them.
 * The name of start is generated in full; each following name is derived
 * from the previous one in a local buffer.
 *
 * @return FALSE if fn returned FALSE, otherwise TRUE (also for name
 *         choices without algorithmic names and for undefined range types)
 */
static UBool
enumAlgNames(AlgorithmicRange *range,
             UChar32 start, UChar32 limit,
             UEnumCharNamesFn *fn, void *context,
             UCharNameChoice nameChoice) {
    char buffer[200];
    uint16_t length;

    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
        return TRUE;
    }

    switch(range->type) {
    case 0: {
        char *s, *end;
        char c;

        /* get the full name of the start character */
        length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
        if(length<=0) {
            return TRUE;
        }

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* go to the end of the name; all these names have the same length */
        end=buffer;
        while(*end!=0) {
            ++end;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /* increment the hexadecimal number on a character-basis */
            s=end;
            for (;;) {
                c=*--s;
                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
                    *s=(char)(c+1);
                    break;
                } else if(c=='9') {
                    *s='A';
                    break;
                } else if(c=='F') {
                    /* wrap this digit around and carry into the digit before it */
                    *s='0';
                }
            }

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    case 1: {
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t count=range->variant;
        const char *s=(const char *)(factors+count);
        char *suffix, *t;
        uint16_t prefixLength, i, idx;

        char c;

        /* name = prefix factorized-elements */

        /* copy prefix */
        suffix=buffer;
        prefixLength=0;
        while((c=*s++)!=0) {
            *suffix++=c;
            ++prefixLength;
        }

        /* append the suffix of the start character */
        /* this also fills indexes, elementBases and elements for the increments below */
        length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
                                              s, (uint32_t)start-range->start,
                                              indexes, elementBases, elements,
                                              suffix, (uint16_t)(sizeof(buffer)-prefixLength)));

        /* call the enumerator function with this first character */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* enumerate the rest of the names */
        while(++start<limit) {
            /* increment the indexes in lexical order bound by the factors */
            i=count;
            for (;;) {
                idx=(uint16_t)(indexes[--i]+1);
                if(idx<factors[i]) {
                    /* skip one index and its element string */
                    indexes[i]=idx;
                    s=elements[i];
                    while(*s++!=0) {
                    }
                    elements[i]=s;
                    break;
                } else {
                    /* reset this index to 0 and its element string to the first one */
                    indexes[i]=0;
                    elements[i]=elementBases[i];
                }
            }

            /* to make matters a little easier, just append all elements to the suffix */
            t=suffix;
            length=prefixLength;
            for(i=0; i<count; ++i) {
                s=elements[i];
                while((c=*s++)!=0) {
                    *t++=c;
                    ++length;
                }
            }
            /* zero-terminate */
            *t=0;

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return TRUE;
}
1080
1081 /*
1082 * findAlgName() is almost the same as enumAlgNames() except that it
1083 * returns the code point for a name if it fits into the range.
1084 * It returns 0xffff otherwise.
1085 */
1086 static UChar32
findAlgName(AlgorithmicRange * range,UCharNameChoice nameChoice,const char * otherName)1087 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1088 UChar32 code;
1089
1090 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1091 return 0xffff;
1092 }
1093
1094 switch(range->type) {
1095 case 0: {
1096 /* name = prefix hex-digits */
1097 const char *s=(const char *)(range+1);
1098 char c;
1099
1100 uint16_t i, count;
1101
1102 /* compare prefix */
1103 while((c=*s++)!=0) {
1104 if((char)c!=*otherName++) {
1105 return 0xffff;
1106 }
1107 }
1108
1109 /* read hexadecimal code point value */
1110 count=range->variant;
1111 code=0;
1112 for(i=0; i<count; ++i) {
1113 c=*otherName++;
1114 if('0'<=c && c<='9') {
1115 code=(code<<4)|(c-'0');
1116 } else if('A'<=c && c<='F') {
1117 code=(code<<4)|(c-'A'+10);
1118 } else {
1119 return 0xffff;
1120 }
1121 }
1122
1123 /* does it fit into the range? */
1124 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1125 return code;
1126 }
1127 break;
1128 }
1129 case 1: {
1130 char buffer[64];
1131 uint16_t indexes[8];
1132 const char *elementBases[8], *elements[8];
1133 const uint16_t *factors=(const uint16_t *)(range+1);
1134 uint16_t count=range->variant;
1135 const char *s=(const char *)(factors+count), *t;
1136 UChar32 start, limit;
1137 uint16_t i, idx;
1138
1139 char c;
1140
1141 /* name = prefix factorized-elements */
1142
1143 /* compare prefix */
1144 while((c=*s++)!=0) {
1145 if((char)c!=*otherName++) {
1146 return 0xffff;
1147 }
1148 }
1149
1150 start=(UChar32)range->start;
1151 limit=(UChar32)(range->end+1);
1152
1153 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1154 writeFactorSuffix(factors, count, s, 0,
1155 indexes, elementBases, elements, buffer, sizeof(buffer));
1156
1157 /* compare the first suffix */
1158 if(0==uprv_strcmp(otherName, buffer)) {
1159 return start;
1160 }
1161
1162 /* enumerate and compare the rest of the suffixes */
1163 while(++start<limit) {
1164 /* increment the indexes in lexical order bound by the factors */
1165 i=count;
1166 for (;;) {
1167 idx=(uint16_t)(indexes[--i]+1);
1168 if(idx<factors[i]) {
1169 /* skip one index and its element string */
1170 indexes[i]=idx;
1171 s=elements[i];
1172 while(*s++!=0) {}
1173 elements[i]=s;
1174 break;
1175 } else {
1176 /* reset this index to 0 and its element string to the first one */
1177 indexes[i]=0;
1178 elements[i]=elementBases[i];
1179 }
1180 }
1181
1182 /* to make matters a little easier, just compare all elements of the suffix */
1183 t=otherName;
1184 for(i=0; i<count; ++i) {
1185 s=elements[i];
1186 while((c=*s++)!=0) {
1187 if(c!=*t++) {
1188 s=""; /* does not match */
1189 i=99;
1190 }
1191 }
1192 }
1193 if(i<99 && *t==0) {
1194 return start;
1195 }
1196 }
1197 break;
1198 }
1199 default:
1200 /* undefined type */
1201 break;
1202 }
1203
1204 return 0xffff;
1205 }
1206
1207 /* sets of name characters, maximum name lengths ---------------------------- */
1208
/*
 * A set of byte values is stored as 8 words of 32 bits:
 * byte value c is bit (c&0x1f) of word (c>>5).
 * The argument c is converted to uint8_t first; it is parenthesized so that
 * an expression argument (e.g. x&0xff) is converted as a whole.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)

/*
 * Adds every character of the NUL-terminated string s to the set
 * and returns the string length (not counting the NUL).
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *p;

    for(p=s; *p!=0; ++p) {
        SET_ADD(set, *p);
    }
    return (int32_t)(p-s);
}
1223
1224 static int32_t
calcAlgNameSetsLengths(int32_t maxNameLength)1225 calcAlgNameSetsLengths(int32_t maxNameLength) {
1226 AlgorithmicRange *range;
1227 uint32_t *p;
1228 uint32_t rangeCount;
1229 int32_t length;
1230
1231 /* enumerate algorithmic ranges */
1232 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1233 rangeCount=*p;
1234 range=(AlgorithmicRange *)(p+1);
1235 while(rangeCount>0) {
1236 switch(range->type) {
1237 case 0:
1238 /* name = prefix + (range->variant times) hex-digits */
1239 /* prefix */
1240 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1241 if(length>maxNameLength) {
1242 maxNameLength=length;
1243 }
1244 break;
1245 case 1: {
1246 /* name = prefix factorized-elements */
1247 const uint16_t *factors=(const uint16_t *)(range+1);
1248 const char *s;
1249 int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1250
1251 /* prefix length */
1252 s=(const char *)(factors+count);
1253 length=calcStringSetLength(gNameSet, s);
1254 s+=length+1; /* start of factor suffixes */
1255
1256 /* get the set and maximum factor suffix length for each factor */
1257 for(i=0; i<count; ++i) {
1258 maxFactorLength=0;
1259 for(factor=factors[i]; factor>0; --factor) {
1260 factorLength=calcStringSetLength(gNameSet, s);
1261 s+=factorLength+1;
1262 if(factorLength>maxFactorLength) {
1263 maxFactorLength=factorLength;
1264 }
1265 }
1266 length+=maxFactorLength;
1267 }
1268
1269 if(length>maxNameLength) {
1270 maxNameLength=length;
1271 }
1272 break;
1273 }
1274 default:
1275 /* unknown type */
1276 break;
1277 }
1278
1279 range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1280 --rangeCount;
1281 }
1282 return maxNameLength;
1283 }
1284
1285 static int32_t
calcExtNameSetsLengths(int32_t maxNameLength)1286 calcExtNameSetsLengths(int32_t maxNameLength) {
1287 int32_t i, length;
1288
1289 for(i=0; i<LENGTHOF(charCatNames); ++i) {
1290 /*
1291 * for each category, count the length of the category name
1292 * plus 9=
1293 * 2 for <>
1294 * 1 for -
1295 * 6 for most hex digits per code point
1296 */
1297 length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1298 if(length>maxNameLength) {
1299 maxNameLength=length;
1300 }
1301 }
1302 return maxNameLength;
1303 }
1304
/*
 * Scans one compressed name field, starting at *pLine and ending at the
 * next ';' or at lineLimit, adds the characters of the expanded name to set,
 * and returns the expanded length.
 *
 * @param tokens       token table, indexed by byte (or lead<<8|trail) value
 * @param tokenCount   byte values at or above this stand for themselves
 * @param tokenStrings base of the token strings; a token value is an offset into it
 * @param tokenLengths optional cache of token word lengths (0=not yet known), may be NULL
 * @param set          character set to add to
 * @param pLine        in: start of the field; out: position after the field
 *                     (behind the ';' if one was found)
 * @param lineLimit    end of the line
 * @return length of the expanded name
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *p=*pLine;
    int32_t total=0, wordLength;
    uint16_t code, token;

    while(p!=lineLimit) {
        code=*p++;
        if(code==(uint8_t)';') {
            /* end of this field */
            break;
        }

        if(code>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, code);
            ++total;
            continue;
        }

        token=tokens[code];
        if(token==(uint16_t)(-2)) {
            /* lead byte of a double-byte token: combine it with the trail byte */
            code=(uint16_t)(code<<8|*p++);
            token=tokens[code];
        }

        if(token==(uint16_t)(-1)) {
            /* explicit letter */
            SET_ADD(set, code);
            ++total;
            continue;
        }

        /* token word: use the cached length if there is one, otherwise measure the word */
        if(tokenLengths==NULL) {
            wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
        } else {
            wordLength=tokenLengths[code];
            if(wordLength==0) {
                wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                tokenLengths[code]=(int8_t)wordLength;
            }
        }
        total+=wordLength;
    }

    *pLine=p;
    return total;
}
1349
1350 static void
calcGroupNameSetsLengths(int32_t maxNameLength)1351 calcGroupNameSetsLengths(int32_t maxNameLength) {
1352 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1353
1354 uint16_t *tokens=(uint16_t *)uCharNames+8;
1355 uint16_t tokenCount=*tokens++;
1356 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1357
1358 int8_t *tokenLengths;
1359
1360 const uint16_t *group;
1361 const uint8_t *s, *line, *lineLimit;
1362
1363 int32_t groupCount, lineNumber, length;
1364
1365 tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1366 if(tokenLengths!=NULL) {
1367 uprv_memset(tokenLengths, 0, tokenCount);
1368 }
1369
1370 group=GET_GROUPS(uCharNames);
1371 groupCount=*group++;
1372
1373 /* enumerate all groups */
1374 while(groupCount>0) {
1375 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1376 s=expandGroupLengths(s, offsets, lengths);
1377
1378 /* enumerate all lines in each group */
1379 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1380 line=s+offsets[lineNumber];
1381 length=lengths[lineNumber];
1382 if(length==0) {
1383 continue;
1384 }
1385
1386 lineLimit=line+length;
1387
1388 /* read regular name */
1389 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390 if(length>maxNameLength) {
1391 maxNameLength=length;
1392 }
1393 if(line==lineLimit) {
1394 continue;
1395 }
1396
1397 /* read Unicode 1.0 name */
1398 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1399 if(length>maxNameLength) {
1400 maxNameLength=length;
1401 }
1402 if(line==lineLimit) {
1403 continue;
1404 }
1405
1406 /* read ISO comment */
1407 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1408 }
1409
1410 group=NEXT_GROUP(group);
1411 --groupCount;
1412 }
1413
1414 if(tokenLengths!=NULL) {
1415 uprv_free(tokenLengths);
1416 }
1417
1418 /* set gMax... - name length last for threading */
1419 gMaxNameLength=maxNameLength;
1420 }
1421
1422 static UBool
calcNameSetsLengths(UErrorCode * pErrorCode)1423 calcNameSetsLengths(UErrorCode *pErrorCode) {
1424 static const char extChars[]="0123456789ABCDEF<>-";
1425 int32_t i, maxNameLength;
1426
1427 if(gMaxNameLength!=0) {
1428 return TRUE;
1429 }
1430
1431 if(!isDataLoaded(pErrorCode)) {
1432 return FALSE;
1433 }
1434
1435 /* set hex digits, used in various names, and <>-, used in extended names */
1436 for(i=0; i<sizeof(extChars)-1; ++i) {
1437 SET_ADD(gNameSet, extChars[i]);
1438 }
1439
1440 /* set sets and lengths from algorithmic names */
1441 maxNameLength=calcAlgNameSetsLengths(0);
1442
1443 /* set sets and lengths from extended names */
1444 maxNameLength=calcExtNameSetsLengths(maxNameLength);
1445
1446 /* set sets and lengths from group names, set global maximum values */
1447 calcGroupNameSetsLengths(maxNameLength);
1448
1449 return TRUE;
1450 }
1451
1452 /* public API --------------------------------------------------------------- */
1453
1454 U_CAPI int32_t U_EXPORT2
u_charName(UChar32 code,UCharNameChoice nameChoice,char * buffer,int32_t bufferLength,UErrorCode * pErrorCode)1455 u_charName(UChar32 code, UCharNameChoice nameChoice,
1456 char *buffer, int32_t bufferLength,
1457 UErrorCode *pErrorCode) {
1458 AlgorithmicRange *algRange;
1459 uint32_t *p;
1460 uint32_t i;
1461 int32_t length;
1462
1463 /* check the argument values */
1464 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1465 return 0;
1466 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1467 bufferLength<0 || (bufferLength>0 && buffer==NULL)
1468 ) {
1469 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1470 return 0;
1471 }
1472
1473 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1474 return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1475 }
1476
1477 length=0;
1478
1479 /* try algorithmic names first */
1480 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1481 i=*p;
1482 algRange=(AlgorithmicRange *)(p+1);
1483 while(i>0) {
1484 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1485 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1486 break;
1487 }
1488 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1489 --i;
1490 }
1491
1492 if(i==0) {
1493 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1494 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1495 if (!length) {
1496 /* extended character name */
1497 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1498 }
1499 } else {
1500 /* normal character name */
1501 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1502 }
1503 }
1504
1505 return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1506 }
1507
1508 U_CAPI int32_t U_EXPORT2
u_getISOComment(UChar32 c,char * dest,int32_t destCapacity,UErrorCode * pErrorCode)1509 u_getISOComment(UChar32 c,
1510 char *dest, int32_t destCapacity,
1511 UErrorCode *pErrorCode) {
1512 int32_t length;
1513
1514 /* check the argument values */
1515 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1516 return 0;
1517 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1518 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1519 return 0;
1520 }
1521
1522 if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1523 return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1524 }
1525
1526 /* the ISO comment is stored like a normal character name */
1527 length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);
1528 return u_terminateChars(dest, destCapacity, length, pErrorCode);
1529 }
1530
1531 U_CAPI UChar32 U_EXPORT2
u_charFromName(UCharNameChoice nameChoice,const char * name,UErrorCode * pErrorCode)1532 u_charFromName(UCharNameChoice nameChoice,
1533 const char *name,
1534 UErrorCode *pErrorCode) {
1535 char upper[120], lower[120];
1536 FindName findName;
1537 AlgorithmicRange *algRange;
1538 uint32_t *p;
1539 uint32_t i;
1540 UChar32 cp = 0;
1541 char c0;
1542 UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
1543
1544 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1545 return error;
1546 }
1547
1548 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1549 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1550 return error;
1551 }
1552
1553 if(!isDataLoaded(pErrorCode)) {
1554 return error;
1555 }
1556
1557 /* construct the uppercase and lowercase of the name first */
1558 for(i=0; i<sizeof(upper); ++i) {
1559 if((c0=*name++)!=0) {
1560 upper[i]=uprv_toupper(c0);
1561 lower[i]=uprv_tolower(c0);
1562 } else {
1563 upper[i]=lower[i]=0;
1564 break;
1565 }
1566 }
1567 if(i==sizeof(upper)) {
1568 /* name too long, there is no such character */
1569 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1570 return error;
1571 }
1572
1573 /* try extended names first */
1574 if (lower[0] == '<') {
1575 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1576 if (lower[--i] == '>') {
1577 for (--i; lower[i] && lower[i] != '-'; --i) {
1578 }
1579
1580 if (lower[i] == '-') { /* We've got a category. */
1581 uint32_t cIdx;
1582
1583 lower[i] = 0;
1584
1585 for (++i; lower[i] != '>'; ++i) {
1586 if (lower[i] >= '0' && lower[i] <= '9') {
1587 cp = (cp << 4) + lower[i] - '0';
1588 } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1589 cp = (cp << 4) + lower[i] - 'a' + 10;
1590 } else {
1591 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1592 return error;
1593 }
1594 }
1595
1596 /* Now validate the category name.
1597 We could use a binary search, or a trie, if
1598 we really wanted to. */
1599
1600 for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1601
1602 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1603 if (getCharCat(cp) == cIdx) {
1604 return cp;
1605 }
1606 break;
1607 }
1608 }
1609 }
1610 }
1611 }
1612
1613 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1614 return error;
1615 }
1616
1617 /* try algorithmic names now */
1618 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1619 i=*p;
1620 algRange=(AlgorithmicRange *)(p+1);
1621 while(i>0) {
1622 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1623 return cp;
1624 }
1625 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1626 --i;
1627 }
1628
1629 /* normal character name */
1630 findName.otherName=upper;
1631 findName.code=error;
1632 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1633 if (findName.code == error) {
1634 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1635 }
1636 return findName.code;
1637 }
1638
1639 U_CAPI void U_EXPORT2
u_enumCharNames(UChar32 start,UChar32 limit,UEnumCharNamesFn * fn,void * context,UCharNameChoice nameChoice,UErrorCode * pErrorCode)1640 u_enumCharNames(UChar32 start, UChar32 limit,
1641 UEnumCharNamesFn *fn,
1642 void *context,
1643 UCharNameChoice nameChoice,
1644 UErrorCode *pErrorCode) {
1645 AlgorithmicRange *algRange;
1646 uint32_t *p;
1647 uint32_t i;
1648
1649 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1650 return;
1651 }
1652
1653 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1654 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1655 return;
1656 }
1657
1658 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1659 limit = UCHAR_MAX_VALUE + 1;
1660 }
1661 if((uint32_t)start>=(uint32_t)limit) {
1662 return;
1663 }
1664
1665 if(!isDataLoaded(pErrorCode)) {
1666 return;
1667 }
1668
1669 /* interleave the data-driven ones with the algorithmic ones */
1670 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1671 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1672 i=*p;
1673 algRange=(AlgorithmicRange *)(p+1);
1674 while(i>0) {
1675 /* enumerate the character names before the current algorithmic range */
1676 /* here: start<limit */
1677 if((uint32_t)start<algRange->start) {
1678 if((uint32_t)limit<=algRange->start) {
1679 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1680 return;
1681 }
1682 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1683 return;
1684 }
1685 start=(UChar32)algRange->start;
1686 }
1687 /* enumerate the character names in the current algorithmic range */
1688 /* here: algRange->start<=start<limit */
1689 if((uint32_t)start<=algRange->end) {
1690 if((uint32_t)limit<=(algRange->end+1)) {
1691 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1692 return;
1693 }
1694 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1695 return;
1696 }
1697 start=(UChar32)algRange->end+1;
1698 }
1699 /* continue to the next algorithmic range (here: start<limit) */
1700 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1701 --i;
1702 }
1703 /* enumerate the character names after the last algorithmic range */
1704 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1705 }
1706
1707 U_CAPI int32_t U_EXPORT2
uprv_getMaxCharNameLength()1708 uprv_getMaxCharNameLength() {
1709 UErrorCode errorCode=U_ZERO_ERROR;
1710 if(calcNameSetsLengths(&errorCode)) {
1711 return gMaxNameLength;
1712 } else {
1713 return 0;
1714 }
1715 }
1716
1717 /**
1718 * Converts the char set cset into a Unicode set uset.
1719 * @param cset Set of 256 bit flags corresponding to a set of chars.
1720 * @param uset USet to receive characters. Existing contents are deleted.
1721 */
1722 static void
charSetToUSet(uint32_t cset[8],const USetAdder * sa)1723 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1724 UChar us[256];
1725 char cs[256];
1726
1727 int32_t i, length;
1728 UErrorCode errorCode;
1729
1730 errorCode=U_ZERO_ERROR;
1731
1732 if(!calcNameSetsLengths(&errorCode)) {
1733 return;
1734 }
1735
1736 /* build a char string with all chars that are used in character names */
1737 length=0;
1738 for(i=0; i<256; ++i) {
1739 if(SET_CONTAINS(cset, i)) {
1740 cs[length++]=(char)i;
1741 }
1742 }
1743
1744 /* convert the char string to a UChar string */
1745 u_charsToUChars(cs, us, length);
1746
1747 /* add each UChar to the USet */
1748 for(i=0; i<length; ++i) {
1749 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1750 sa->add(sa->set, us[i]);
1751 }
1752 }
1753 }
1754
/**
 * Adds the characters that are used in Unicode character names
 * (the contents of gNameSet) to a set.
 * @param sa USetAdder that receives the characters; passed on to charSetToUSet().
 */
U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(const USetAdder *sa) {
    charSetToUSet(gNameSet, sa);
}
1763
1764 /* data swapping ------------------------------------------------------------ */
1765
1766 /*
1767 * The token table contains non-negative entries for token bytes,
1768 * and -1 for bytes that represent themselves in the data file's charset.
1769 * -2 entries are used for lead bytes.
1770 *
1771 * Direct bytes (-1 entries) must be translated from the input charset family
1772 * to the output charset family.
1773 * makeTokenMap() writes a permutation mapping for this.
1774 * Use it once for single-/lead-byte tokens and once more for all trail byte
1775 * tokens. (';' is an unused trail byte marked with -1.)
1776 */
1777 static void
makeTokenMap(const UDataSwapper * ds,int16_t tokens[],uint16_t tokenCount,uint8_t map[256],UErrorCode * pErrorCode)1778 makeTokenMap(const UDataSwapper *ds,
1779 int16_t tokens[], uint16_t tokenCount,
1780 uint8_t map[256],
1781 UErrorCode *pErrorCode) {
1782 UBool usedOutChar[256];
1783 uint16_t i, j;
1784 uint8_t c1, c2;
1785
1786 if(U_FAILURE(*pErrorCode)) {
1787 return;
1788 }
1789
1790 if(ds->inCharset==ds->outCharset) {
1791 /* Same charset family: identity permutation */
1792 for(i=0; i<256; ++i) {
1793 map[i]=(uint8_t)i;
1794 }
1795 } else {
1796 uprv_memset(map, 0, 256);
1797 uprv_memset(usedOutChar, 0, 256);
1798
1799 if(tokenCount>256) {
1800 tokenCount=256;
1801 }
1802
1803 /* set the direct bytes (byte 0 always maps to itself) */
1804 for(i=1; i<tokenCount; ++i) {
1805 if(tokens[i]==-1) {
1806 /* convert the direct byte character */
1807 c1=(uint8_t)i;
1808 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1809 if(U_FAILURE(*pErrorCode)) {
1810 udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1811 i, ds->inCharset);
1812 return;
1813 }
1814
1815 /* enter the converted character into the map and mark it used */
1816 map[c1]=c2;
1817 usedOutChar[c2]=TRUE;
1818 }
1819 }
1820
1821 /* set the mappings for the rest of the permutation */
1822 for(i=j=1; i<tokenCount; ++i) {
1823 /* set mappings that were not set for direct bytes */
1824 if(map[i]==0) {
1825 /* set an output byte value that was not used as an output byte above */
1826 while(usedOutChar[j]) {
1827 ++j;
1828 }
1829 map[i]=(uint8_t)j++;
1830 }
1831 }
1832
1833 /*
1834 * leave mappings at tokenCount and above unset if tokenCount<256
1835 * because they won't be used
1836 */
1837 }
1838 }
1839
1840 U_CAPI int32_t U_EXPORT2
uchar_swapNames(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1841 uchar_swapNames(const UDataSwapper *ds,
1842 const void *inData, int32_t length, void *outData,
1843 UErrorCode *pErrorCode) {
1844 const UDataInfo *pInfo;
1845 int32_t headerSize;
1846
1847 const uint8_t *inBytes;
1848 uint8_t *outBytes;
1849
1850 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1851 offset, i, count, stringsCount;
1852
1853 const AlgorithmicRange *inRange;
1854 AlgorithmicRange *outRange;
1855
1856 /* udata_swapDataHeader checks the arguments */
1857 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1858 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1859 return 0;
1860 }
1861
1862 /* check data format and format version */
1863 pInfo=(const UDataInfo *)((const char *)inData+4);
1864 if(!(
1865 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
1866 pInfo->dataFormat[1]==0x6e &&
1867 pInfo->dataFormat[2]==0x61 &&
1868 pInfo->dataFormat[3]==0x6d &&
1869 pInfo->formatVersion[0]==1
1870 )) {
1871 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1872 pInfo->dataFormat[0], pInfo->dataFormat[1],
1873 pInfo->dataFormat[2], pInfo->dataFormat[3],
1874 pInfo->formatVersion[0]);
1875 *pErrorCode=U_UNSUPPORTED_ERROR;
1876 return 0;
1877 }
1878
1879 inBytes=(const uint8_t *)inData+headerSize;
1880 outBytes=(uint8_t *)outData+headerSize;
1881 if(length<0) {
1882 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1883 } else {
1884 length-=headerSize;
1885 if( length<20 ||
1886 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1887 ) {
1888 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1889 length);
1890 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1891 return 0;
1892 }
1893 }
1894
1895 if(length<0) {
1896 /* preflighting: iterate through algorithmic ranges */
1897 offset=algNamesOffset;
1898 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1899 offset+=4;
1900
1901 for(i=0; i<count; ++i) {
1902 inRange=(const AlgorithmicRange *)(inBytes+offset);
1903 offset+=ds->readUInt16(inRange->size);
1904 }
1905 } else {
1906 /* swap data */
1907 const uint16_t *p;
1908 uint16_t *q, *temp;
1909
1910 int16_t tokens[512];
1911 uint16_t tokenCount;
1912
1913 uint8_t map[256], trailMap[256];
1914
1915 /* copy the data for inaccessible bytes */
1916 if(inBytes!=outBytes) {
1917 uprv_memcpy(outBytes, inBytes, length);
1918 }
1919
1920 /* the initial 4 offsets first */
1921 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1922 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1923 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1924 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1925
1926 /*
1927 * now the tokens table
1928 * it needs to be permutated along with the compressed name strings
1929 */
1930 p=(const uint16_t *)(inBytes+16);
1931 q=(uint16_t *)(outBytes+16);
1932
1933 /* read and swap the tokenCount */
1934 tokenCount=ds->readUInt16(*p);
1935 ds->swapArray16(ds, p, 2, q, pErrorCode);
1936 ++p;
1937 ++q;
1938
1939 /* read the first 512 tokens and make the token maps */
1940 if(tokenCount<=512) {
1941 count=tokenCount;
1942 } else {
1943 count=512;
1944 }
1945 for(i=0; i<count; ++i) {
1946 tokens[i]=udata_readInt16(ds, p[i]);
1947 }
1948 for(; i<512; ++i) {
1949 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1950 }
1951 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1952 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1953 if(U_FAILURE(*pErrorCode)) {
1954 return 0;
1955 }
1956
1957 /*
1958 * swap and permutate the tokens
1959 * go through a temporary array to support in-place swapping
1960 */
1961 temp=(uint16_t *)uprv_malloc(tokenCount*2);
1962 if(temp==NULL) {
1963 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1964 tokenCount);
1965 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1966 return 0;
1967 }
1968
1969 /* swap and permutate single-/lead-byte tokens */
1970 for(i=0; i<tokenCount && i<256; ++i) {
1971 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1972 }
1973
1974 /* swap and permutate trail-byte tokens */
1975 for(; i<tokenCount; ++i) {
1976 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1977 }
1978
1979 /* copy the result into the output and free the temporary array */
1980 uprv_memcpy(q, temp, tokenCount*2);
1981 uprv_free(temp);
1982
1983 /*
1984 * swap the token strings but not a possible padding byte after
1985 * the terminating NUL of the last string
1986 */
1987 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1988 outBytes+tokenStringOffset, pErrorCode);
1989 if(U_FAILURE(*pErrorCode)) {
1990 udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1991 return 0;
1992 }
1993
1994 /* swap the group table */
1995 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1996 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1997 outBytes+groupsOffset, pErrorCode);
1998
1999 /*
2000 * swap the group strings
2001 * swap the string bytes but not the nibble-encoded string lengths
2002 */
2003 if(ds->inCharset!=ds->outCharset) {
2004 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
2005
2006 const uint8_t *inStrings, *nextInStrings;
2007 uint8_t *outStrings;
2008
2009 uint8_t c;
2010
2011 inStrings=inBytes+groupStringOffset;
2012 outStrings=outBytes+groupStringOffset;
2013
2014 stringsCount=algNamesOffset-groupStringOffset;
2015
2016 /* iterate through string groups until only a few padding bytes are left */
2017 while(stringsCount>32) {
2018 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2019
2020 /* move past the length bytes */
2021 stringsCount-=(uint32_t)(nextInStrings-inStrings);
2022 outStrings+=nextInStrings-inStrings;
2023 inStrings=nextInStrings;
2024
2025 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2026 stringsCount-=count;
2027
2028 /* swap the string bytes using map[] and trailMap[] */
2029 while(count>0) {
2030 c=*inStrings++;
2031 *outStrings++=map[c];
2032 if(tokens[c]!=-2) {
2033 --count;
2034 } else {
2035 /* token lead byte: swap the trail byte, too */
2036 *outStrings++=trailMap[*inStrings++];
2037 count-=2;
2038 }
2039 }
2040 }
2041 }
2042
2043 /* swap the algorithmic ranges */
2044 offset=algNamesOffset;
2045 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2046 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2047 offset+=4;
2048
2049 for(i=0; i<count; ++i) {
2050 if(offset>(uint32_t)length) {
2051 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2052 length, i);
2053 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2054 return 0;
2055 }
2056
2057 inRange=(const AlgorithmicRange *)(inBytes+offset);
2058 outRange=(AlgorithmicRange *)(outBytes+offset);
2059 offset+=ds->readUInt16(inRange->size);
2060
2061 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2062 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2063 switch(inRange->type) {
2064 case 0:
2065 /* swap prefix string */
2066 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2067 outRange+1, pErrorCode);
2068 if(U_FAILURE(*pErrorCode)) {
2069 udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2070 i);
2071 return 0;
2072 }
2073 break;
2074 case 1:
2075 {
2076 /* swap factors and the prefix and factor strings */
2077 uint32_t factorsCount;
2078
2079 factorsCount=inRange->variant;
2080 p=(const uint16_t *)(inRange+1);
2081 q=(uint16_t *)(outRange+1);
2082 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2083
2084 /* swap the strings, up to the last terminating NUL */
2085 p+=factorsCount;
2086 q+=factorsCount;
2087 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2088 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2089 --stringsCount;
2090 }
2091 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2092 }
2093 break;
2094 default:
2095 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2096 inRange->type, i);
2097 *pErrorCode=U_UNSUPPORTED_ERROR;
2098 return 0;
2099 }
2100 }
2101 }
2102
2103 return headerSize+(int32_t)offset;
2104 }
2105
2106 /*
2107 * Hey, Emacs, please set the following:
2108 *
2109 * Local Variables:
2110 * indent-tabs-mode: nil
2111 * End:
2112 *
2113 */
2114