• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1998-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File parse.cpp
12 *
13 * Modification History:
14 *
15 *   Date          Name          Description
16 *   05/26/99     stephen       Creation.
17 *   02/25/00     weiv          Overhaul to write udata
18 *   5/10/01      Ram           removed ustdio dependency
19 *   06/10/2001  Dominic Ludlam <dom@recoil.org> Rewritten
20 *******************************************************************************
21 */
22 
23 // Safer use of UnicodeString.
24 #include <cstdint>
25 #include "unicode/umachine.h"
26 #ifndef UNISTR_FROM_CHAR_EXPLICIT
27 #   define UNISTR_FROM_CHAR_EXPLICIT explicit
28 #endif
29 
30 // Less important, but still a good idea.
31 #ifndef UNISTR_FROM_STRING_EXPLICIT
32 #   define UNISTR_FROM_STRING_EXPLICIT explicit
33 #endif
34 
35 #include <assert.h>
36 #include "parse.h"
37 #include "errmsg.h"
38 #include "uhash.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uinvchar.h"
42 #include "read.h"
43 #include "ustr.h"
44 #include "reslist.h"
45 #include "rbt_pars.h"
46 #include "genrb.h"
47 #include "unicode/normalizer2.h"
48 #include "unicode/stringpiece.h"
49 #include "unicode/unistr.h"
50 #include "unicode/ustring.h"
51 #include "unicode/uscript.h"
52 #include "unicode/utf16.h"
53 #include "unicode/putil.h"
54 #include "charstr.h"
55 #include "collationbuilder.h"
56 #include "collationdata.h"
57 #include "collationdatareader.h"
58 #include "collationdatawriter.h"
59 #include "collationfastlatinbuilder.h"
60 #include "collationinfo.h"
61 #include "collationroot.h"
62 #include "collationruleparser.h"
63 #include "collationtailoring.h"
64 #include <stdio.h>
65 #include "writesrc.h"
66 
/* Number of tokens to read ahead of the current stream position */
#define MAX_LOOKAHEAD   3

/* Code points that get special treatment while reading rule files
 * (see the character loop in parseUCARules). */
#define CR               0x000D
#define LF               0x000A
#define SPACE            0x0020
#define TAB              0x0009
#define ESCAPE           0x005C
#define HASH             0x0023
#define QUOTE            0x0027
#define ZERO             0x0030
/* STARTCOMMAND/ENDCOMMAND have the same values as OPENSQBRACKET/CLOSESQBRACKET. */
#define STARTCOMMAND     0x005B
#define ENDCOMMAND       0x005D
#define OPENSQBRACKET    0x005B
#define CLOSESQBRACKET   0x005D

/* Code point range scanned by writeCollationDiacriticsTOML:
 * BASE is inclusive, LIMIT is exclusive. */
#define ICU4X_DIACRITIC_BASE  0x0300
#define ICU4X_DIACRITIC_LIMIT 0x034F
85 
86 using icu::CharString;
87 using icu::LocalMemory;
88 using icu::LocalPointer;
89 using icu::LocalUCHARBUFPointer;
90 using icu::StringPiece;
91 using icu::UnicodeString;
92 
/* One slot of the token lookahead buffer. All four members are filled by a
 * single getNextToken() call (see initLookahead and getToken). */
struct Lookahead
{
     enum   ETokenType type;      /* return value of getNextToken() */
     struct UString    value;     /* token value written by getNextToken() */
     struct UString    comment;   /* comment written by getNextToken() */
     uint32_t          line;      /* line number written by getNextToken() */
};
100 
/* keep in sync with token defines in read.h */
/* Printable token names. expect() indexes this table with ETokenType values
 * to build its "expecting %s, got %s" diagnostic. */
const char *tokenNames[TOK_TOKEN_COUNT] =
{
     "string",             /* A string token, such as "MonthNames" */
     "'{'",                 /* An opening brace character */
     "'}'",                 /* A closing brace character */
     "','",                 /* A comma */
     "':'",                 /* A colon */

     "<end of file>",     /* End of the file has been reached successfully */
     "<end of line>"
};
113 
114 /* Just to store "TRUE" */
115 //static const UChar trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000};
116 
/* State threaded through all parse* functions for one input file. */
typedef struct {
    struct Lookahead  lookahead[MAX_LOOKAHEAD + 1];  /* circular token buffer */
    uint32_t          lookaheadPosition;             /* index of the current slot in lookahead[] */
    UCHARBUF         *buffer;                        /* input handed to getNextToken() */
    struct SRBRoot *bundle;                          /* bundle that created resources are opened against */
    const char     *inputdir;                        /* prefix for included files; may be NULL */
    uint32_t        inputdirLength;                  /* presumably strlen(inputdir) - set by code not shown here */
    const char     *outputdir;                       /* prefix used by parseDependency; may be NULL */
    uint32_t        outputdirLength;                 /* presumably strlen(outputdir) - set by code not shown here */
    const char     *filename;
    UBool           makeBinaryCollation;
    UBool           omitCollationRules;              /* when set, parseUCARules returns res_none() */
    UBool           icu4xMode;
} ParseState;
131 
/* Common signature of the type-specific resource parsers in this file
 * (parseString, parseAlias, parseUCARules, ...). */
typedef struct SResource *
ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status);

/* Forward declaration; the definition appears later in the file. */
static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status);
136 
/* The nature of the lookahead buffer:
   There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer.  This provides
   MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value.
   When getToken is called, the current pointer is moved to the next slot and the
   old slot is filled with the next token from the reader by calling getNextToken.
   The token values are stored in the slot, which means that token values don't
   survive a call to getToken, i.e.

   struct UString *value;

   getToken(state, &value, NULL, NULL, status);
   getToken(state, NULL,   NULL, NULL, status);   bad - value is now a different string
*/
150 static void
initLookahead(ParseState * state,UCHARBUF * buf,UErrorCode * status)151 initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status)
152 {
153     static uint32_t initTypeStrings = 0;
154     uint32_t i;
155 
156     if (!initTypeStrings)
157     {
158         initTypeStrings = 1;
159     }
160 
161     state->lookaheadPosition   = 0;
162     state->buffer              = buf;
163 
164     resetLineNumber();
165 
166     for (i = 0; i < MAX_LOOKAHEAD; i++)
167     {
168         state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
169         if (U_FAILURE(*status))
170         {
171             return;
172         }
173     }
174 
175     *status = U_ZERO_ERROR;
176 }
177 
178 static void
cleanupLookahead(ParseState * state)179 cleanupLookahead(ParseState* state)
180 {
181     uint32_t i;
182     for (i = 0; i <= MAX_LOOKAHEAD; i++)
183     {
184         ustr_deinit(&state->lookahead[i].value);
185         ustr_deinit(&state->lookahead[i].comment);
186     }
187 
188 }
189 
190 static enum ETokenType
getToken(ParseState * state,struct UString ** tokenValue,struct UString * comment,uint32_t * linenumber,UErrorCode * status)191 getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status)
192 {
193     enum ETokenType result;
194     uint32_t          i;
195 
196     result = state->lookahead[state->lookaheadPosition].type;
197 
198     if (tokenValue != NULL)
199     {
200         *tokenValue = &state->lookahead[state->lookaheadPosition].value;
201     }
202 
203     if (linenumber != NULL)
204     {
205         *linenumber = state->lookahead[state->lookaheadPosition].line;
206     }
207 
208     if (comment != NULL)
209     {
210         ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
211     }
212 
213     i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1);
214     state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1);
215     ustr_setlen(&state->lookahead[i].comment, 0, status);
216     ustr_setlen(&state->lookahead[i].value, 0, status);
217     state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
218 
219     /* printf("getToken, returning %s\n", tokenNames[result]); */
220 
221     return result;
222 }
223 
224 static enum ETokenType
peekToken(ParseState * state,uint32_t lookaheadCount,struct UString ** tokenValue,uint32_t * linenumber,struct UString * comment,UErrorCode * status)225 peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status)
226 {
227     uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1);
228 
229     if (U_FAILURE(*status))
230     {
231         return TOK_ERROR;
232     }
233 
234     if (lookaheadCount >= MAX_LOOKAHEAD)
235     {
236         *status = U_INTERNAL_PROGRAM_ERROR;
237         return TOK_ERROR;
238     }
239 
240     if (tokenValue != NULL)
241     {
242         *tokenValue = &state->lookahead[i].value;
243     }
244 
245     if (linenumber != NULL)
246     {
247         *linenumber = state->lookahead[i].line;
248     }
249 
250     if(comment != NULL){
251         ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
252     }
253 
254     return state->lookahead[i].type;
255 }
256 
257 static void
expect(ParseState * state,enum ETokenType expectedToken,struct UString ** tokenValue,struct UString * comment,uint32_t * linenumber,UErrorCode * status)258 expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status)
259 {
260     uint32_t        line;
261 
262     enum ETokenType token = getToken(state, tokenValue, comment, &line, status);
263 
264     if (linenumber != NULL)
265     {
266         *linenumber = line;
267     }
268 
269     if (U_FAILURE(*status))
270     {
271         return;
272     }
273 
274     if (token != expectedToken)
275     {
276         *status = U_INVALID_FORMAT_ERROR;
277         error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]);
278     }
279     else
280     {
281         *status = U_ZERO_ERROR;
282     }
283 }
284 
getInvariantString(ParseState * state,uint32_t * line,struct UString * comment,int32_t & stringLength,UErrorCode * status)285 static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment,
286                                 int32_t &stringLength, UErrorCode *status)
287 {
288     struct UString *tokenValue;
289     char           *result;
290 
291     expect(state, TOK_STRING, &tokenValue, comment, line, status);
292 
293     if (U_FAILURE(*status))
294     {
295         return NULL;
296     }
297 
298     if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) {
299         *status = U_INVALID_FORMAT_ERROR;
300         error(*line, "invariant characters required for table keys, binary data, etc.");
301         return NULL;
302     }
303 
304     result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1));
305 
306     if (result == NULL)
307     {
308         *status = U_MEMORY_ALLOCATION_ERROR;
309         return NULL;
310     }
311 
312     u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1);
313     stringLength = tokenValue->fLength;
314     return result;
315 }
316 
317 static struct SResource *
parseUCARules(ParseState * state,char * tag,uint32_t startline,const struct UString *,UErrorCode * status)318 parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
319 {
320     struct SResource *result = NULL;
321     struct UString   *tokenValue;
322     FileStream       *file          = NULL;
323     char              filename[256] = { '\0' };
324     char              cs[128]       = { '\0' };
325     uint32_t          line;
326     UBool quoted = false;
327     UCHARBUF *ucbuf=NULL;
328     UChar32   c     = 0;
329     const char* cp  = NULL;
330     UChar *pTarget     = NULL;
331     UChar *target      = NULL;
332     UChar *targetLimit = NULL;
333     int32_t size = 0;
334 
335     expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
336 
337     if(isVerbose()){
338         printf(" %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
339     }
340 
341     if (U_FAILURE(*status))
342     {
343         return NULL;
344     }
345     /* make the filename including the directory */
346     if (state->inputdir != NULL)
347     {
348         uprv_strcat(filename, state->inputdir);
349 
350         if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
351         {
352             uprv_strcat(filename, U_FILE_SEP_STRING);
353         }
354     }
355 
356     u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
357 
358     expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
359 
360     if (U_FAILURE(*status))
361     {
362         return NULL;
363     }
364     uprv_strcat(filename, cs);
365 
366     if(state->omitCollationRules) {
367         return res_none();
368     }
369 
370     ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status);
371 
372     if (U_FAILURE(*status)) {
373         error(line, "An error occurred while opening the input file %s\n", filename);
374         return NULL;
375     }
376 
377     /* We allocate more space than actually required
378     * since the actual size needed for storing UChars
379     * is not known in UTF-8 byte stream
380     */
381     size        = ucbuf_size(ucbuf) + 1;
382     pTarget     = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * size);
383     uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
384     target      = pTarget;
385     targetLimit = pTarget+size;
386 
387     /* read the rules into the buffer */
388     while (target < targetLimit)
389     {
390         c = ucbuf_getc(ucbuf, status);
391         if(c == QUOTE) {
392             quoted = (UBool)!quoted;
393         }
394         /* weiv (06/26/2002): adding the following:
395          * - preserving spaces in commands [...]
396          * - # comments until the end of line
397          */
398         if (c == STARTCOMMAND && !quoted)
399         {
400             /* preserve commands
401              * closing bracket will be handled by the
402              * append at the end of the loop
403              */
404             while(c != ENDCOMMAND) {
405                 U_APPEND_CHAR32_ONLY(c, target);
406                 c = ucbuf_getc(ucbuf, status);
407             }
408         }
409         else if (c == HASH && !quoted) {
410             /* skip comments */
411             while(c != CR && c != LF) {
412                 c = ucbuf_getc(ucbuf, status);
413             }
414             continue;
415         }
416         else if (c == ESCAPE)
417         {
418             c = unescape(ucbuf, status);
419 
420             if (c == (UChar32)U_ERR)
421             {
422                 uprv_free(pTarget);
423                 T_FileStream_close(file);
424                 return NULL;
425             }
426         }
427         else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF))
428         {
429             /* ignore spaces carriage returns
430             * and line feed unless in the form \uXXXX
431             */
432             continue;
433         }
434 
435         /* Append UChar * after dissembling if c > 0xffff*/
436         if (c != (UChar32)U_EOF)
437         {
438             U_APPEND_CHAR32_ONLY(c, target);
439         }
440         else
441         {
442             break;
443         }
444     }
445 
446     /* terminate the string */
447     if(target < targetLimit){
448         *target = 0x0000;
449     }
450 
451     result = string_open(state->bundle, tag, pTarget, (int32_t)(target - pTarget), NULL, status);
452 
453 
454     ucbuf_close(ucbuf);
455     uprv_free(pTarget);
456     T_FileStream_close(file);
457 
458     return result;
459 }
460 
461 static struct SResource *
parseTransliterator(ParseState * state,char * tag,uint32_t startline,const struct UString *,UErrorCode * status)462 parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
463 {
464     struct SResource *result = NULL;
465     struct UString   *tokenValue;
466     FileStream       *file          = NULL;
467     char              filename[256] = { '\0' };
468     char              cs[128]       = { '\0' };
469     uint32_t          line;
470     UCHARBUF *ucbuf=NULL;
471     const char* cp  = NULL;
472     UChar *pTarget     = NULL;
473     const UChar *pSource     = NULL;
474     int32_t size = 0;
475 
476     expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
477 
478     if(isVerbose()){
479         printf(" %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
480     }
481 
482     if (U_FAILURE(*status))
483     {
484         return NULL;
485     }
486     /* make the filename including the directory */
487     if (state->inputdir != NULL)
488     {
489         uprv_strcat(filename, state->inputdir);
490 
491         if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
492         {
493             uprv_strcat(filename, U_FILE_SEP_STRING);
494         }
495     }
496 
497     u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
498 
499     expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
500 
501     if (U_FAILURE(*status))
502     {
503         return NULL;
504     }
505     uprv_strcat(filename, cs);
506 
507 
508     ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status);
509 
510     if (U_FAILURE(*status)) {
511         error(line, "An error occurred while opening the input file %s\n", filename);
512         return NULL;
513     }
514 
515     /* We allocate more space than actually required
516     * since the actual size needed for storing UChars
517     * is not known in UTF-8 byte stream
518     */
519     pSource = ucbuf_getBuffer(ucbuf, &size, status);
520     pTarget     = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (size + 1));
521     uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
522 
523 #if !UCONFIG_NO_TRANSLITERATION
524     size = utrans_stripRules(pSource, size, pTarget, status);
525 #else
526     size = 0;
527     fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n");
528 #endif
529     result = string_open(state->bundle, tag, pTarget, size, NULL, status);
530 
531     ucbuf_close(ucbuf);
532     uprv_free(pTarget);
533     T_FileStream_close(file);
534 
535     return result;
536 }
537 static ArrayResource* dependencyArray = NULL;
538 
539 static struct SResource *
parseDependency(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)540 parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
541 {
542     struct SResource *result = NULL;
543     struct SResource *elem = NULL;
544     struct UString   *tokenValue;
545     uint32_t          line;
546     char              filename[256] = { '\0' };
547     char              cs[128]       = { '\0' };
548 
549     expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
550 
551     if(isVerbose()){
552         printf(" %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
553     }
554 
555     if (U_FAILURE(*status))
556     {
557         return NULL;
558     }
559     /* make the filename including the directory */
560     if (state->outputdir != NULL)
561     {
562         uprv_strcat(filename, state->outputdir);
563 
564         if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR)
565         {
566             uprv_strcat(filename, U_FILE_SEP_STRING);
567         }
568     }
569 
570     u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
571 
572     if (U_FAILURE(*status))
573     {
574         return NULL;
575     }
576     uprv_strcat(filename, cs);
577     if(!T_FileStream_file_exists(filename)){
578         if(isStrict()){
579             error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
580         }else{
581             warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
582         }
583     }
584     if(dependencyArray==NULL){
585         dependencyArray = array_open(state->bundle, "%%DEPENDENCY", NULL, status);
586     }
587     if(tag!=NULL){
588         result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
589     }
590     elem = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, comment, status);
591 
592     dependencyArray->add(elem);
593 
594     if (U_FAILURE(*status))
595     {
596         return NULL;
597     }
598     expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
599     return result;
600 }
601 static struct SResource *
parseString(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)602 parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
603 {
604     struct UString   *tokenValue;
605     struct SResource *result = NULL;
606 
607 /*    if (tag != NULL && uprv_strcmp(tag, "%%UCARULES") == 0)
608     {
609         return parseUCARules(tag, startline, status);
610     }*/
611     if(isVerbose()){
612         printf(" string %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
613     }
614     expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
615 
616     if (U_SUCCESS(*status))
617     {
618         /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
619         doesn't survive expect either) */
620 
621         result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
622         if(U_SUCCESS(*status) && result) {
623             expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
624 
625             if (U_FAILURE(*status))
626             {
627                 res_close(result);
628                 return NULL;
629             }
630         }
631     }
632 
633     return result;
634 }
635 
636 static struct SResource *
parseAlias(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)637 parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
638 {
639     struct UString   *tokenValue;
640     struct SResource *result  = NULL;
641 
642     expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
643 
644     if(isVerbose()){
645         printf(" alias %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
646     }
647 
648     if (U_SUCCESS(*status))
649     {
650         /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
651         doesn't survive expect either) */
652 
653         result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
654 
655         expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
656 
657         if (U_FAILURE(*status))
658         {
659             res_close(result);
660             return NULL;
661         }
662     }
663 
664     return result;
665 }
666 
667 #if !UCONFIG_NO_COLLATION
668 
669 namespace {
670 
resLookup(struct SResource * res,const char * key)671 static struct SResource* resLookup(struct SResource* res, const char* key){
672     if (res == res_none() || !res->isTable()) {
673         return NULL;
674     }
675 
676     TableResource *list = static_cast<TableResource *>(res);
677     SResource *current = list->fFirst;
678     while (current != NULL) {
679         if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) {
680             return current;
681         }
682         current = current->fNext;
683     }
684     return NULL;
685 }
686 
// CollationRuleParser::Importer implementation for genrb: getRules() reads
// the rules by parsing another locale's source file.
class GenrbImporter : public icu::CollationRuleParser::Importer {
public:
    // Stores the two directory pointers as given; the strings are not copied.
    GenrbImporter(const char *in, const char *out) : inputDir(in), outputDir(out) {}
    virtual ~GenrbImporter();
    // Looks up collations/<collationType>/Sequence in <localeID>.txt and, if
    // it is a string resource, assigns it to `rules`.
    virtual void getRules(
            const char *localeID, const char *collationType,
            UnicodeString &rules,
            const char *&errorReason, UErrorCode &errorCode) override;

private:
    const char *inputDir;   // directory to read source files from; may be NULL
    const char *outputDir;  // passed through to parse()
};

GenrbImporter::~GenrbImporter() {}
702 
703 void
getRules(const char * localeID,const char * collationType,UnicodeString & rules,const char * &,UErrorCode & errorCode)704 GenrbImporter::getRules(
705         const char *localeID, const char *collationType,
706         UnicodeString &rules,
707         const char *& /*errorReason*/, UErrorCode &errorCode) {
708     CharString filename(localeID, errorCode);
709     for(int32_t i = 0; i < filename.length(); i++){
710         if(filename[i] == '-'){
711             filename.data()[i] = '_';
712         }
713     }
714     filename.append(".txt", errorCode);
715     if (U_FAILURE(errorCode)) {
716         return;
717     }
718     CharString inputDirBuf;
719     CharString openFileName;
720     if(inputDir == NULL) {
721         const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR);
722         if (filenameBegin != NULL) {
723             /*
724              * When a filename ../../../data/root.txt is specified,
725              * we presume that the input directory is ../../../data
726              * This is very important when the resource file includes
727              * another file, like UCARules.txt or thaidict.brk.
728              */
729             StringPiece dir = filename.toStringPiece();
730             const char *filenameLimit = filename.data() + filename.length();
731             dir.remove_suffix((int32_t)(filenameLimit - filenameBegin));
732             inputDirBuf.append(dir, errorCode);
733             inputDir = inputDirBuf.data();
734         }
735     }else{
736         int32_t dirlen  = (int32_t)uprv_strlen(inputDir);
737 
738         if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) {
739             /*
740              * append the input dir to openFileName if the first char in
741              * filename is not file separator char and the last char input directory is  not '.'.
742              * This is to support :
743              * genrb -s. /home/icu/data
744              * genrb -s. icu/data
745              * The user cannot mix notations like
746              * genrb -s. /icu/data --- the absolute path specified. -s redundant
747              * user should use
748              * genrb -s. icu/data  --- start from CWD and look in icu/data dir
749              */
750             openFileName.append(inputDir, dirlen, errorCode);
751             if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
752                 openFileName.append(U_FILE_SEP_CHAR, errorCode);
753             }
754         }
755     }
756     openFileName.append(filename, errorCode);
757     if(U_FAILURE(errorCode)) {
758         return;
759     }
760     // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data());
761     const char* cp = "";
762     LocalUCHARBUFPointer ucbuf(
763             ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode));
764     if(errorCode == U_FILE_ACCESS_ERROR) {
765         fprintf(stderr, "couldn't open file %s\n", openFileName.data());
766         return;
767     }
768     if (ucbuf.isNull() || U_FAILURE(errorCode)) {
769         fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode));
770         return;
771     }
772 
773     /* Parse the data into an SRBRoot */
774     LocalPointer<SRBRoot> data(
775             parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode));
776     if (U_FAILURE(errorCode)) {
777         return;
778     }
779 
780     struct SResource *root = data->fRoot;
781     struct SResource *collations = resLookup(root, "collations");
782     if (collations != NULL) {
783       struct SResource *collation = resLookup(collations, collationType);
784       if (collation != NULL) {
785         struct SResource *sequence = resLookup(collation, "Sequence");
786         if (sequence != NULL && sequence->isString()) {
787           // No string pointer aliasing so that we need not hold onto the resource bundle.
788           StringResource *sr = static_cast<StringResource *>(sequence);
789           rules = sr->fString;
790         }
791       }
792     }
793 }
794 
795 // Quick-and-dirty escaping function.
796 // Assumes that we are on an ASCII-based platform.
797 static void
escape(const UChar * s,char * buffer)798 escape(const UChar *s, char *buffer) {
799     int32_t length = u_strlen(s);
800     int32_t i = 0;
801     for (;;) {
802         UChar32 c;
803         U16_NEXT(s, i, length, c);
804         if (c == 0) {
805             *buffer = 0;
806             return;
807         } else if (0x20 <= c && c <= 0x7e) {
808             // printable ASCII
809             *buffer++ = (char)c;  // assumes ASCII-based platform
810         } else {
811             buffer += sprintf(buffer, "\\u%04X", (int)c);
812         }
813     }
814 }
815 
816 }  // namespace
817 
818 static FILE*
openTOML(const char * outputdir,const char * name,const char * collationType,const char * structType,UErrorCode * status)819 openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) {
820     CharString baseName;
821     baseName.append(name, *status);
822     baseName.append("_", *status);
823     baseName.append(collationType, *status);
824     baseName.append("_", *status);
825     baseName.append(structType, *status);
826 
827     CharString outFileName;
828     if (outputdir && *outputdir) {
829         outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status);
830     }
831     outFileName.append(baseName, *status);
832     outFileName.append(".toml", *status);
833     if (U_FAILURE(*status)) {
834         return NULL;
835     }
836 
837     FILE* f = fopen(outFileName.data(), "w");
838     if (!f) {
839         *status = U_FILE_ACCESS_ERROR;
840         return NULL;
841     }
842     usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X");
843 
844     return f;
845 }
846 
847 static void
writeCollationMetadataTOML(const char * outputdir,const char * name,const char * collationType,const uint32_t metadataBits,UErrorCode * status)848 writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) {
849     FILE* f = openTOML(outputdir, name, collationType, "meta", status);
850     if (!f) {
851         return;
852     }
853     // printf("writeCollationMetadataTOML %s %s\n", name, collationType);
854     fprintf(f, "bits = 0x%X\n", metadataBits);
855     fclose(f);
856 }
857 
858 static UChar32
writeCollationDiacriticsTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UErrorCode * status)859 writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
860     UChar32 limit = ICU4X_DIACRITIC_LIMIT;
861     FILE* f = openTOML(outputdir, name, collationType, "dia", status);
862     if (!f) {
863         return limit;
864     }
865     // printf("writeCollationDiacriticsTOML %s %s\n", name, collationType);
866     uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE];
867     for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
868         uint16_t secondary = 0;
869         uint32_t ce32 = data->getCE32(c);
870         if (ce32 == icu::Collation::FALLBACK_CE32) {
871             ce32 = data->base->getCE32(c);
872         }
873         if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
874             // These never occur in NFD data
875         } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) {
876             if (uprv_strcmp(name, "root") == 0) {
877                 printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c);
878                 fclose(f);
879                 *status = U_INTERNAL_PROGRAM_ERROR;
880                 return limit;
881             }
882             limit = c;
883             break;
884         } else {
885             uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32));
886             if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) {
887                 // Not a CE where only the secondary weight differs from the expected
888                 // pattern.
889                 limit = c;
890                 break;
891             }
892             secondary = uint16_t(ce >> 16);
893         }
894         secondaries[c - ICU4X_DIACRITIC_BASE] = secondary;
895 
896     }
897     usrc_writeArray(f, "secondaries = [\n  ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, "  ", "\n]\n");
898     fclose(f);
899     return limit;
900 }
901 
902 static void
writeCollationReorderingTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationSettings * settings,UErrorCode * status)903 writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) {
904     FILE* f = openTOML(outputdir, name, collationType, "reord", status);
905     if (!f) {
906         return;
907     }
908     // printf("writeCollationReorderingTOML %s %s\n", name, collationType);
909     fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder);
910     usrc_writeArray(f, "reorder_table = [\n  ", settings->reorderTable, 8, 256, "  ", "\n]\n");
911     usrc_writeArray(f, "reorder_ranges = [\n  ", settings->reorderRanges, 32, settings->reorderRangesLength, "  ", "\n]\n");
912     fclose(f);
913 }
914 
915 
916 static void
writeCollationJamoTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UErrorCode * status)917 writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
918     FILE* f = openTOML(outputdir, name, collationType, "jamo", status);
919     if (!f) {
920         printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType);
921         return;
922     }
923     uint32_t jamo[0x1200-0x1100];
924     for (UChar32 c = 0x1100; c < 0x1200; ++c) {
925         uint32_t ce32 = data->getCE32(c);
926         if (ce32 == icu::Collation::FALLBACK_CE32) {
927             ce32 = data->base->getCE32(c);
928         }
929         // Can't reject complex CE32s, because search collations have expansions.
930         // These expansions refer to the tailoring, which foils the reuse of the
931         // these jamo tables.
932         // XXX Figure out what to do. Perhaps instead of having Latin mini expansions,
933         // there should be Hangul mini expansions.
934         // XXX in any case, validate that modern jamo are self-contained.
935         jamo[c - 0x1100] = ce32;
936 
937     }
938     usrc_writeArray(f, "ce32s = [\n  ", jamo, 32, 0x1200-0x1100, "  ", "\n]\n");
939     fclose(f);
940 }
941 
942 static UBool
convertTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)943 convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) {
944     if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) {
945         // Range entirely in conjoining jamo block.
946         return true;
947     }
948     icu::IcuToolErrorCode status("genrb: convertTrie");
949     umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status);
950     return !U_FAILURE(*status);
951 }
952 
953 static void
writeCollationDataTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UBool root,UChar32 diacriticLimit,UErrorCode * status)954 writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) {
955     FILE* f = openTOML(outputdir, name, collationType, "data", status);
956     if (!f) {
957         return;
958     }
959     // printf("writeCollationDataTOML %s %s\n", name, collationType);
960 
961     icu::UnicodeSet tailoringSet;
962 
963     if (data->base) {
964         tailoringSet.addAll(*(data->unsafeBackwardSet));
965         tailoringSet.removeAll(*(data->base->unsafeBackwardSet));
966     } else {
967         tailoringSet.addAll(*(data->unsafeBackwardSet));
968     }
969 
970     // Use the same value for out-of-range and default in the hope of not having to allocate
971     // different blocks, since ICU4X never does out-of-range queries.
972     uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
973     icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
974 
975     utrie2_enum(data->trie, NULL, &convertTrie, builder.getAlias());
976 
977     // If the diacritic table was cut short, copy CE32s between the lowered
978     // limit and the max limit from the root to the tailoring. As of June 2022,
979     // no collation in CLDR needs this.
980     for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) {
981         if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
982             // These never occur in NFD data.
983             continue;
984         }
985         uint32_t ce32 = data->getCE32(c);
986         if (ce32 == icu::Collation::FALLBACK_CE32) {
987             ce32 = data->base->getCE32(c);
988             umutablecptrie_set(builder.getAlias(), c, ce32, status);
989         }
990     }
991 
992     // Ensure that the range covered by the diacritic table isn't duplicated
993     // in the trie.
994     for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) {
995         if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
996             umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
997         }
998     }
999 
1000     icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1001     builder.getAlias(),
1002     UCPTRIE_TYPE_SMALL,
1003     UCPTRIE_VALUE_BITS_32,
1004     status));
1005     usrc_writeArray(f, "contexts = [\n  ", data->contexts, 16, data->contextsLength, "  ", "\n]\n");
1006     usrc_writeArray(f, "ce32s = [\n  ", data->ce32s, 32, data->ce32sLength, "  ", "\n]\n");
1007     usrc_writeArray(f, "ces = [\n  ", data->ces, 64, data->cesLength, "  ", "\n]\n");
1008     fprintf(f, "[trie]\n");
1009     usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1010 
1011     fclose(f);
1012 }
1013 
1014 static void
writeCollationSpecialPrimariesTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UErrorCode * status)1015 writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
1016     FILE* f = openTOML(outputdir, name, collationType, "prim", status);
1017     if (!f) {
1018         return;
1019     }
1020     // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType);
1021 
1022     uint16_t lastPrimaries[4];
1023     for (int32_t i = 0; i < 4; ++i) {
1024         // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one
1025         // back to get a value that fits in 16 bits.
1026         lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
1027     }
1028 
1029     uint32_t numericPrimary = data->numericPrimary;
1030     if (numericPrimary & 0xFFFFFF) {
1031         printf("Lower 24 bits set in numeric primary");
1032         *status = U_INTERNAL_PROGRAM_ERROR;
1033         return;
1034     }
1035 
1036     usrc_writeArray(f, "last_primaries = [\n  ", lastPrimaries, 16, 4, "  ", "\n]\n");
1037     fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24);
1038     fclose(f);
1039 }
1040 
1041 static void
writeCollationTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,const icu::CollationSettings * settings,UErrorCode * status)1042 writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
1043     UBool tailored = false;
1044     UBool tailoredDiacritics = false;
1045     UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0);
1046     UBool reordering = false;
1047     UBool isRoot = uprv_strcmp(name, "root") == 0;
1048     UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT;
1049     if (!data->base && isRoot) {
1050         diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
1051         if (U_FAILURE(*status)) {
1052             return;
1053         }
1054         writeCollationJamoTOML(outputdir, name, collationType, data, status);
1055         if (U_FAILURE(*status)) {
1056             return;
1057         }
1058         writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status);
1059         if (U_FAILURE(*status)) {
1060             return;
1061         }
1062     } else if (data->base && !lithuanianDotAbove) {
1063         for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
1064             if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
1065                 // These never occur in NFD data.
1066                 continue;
1067             }
1068             uint32_t ce32 = data->getCE32(c);
1069             if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) {
1070                 tailoredDiacritics = true;
1071                 diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
1072                 if (U_FAILURE(*status)) {
1073                     return;
1074                 }
1075                 break;
1076             }
1077         }
1078     }
1079 
1080     if (settings->hasReordering()) {
1081         reordering = true;
1082         // Note: There are duplicate reorderings. Expecting the ICU4X provider
1083         // to take care of deduplication.
1084         writeCollationReorderingTOML(outputdir, name, collationType, settings, status);
1085         if (U_FAILURE(*status)) {
1086             return;
1087         }
1088     }
1089 
1090     // Write collation data if either base is non-null or the name is root.
1091     // Languages that only reorder scripts are otherwise root-like and have
1092     // null base.
1093     if (data->base || isRoot) {
1094         tailored = !isRoot;
1095         writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status);
1096         if (U_FAILURE(*status)) {
1097             return;
1098         }
1099     }
1100 
1101     uint32_t maxVariable = (uint32_t)settings->getMaxVariable();
1102     if (maxVariable >= 4) {
1103         printf("Max variable out of range");
1104         *status = U_INTERNAL_PROGRAM_ERROR;
1105         return;
1106     }
1107 
1108     uint32_t metadataBits = maxVariable;
1109     if (tailored) {
1110         metadataBits |= (1 << 3);
1111     }
1112     if (tailoredDiacritics) {
1113         metadataBits |= (1 << 4);
1114     }
1115     if (reordering) {
1116         metadataBits |= (1 << 5);
1117     }
1118     if (lithuanianDotAbove) {
1119         metadataBits |= (1 << 6);
1120     }
1121     if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) {
1122         metadataBits |= (1 << 7);
1123     }
1124     if (settings->getAlternateHandling() == UCOL_SHIFTED) {
1125         metadataBits |= (1 << 8);
1126     }
1127     switch (settings->getCaseFirst()) {
1128         case UCOL_OFF:
1129             break;
1130         case UCOL_UPPER_FIRST:
1131             metadataBits |= (1 << 9);
1132             metadataBits |= (1 << 10);
1133             break;
1134         case UCOL_LOWER_FIRST:
1135             metadataBits |= (1 << 9);
1136             break;
1137         default:
1138             *status = U_INTERNAL_PROGRAM_ERROR;
1139             return;
1140     }
1141 
1142     writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status);
1143 }
1144 
1145 #endif  // !UCONFIG_NO_COLLATION
1146 
1147 static TableResource *
addCollation(ParseState * state,TableResource * result,const char * collationType,uint32_t startline,UErrorCode * status)1148 addCollation(ParseState* state, TableResource  *result, const char *collationType,
1149              uint32_t startline, UErrorCode *status)
1150 {
1151     // TODO: Use LocalPointer for result, or make caller close it when there is a failure.
1152     struct SResource  *member = NULL;
1153     struct UString    *tokenValue;
1154     struct UString     comment;
1155     enum   ETokenType  token;
1156     char               subtag[1024];
1157     UnicodeString      rules;
1158     UBool              haveRules = false;
1159     UVersionInfo       version;
1160     uint32_t           line;
1161 
1162     /* '{' . (name resource)* '}' */
1163     version[0]=0; version[1]=0; version[2]=0; version[3]=0;
1164 
1165     for (;;)
1166     {
1167         ustr_init(&comment);
1168         token = getToken(state, &tokenValue, &comment, &line, status);
1169 
1170         if (token == TOK_CLOSE_BRACE)
1171         {
1172             break;
1173         }
1174 
1175         if (token != TOK_STRING)
1176         {
1177             res_close(result);
1178             *status = U_INVALID_FORMAT_ERROR;
1179 
1180             if (token == TOK_EOF)
1181             {
1182                 error(startline, "unterminated table");
1183             }
1184             else
1185             {
1186                 error(line, "Unexpected token %s", tokenNames[token]);
1187             }
1188 
1189             return NULL;
1190         }
1191 
1192         u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1193 
1194         if (U_FAILURE(*status))
1195         {
1196             res_close(result);
1197             return NULL;
1198         }
1199 
1200         member = parseResource(state, subtag, NULL, status);
1201 
1202         if (U_FAILURE(*status))
1203         {
1204             res_close(result);
1205             return NULL;
1206         }
1207         if (result == NULL)
1208         {
1209             // Ignore the parsed resources, continue parsing.
1210         }
1211         else if (uprv_strcmp(subtag, "Version") == 0 && member->isString())
1212         {
1213             StringResource *sr = static_cast<StringResource *>(member);
1214             char     ver[40];
1215             int32_t length = sr->length();
1216 
1217             if (length >= UPRV_LENGTHOF(ver))
1218             {
1219                 length = UPRV_LENGTHOF(ver) - 1;
1220             }
1221 
1222             sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV);
1223             u_versionFromString(version, ver);
1224 
1225             result->add(member, line, *status);
1226             member = NULL;
1227         }
1228         else if(uprv_strcmp(subtag, "%%CollationBin")==0)
1229         {
1230             /* discard duplicate %%CollationBin if any*/
1231         }
1232         else if (uprv_strcmp(subtag, "Sequence") == 0 && member->isString())
1233         {
1234             StringResource *sr = static_cast<StringResource *>(member);
1235             rules = sr->fString;
1236             haveRules = true;
1237             // Defer building the collator until we have seen
1238             // all sub-elements of the collation table, including the Version.
1239             /* in order to achieve smaller data files, we can direct genrb */
1240             /* to omit collation rules */
1241             if(!state->omitCollationRules) {
1242                 result->add(member, line, *status);
1243                 member = NULL;
1244             }
1245         }
1246         else  // Just copy non-special items.
1247         {
1248             result->add(member, line, *status);
1249             member = NULL;
1250         }
1251         res_close(member);  // TODO: use LocalPointer
1252         if (U_FAILURE(*status))
1253         {
1254             res_close(result);
1255             return NULL;
1256         }
1257     }
1258 
1259     if (!haveRules) { return result; }
1260 
1261 #if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO
1262     warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h");
1263     (void)collationType;
1264 #else
1265     // CLDR ticket #3949, ICU ticket #8082:
1266     // Do not build collation binary data for for-import-only "private" collation rule strings.
1267     if (uprv_strncmp(collationType, "private-", 8) == 0) {
1268         if(isVerbose()) {
1269             printf("Not building %s~%s collation binary\n", state->filename, collationType);
1270         }
1271         return result;
1272     }
1273 
1274     if(!state->makeBinaryCollation) {
1275         if(isVerbose()) {
1276             printf("Not building %s~%s collation binary\n", state->filename, collationType);
1277         }
1278         return result;
1279     }
1280     UErrorCode intStatus = U_ZERO_ERROR;
1281     UParseError parseError;
1282     uprv_memset(&parseError, 0, sizeof(parseError));
1283     GenrbImporter importer(state->inputdir, state->outputdir);
1284     const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus);
1285     if(U_FAILURE(intStatus)) {
1286         error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus));
1287         res_close(result);
1288         return NULL;  // TODO: use LocalUResourceBundlePointer for result
1289     }
1290     icu::CollationBuilder builder(base, state->icu4xMode, intStatus);
1291     if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) {
1292         builder.disableFastLatin();  // build fast-Latin table unless search collator or ICU4X
1293     }
1294     LocalPointer<icu::CollationTailoring> t(
1295             builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
1296     if(U_FAILURE(intStatus)) {
1297         const char *reason = builder.getErrorReason();
1298         if(reason == NULL) { reason = ""; }
1299         error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s  %s",
1300                 state->filename, collationType,
1301                 (long)parseError.offset, u_errorName(intStatus), reason);
1302         if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1303             // Print pre- and post-context.
1304             char preBuffer[100], postBuffer[100];
1305             escape(parseError.preContext, preBuffer);
1306             escape(parseError.postContext, postBuffer);
1307             error(line, "  error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer);
1308         }
1309         if(isStrict() || t.isNull()) {
1310             *status = intStatus;
1311             res_close(result);
1312             return NULL;
1313         }
1314     }
1315     if (state->icu4xMode) {
1316         char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1));
1317         if (nameWithoutSuffix == NULL) {
1318             *status = U_MEMORY_ALLOCATION_ERROR;
1319             res_close(result);
1320             return NULL;
1321         }
1322         uprv_strcpy(nameWithoutSuffix, state->filename);
1323         *uprv_strrchr(nameWithoutSuffix, '.') = 0;
1324 
1325         writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status);
1326         uprv_free(nameWithoutSuffix);
1327     }
1328     icu::LocalMemory<uint8_t> buffer;
1329     int32_t capacity = 100000;
1330     uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
1331     if(dest == NULL) {
1332         fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1333                 (long)capacity);
1334         *status = U_MEMORY_ALLOCATION_ERROR;
1335         res_close(result);
1336         return NULL;
1337     }
1338     int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1];
1339     int32_t totalSize = icu::CollationDataWriter::writeTailoring(
1340             *t, *t->settings, indexes, dest, capacity, intStatus);
1341     if(intStatus == U_BUFFER_OVERFLOW_ERROR) {
1342         intStatus = U_ZERO_ERROR;
1343         capacity = totalSize;
1344         dest = buffer.allocateInsteadAndCopy(capacity);
1345         if(dest == NULL) {
1346             fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1347                     (long)capacity);
1348             *status = U_MEMORY_ALLOCATION_ERROR;
1349             res_close(result);
1350             return NULL;
1351         }
1352         totalSize = icu::CollationDataWriter::writeTailoring(
1353                 *t, *t->settings, indexes, dest, capacity, intStatus);
1354     }
1355     if(U_FAILURE(intStatus)) {
1356         fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n",
1357                 u_errorName(intStatus));
1358         res_close(result);
1359         return NULL;
1360     }
1361     if(isVerbose()) {
1362         printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1363         icu::CollationInfo::printSizes(totalSize, indexes);
1364         if(t->settings->hasReordering()) {
1365             printf("%s~%s collation reordering ranges:\n", state->filename, collationType);
1366             icu::CollationInfo::printReorderRanges(
1367                     *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength);
1368         }
1369 #if 0  // debugging output
1370     } else {
1371         printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1372         icu::CollationInfo::printSizes(totalSize, indexes);
1373 #endif
1374     }
1375     struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, NULL, NULL, status);
1376     result->add(collationBin, line, *status);
1377     if (U_FAILURE(*status)) {
1378         res_close(result);
1379         return NULL;
1380     }
1381 #endif
1382     return result;
1383 }
1384 
1385 static UBool
keepCollationType(const char *)1386 keepCollationType(const char * /*type*/) {
1387     return true;
1388 }
1389 
1390 static struct SResource *
parseCollationElements(ParseState * state,char * tag,uint32_t startline,UBool newCollation,UErrorCode * status)1391 parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status)
1392 {
1393     TableResource  *result = NULL;
1394     struct SResource  *member = NULL;
1395     struct UString    *tokenValue;
1396     struct UString     comment;
1397     enum   ETokenType  token;
1398     char               subtag[1024], typeKeyword[1024];
1399     uint32_t           line;
1400 
1401     result = table_open(state->bundle, tag, NULL, status);
1402 
1403     if (result == NULL || U_FAILURE(*status))
1404     {
1405         return NULL;
1406     }
1407     if(isVerbose()){
1408         printf(" collation elements %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
1409     }
1410     if(!newCollation) {
1411         return addCollation(state, result, "(no type)", startline, status);
1412     }
1413     else {
1414         for(;;) {
1415             ustr_init(&comment);
1416             token = getToken(state, &tokenValue, &comment, &line, status);
1417 
1418             if (token == TOK_CLOSE_BRACE)
1419             {
1420                 return result;
1421             }
1422 
1423             if (token != TOK_STRING)
1424             {
1425                 res_close(result);
1426                 *status = U_INVALID_FORMAT_ERROR;
1427 
1428                 if (token == TOK_EOF)
1429                 {
1430                     error(startline, "unterminated table");
1431                 }
1432                 else
1433                 {
1434                     error(line, "Unexpected token %s", tokenNames[token]);
1435                 }
1436 
1437                 return NULL;
1438             }
1439 
1440             u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1441 
1442             if (U_FAILURE(*status))
1443             {
1444                 res_close(result);
1445                 return NULL;
1446             }
1447 
1448             if (uprv_strcmp(subtag, "default") == 0)
1449             {
1450                 member = parseResource(state, subtag, NULL, status);
1451 
1452                 if (U_FAILURE(*status))
1453                 {
1454                     res_close(result);
1455                     return NULL;
1456                 }
1457 
1458                 result->add(member, line, *status);
1459             }
1460             else
1461             {
1462                 token = peekToken(state, 0, &tokenValue, &line, &comment, status);
1463                 /* this probably needs to be refactored or recursively use the parser */
1464                 /* first we assume that our collation table won't have the explicit type */
1465                 /* then, we cannot handle aliases */
1466                 if(token == TOK_OPEN_BRACE) {
1467                     token = getToken(state, &tokenValue, &comment, &line, status);
1468                     TableResource *collationRes;
1469                     if (keepCollationType(subtag)) {
1470                         collationRes = table_open(state->bundle, subtag, NULL, status);
1471                     } else {
1472                         collationRes = NULL;
1473                     }
1474                     // need to parse the collation data regardless
1475                     collationRes = addCollation(state, collationRes, subtag, startline, status);
1476                     if (collationRes != NULL) {
1477                         result->add(collationRes, startline, *status);
1478                     }
1479                 } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */
1480                     /* we could have a table too */
1481                     token = peekToken(state, 1, &tokenValue, &line, &comment, status);
1482                     u_UCharsToChars(tokenValue->fChars, typeKeyword, u_strlen(tokenValue->fChars) + 1);
1483                     if(uprv_strcmp(typeKeyword, "alias") == 0) {
1484                         member = parseResource(state, subtag, NULL, status);
1485                         if (U_FAILURE(*status))
1486                         {
1487                             res_close(result);
1488                             return NULL;
1489                         }
1490 
1491                         result->add(member, line, *status);
1492                     } else {
1493                         res_close(result);
1494                         *status = U_INVALID_FORMAT_ERROR;
1495                         return NULL;
1496                     }
1497                 } else {
1498                     res_close(result);
1499                     *status = U_INVALID_FORMAT_ERROR;
1500                     return NULL;
1501                 }
1502             }
1503 
1504             /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/
1505 
1506             /*expect(TOK_CLOSE_BRACE, NULL, NULL, status);*/
1507 
1508             if (U_FAILURE(*status))
1509             {
1510                 res_close(result);
1511                 return NULL;
1512             }
1513         }
1514     }
1515 }
1516 
1517 /* Necessary, because CollationElements requires the bundle->fRoot member to be present which,
1518    if this weren't special-cased, wouldn't be set until the entire file had been processed. */
1519 static struct SResource *
realParseTable(ParseState * state,TableResource * table,char * tag,uint32_t startline,UErrorCode * status)1520 realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status)
1521 {
1522     struct SResource  *member = NULL;
1523     struct UString    *tokenValue=NULL;
1524     struct UString    comment;
1525     enum   ETokenType token;
1526     char              subtag[1024];
1527     uint32_t          line;
1528     UBool             readToken = false;
1529 
1530     /* '{' . (name resource)* '}' */
1531 
1532     if(isVerbose()){
1533         printf(" parsing table %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1534     }
1535     for (;;)
1536     {
1537         ustr_init(&comment);
1538         token = getToken(state, &tokenValue, &comment, &line, status);
1539 
1540         if (token == TOK_CLOSE_BRACE)
1541         {
1542             if (!readToken && isVerbose()) {
1543                 warning(startline, "Encountered empty table");
1544             }
1545             return table;
1546         }
1547 
1548         if (token != TOK_STRING)
1549         {
1550             *status = U_INVALID_FORMAT_ERROR;
1551 
1552             if (token == TOK_EOF)
1553             {
1554                 error(startline, "unterminated table");
1555             }
1556             else
1557             {
1558                 error(line, "unexpected token %s", tokenNames[token]);
1559             }
1560 
1561             return NULL;
1562         }
1563 
1564         if(uprv_isInvariantUString(tokenValue->fChars, -1)) {
1565             u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1566         } else {
1567             *status = U_INVALID_FORMAT_ERROR;
1568             error(line, "invariant characters required for table keys");
1569             return NULL;
1570         }
1571 
1572         if (U_FAILURE(*status))
1573         {
1574             error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status));
1575             return NULL;
1576         }
1577 
1578         member = parseResource(state, subtag, &comment, status);
1579 
1580         if (member == NULL || U_FAILURE(*status))
1581         {
1582             error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status));
1583             return NULL;
1584         }
1585 
1586         table->add(member, line, *status);
1587 
1588         if (U_FAILURE(*status))
1589         {
1590             error(line, "parse error. Stopped parsing table with %s", u_errorName(*status));
1591             return NULL;
1592         }
1593         readToken = true;
1594         ustr_deinit(&comment);
1595    }
1596 
1597     /* not reached */
1598     /* A compiler warning will appear if all paths don't contain a return statement. */
1599 /*     *status = U_INTERNAL_PROGRAM_ERROR;
1600      return NULL;*/
1601 }
1602 
1603 static struct SResource *
parseTable(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1604 parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1605 {
1606     if (tag != NULL && uprv_strcmp(tag, "CollationElements") == 0)
1607     {
1608         return parseCollationElements(state, tag, startline, false, status);
1609     }
1610     if (tag != NULL && uprv_strcmp(tag, "collations") == 0)
1611     {
1612         return parseCollationElements(state, tag, startline, true, status);
1613     }
1614     if(isVerbose()){
1615         printf(" table %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
1616     }
1617 
1618     TableResource *result = table_open(state->bundle, tag, comment, status);
1619 
1620     if (result == NULL || U_FAILURE(*status))
1621     {
1622         return NULL;
1623     }
1624     return realParseTable(state, result, tag, startline,  status);
1625 }
1626 
1627 static struct SResource *
parseArray(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1628 parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1629 {
1630     struct SResource  *member = NULL;
1631     struct UString    *tokenValue;
1632     struct UString    memberComments;
1633     enum   ETokenType token;
1634     UBool             readToken = false;
1635 
1636     ArrayResource  *result = array_open(state->bundle, tag, comment, status);
1637 
1638     if (result == NULL || U_FAILURE(*status))
1639     {
1640         return NULL;
1641     }
1642     if(isVerbose()){
1643         printf(" array %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
1644     }
1645 
1646     ustr_init(&memberComments);
1647 
1648     /* '{' . resource [','] '}' */
1649     for (;;)
1650     {
1651         /* reset length */
1652         ustr_setlen(&memberComments, 0, status);
1653 
1654         /* check for end of array, but don't consume next token unless it really is the end */
1655         token = peekToken(state, 0, &tokenValue, NULL, &memberComments, status);
1656 
1657 
1658         if (token == TOK_CLOSE_BRACE)
1659         {
1660             getToken(state, NULL, NULL, NULL, status);
1661             if (!readToken) {
1662                 warning(startline, "Encountered empty array");
1663             }
1664             break;
1665         }
1666 
1667         if (token == TOK_EOF)
1668         {
1669             res_close(result);
1670             *status = U_INVALID_FORMAT_ERROR;
1671             error(startline, "unterminated array");
1672             return NULL;
1673         }
1674 
1675         /* string arrays are a special case */
1676         if (token == TOK_STRING)
1677         {
1678             getToken(state, &tokenValue, &memberComments, NULL, status);
1679             member = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, &memberComments, status);
1680         }
1681         else
1682         {
1683             member = parseResource(state, NULL, &memberComments, status);
1684         }
1685 
1686         if (member == NULL || U_FAILURE(*status))
1687         {
1688             res_close(result);
1689             return NULL;
1690         }
1691 
1692         result->add(member);
1693 
1694         /* eat optional comma if present */
1695         token = peekToken(state, 0, NULL, NULL, NULL, status);
1696 
1697         if (token == TOK_COMMA)
1698         {
1699             getToken(state, NULL, NULL, NULL, status);
1700         }
1701 
1702         if (U_FAILURE(*status))
1703         {
1704             res_close(result);
1705             return NULL;
1706         }
1707         readToken = true;
1708     }
1709 
1710     ustr_deinit(&memberComments);
1711     return result;
1712 }
1713 
1714 static struct SResource *
parseIntVector(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1715 parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1716 {
1717     enum   ETokenType  token;
1718     char              *string;
1719     int32_t            value;
1720     UBool              readToken = false;
1721     char              *stopstring;
1722     struct UString     memberComments;
1723 
1724     IntVectorResource *result = intvector_open(state->bundle, tag, comment, status);
1725 
1726     if (result == NULL || U_FAILURE(*status))
1727     {
1728         return NULL;
1729     }
1730 
1731     if(isVerbose()){
1732         printf(" vector %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
1733     }
1734     ustr_init(&memberComments);
1735     /* '{' . string [','] '}' */
1736     for (;;)
1737     {
1738         ustr_setlen(&memberComments, 0, status);
1739 
1740         /* check for end of array, but don't consume next token unless it really is the end */
1741         token = peekToken(state, 0, NULL, NULL,&memberComments, status);
1742 
1743         if (token == TOK_CLOSE_BRACE)
1744         {
1745             /* it's the end, consume the close brace */
1746             getToken(state, NULL, NULL, NULL, status);
1747             if (!readToken) {
1748                 warning(startline, "Encountered empty int vector");
1749             }
1750             ustr_deinit(&memberComments);
1751             return result;
1752         }
1753 
1754         int32_t stringLength;
1755         string = getInvariantString(state, NULL, NULL, stringLength, status);
1756 
1757         if (U_FAILURE(*status))
1758         {
1759             res_close(result);
1760             return NULL;
1761         }
1762 
1763         /* For handling illegal char in the Intvector */
1764         value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/
1765         int32_t len = (int32_t)(stopstring-string);
1766 
1767         if(len==stringLength)
1768         {
1769             result->add(value, *status);
1770             uprv_free(string);
1771             token = peekToken(state, 0, NULL, NULL, NULL, status);
1772         }
1773         else
1774         {
1775             uprv_free(string);
1776             *status=U_INVALID_CHAR_FOUND;
1777         }
1778 
1779         if (U_FAILURE(*status))
1780         {
1781             res_close(result);
1782             return NULL;
1783         }
1784 
1785         /* the comma is optional (even though it is required to prevent the reader from concatenating
1786         consecutive entries) so that a missing comma on the last entry isn't an error */
1787         if (token == TOK_COMMA)
1788         {
1789             getToken(state, NULL, NULL, NULL, status);
1790         }
1791         readToken = true;
1792     }
1793 
1794     /* not reached */
1795     /* A compiler warning will appear if all paths don't contain a return statement. */
1796 /*    intvector_close(result, status);
1797     *status = U_INTERNAL_PROGRAM_ERROR;
1798     return NULL;*/
1799 }
1800 
1801 static struct SResource *
parseBinary(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1802 parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1803 {
1804     uint32_t line;
1805     int32_t stringLength;
1806     LocalMemory<char> string(getInvariantString(state, &line, NULL, stringLength, status));
1807     if (string.isNull() || U_FAILURE(*status))
1808     {
1809         return NULL;
1810     }
1811 
1812     expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1813     if (U_FAILURE(*status))
1814     {
1815         return NULL;
1816     }
1817 
1818     if(isVerbose()){
1819         printf(" binary %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
1820     }
1821 
1822     LocalMemory<uint8_t> value;
1823     int32_t count = 0;
1824     if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == NULL)
1825     {
1826         *status = U_MEMORY_ALLOCATION_ERROR;
1827         return NULL;
1828     }
1829 
1830     char toConv[3] = {'\0', '\0', '\0'};
1831     for (int32_t i = 0; i < stringLength;)
1832     {
1833         // Skip spaces (which may have been line endings).
1834         char c0 = string[i++];
1835         if (c0 == ' ') { continue; }
1836         if (i == stringLength) {
1837             *status=U_INVALID_CHAR_FOUND;
1838             error(line, "Encountered invalid binary value (odd number of hex digits)");
1839             return NULL;
1840         }
1841         toConv[0] = c0;
1842         toConv[1] = string[i++];
1843 
1844         char *stopstring;
1845         value[count++] = (uint8_t) uprv_strtoul(toConv, &stopstring, 16);
1846         uint32_t len=(uint32_t)(stopstring-toConv);
1847 
1848         if(len!=2)
1849         {
1850             *status=U_INVALID_CHAR_FOUND;
1851             error(line, "Encountered invalid binary value (not all pairs of hex digits)");
1852             return NULL;
1853         }
1854     }
1855 
1856     if (count == 0) {
1857         warning(startline, "Encountered empty binary value");
1858         return bin_open(state->bundle, tag, 0, NULL, "", comment, status);
1859     } else {
1860         return bin_open(state->bundle, tag, count, value.getAlias(), NULL, comment, status);
1861     }
1862 }
1863 
1864 static struct SResource *
parseInteger(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1865 parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1866 {
1867     struct SResource *result = NULL;
1868     int32_t           value;
1869     char             *string;
1870     char             *stopstring;
1871 
1872     int32_t stringLength;
1873     string = getInvariantString(state, NULL, NULL, stringLength, status);
1874 
1875     if (string == NULL || U_FAILURE(*status))
1876     {
1877         return NULL;
1878     }
1879 
1880     expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1881 
1882     if (U_FAILURE(*status))
1883     {
1884         uprv_free(string);
1885         return NULL;
1886     }
1887 
1888     if(isVerbose()){
1889         printf(" integer %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
1890     }
1891 
1892     if (stringLength == 0)
1893     {
1894         warning(startline, "Encountered empty integer. Default value is 0.");
1895     }
1896 
1897     /* Allow integer support for hexdecimal, octal digit and decimal*/
1898     /* and handle illegal char in the integer*/
1899     value = uprv_strtoul(string, &stopstring, 0);
1900     int32_t len = (int32_t)(stopstring-string);
1901     if(len==stringLength)
1902     {
1903         result = int_open(state->bundle, tag, value, comment, status);
1904     }
1905     else
1906     {
1907         *status=U_INVALID_CHAR_FOUND;
1908     }
1909     uprv_free(string);
1910 
1911     return result;
1912 }
1913 
1914 static struct SResource *
parseImport(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1915 parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1916 {
1917     uint32_t          line;
1918     int32_t stringLength;
1919     LocalMemory<char> filename(getInvariantString(state, &line, NULL, stringLength, status));
1920     if (U_FAILURE(*status))
1921     {
1922         return NULL;
1923     }
1924 
1925     expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1926 
1927     if (U_FAILURE(*status))
1928     {
1929         return NULL;
1930     }
1931 
1932     if(isVerbose()){
1933         printf(" import %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
1934     }
1935 
1936     /* Open the input file for reading */
1937     CharString fullname;
1938     if (state->inputdir != NULL) {
1939         fullname.append(state->inputdir, *status);
1940     }
1941     fullname.appendPathPart(filename.getAlias(), *status);
1942     if (U_FAILURE(*status)) {
1943         return NULL;
1944     }
1945 
1946     FileStream *file = T_FileStream_open(fullname.data(), "rb");
1947     if (file == NULL)
1948     {
1949         error(line, "couldn't open input file %s", filename.getAlias());
1950         *status = U_FILE_ACCESS_ERROR;
1951         return NULL;
1952     }
1953 
1954     int32_t len  = T_FileStream_size(file);
1955     LocalMemory<uint8_t> data;
1956     if(data.allocateInsteadAndCopy(len) == NULL)
1957     {
1958         *status = U_MEMORY_ALLOCATION_ERROR;
1959         T_FileStream_close (file);
1960         return NULL;
1961     }
1962 
1963     /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len);
1964     T_FileStream_close (file);
1965 
1966     return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status);
1967 }
1968 
1969 static struct SResource *
parseInclude(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1970 parseInclude(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1971 {
1972     struct SResource *result;
1973     int32_t           len=0;
1974     char             *filename;
1975     uint32_t          line;
1976     UChar *pTarget     = NULL;
1977 
1978     UCHARBUF *ucbuf;
1979     char     *fullname = NULL;
1980     const char* cp = NULL;
1981     const UChar* uBuffer = NULL;
1982 
1983     int32_t stringLength;
1984     filename = getInvariantString(state, &line, NULL, stringLength, status);
1985 
1986     if (U_FAILURE(*status))
1987     {
1988         return NULL;
1989     }
1990 
1991     expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1992 
1993     if (U_FAILURE(*status))
1994     {
1995         uprv_free(filename);
1996         return NULL;
1997     }
1998 
1999     if(isVerbose()){
2000         printf(" include %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
2001     }
2002 
2003     fullname = (char *) uprv_malloc(state->inputdirLength + stringLength + 2);
2004     /* test for NULL */
2005     if(fullname == NULL)
2006     {
2007         *status = U_MEMORY_ALLOCATION_ERROR;
2008         uprv_free(filename);
2009         return NULL;
2010     }
2011 
2012     if(state->inputdir!=NULL){
2013         if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
2014         {
2015 
2016             uprv_strcpy(fullname, state->inputdir);
2017 
2018             fullname[state->inputdirLength]      = U_FILE_SEP_CHAR;
2019             fullname[state->inputdirLength + 1] = '\0';
2020 
2021             uprv_strcat(fullname, filename);
2022         }
2023         else
2024         {
2025             uprv_strcpy(fullname, state->inputdir);
2026             uprv_strcat(fullname, filename);
2027         }
2028     }else{
2029         uprv_strcpy(fullname,filename);
2030     }
2031 
2032     ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),false,status);
2033 
2034     if (U_FAILURE(*status)) {
2035         error(line, "couldn't open input file %s\n", filename);
2036         return NULL;
2037     }
2038 
2039     uBuffer = ucbuf_getBuffer(ucbuf,&len,status);
2040     result = string_open(state->bundle, tag, uBuffer, len, comment, status);
2041 
2042     ucbuf_close(ucbuf);
2043 
2044     uprv_free(pTarget);
2045 
2046     uprv_free(filename);
2047     uprv_free(fullname);
2048 
2049     return result;
2050 }
2051 
2052 
2053 
2054 
2055 
2056 U_STRING_DECL(k_type_string,    "string",    6);
2057 U_STRING_DECL(k_type_binary,    "binary",    6);
2058 U_STRING_DECL(k_type_bin,       "bin",       3);
2059 U_STRING_DECL(k_type_table,     "table",     5);
2060 U_STRING_DECL(k_type_table_no_fallback,     "table(nofallback)",         17);
2061 U_STRING_DECL(k_type_int,       "int",       3);
2062 U_STRING_DECL(k_type_integer,   "integer",   7);
2063 U_STRING_DECL(k_type_array,     "array",     5);
2064 U_STRING_DECL(k_type_alias,     "alias",     5);
2065 U_STRING_DECL(k_type_intvector, "intvector", 9);
2066 U_STRING_DECL(k_type_import,    "import",    6);
2067 U_STRING_DECL(k_type_include,   "include",   7);
2068 
2069 /* Various non-standard processing plugins that create one or more special resources. */
2070 U_STRING_DECL(k_type_plugin_uca_rules,      "process(uca_rules)",        18);
2071 U_STRING_DECL(k_type_plugin_collation,      "process(collation)",        18);
2072 U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)",   23);
2073 U_STRING_DECL(k_type_plugin_dependency,     "process(dependency)",       19);
2074 
/*
 * Resource types known to the parser.
 * The values index gResourceTypes[], so the order of the two must stay in sync.
 */
typedef enum EResourceType
{
    RESTYPE_UNKNOWN = 0,            /* no explicit type; parseResource() works it out from lookahead */
    RESTYPE_STRING,
    RESTYPE_BINARY,
    RESTYPE_TABLE,
    RESTYPE_TABLE_NO_FALLBACK,      /* rejected by parseResource(); accepted for the top-level bundle */
    RESTYPE_INTEGER,
    RESTYPE_ARRAY,
    RESTYPE_ALIAS,
    RESTYPE_INTVECTOR,
    RESTYPE_IMPORT,
    RESTYPE_INCLUDE,
    RESTYPE_PROCESS_UCA_RULES,
    RESTYPE_PROCESS_COLLATION,
    RESTYPE_PROCESS_TRANSLITERATOR,
    RESTYPE_PROCESS_DEPENDENCY,
    RESTYPE_RESERVED                /* end marker; upper bound of the search in parseResourceType() */
} EResourceType;
2094 
2095 static struct {
2096     const char *nameChars;   /* only used for debugging */
2097     const UChar *nameUChars;
2098     ParseResourceFunction *parseFunction;
2099 } gResourceTypes[] = {
2100     {"Unknown", NULL, NULL},
2101     {"string", k_type_string, parseString},
2102     {"binary", k_type_binary, parseBinary},
2103     {"table", k_type_table, parseTable},
2104     {"table(nofallback)", k_type_table_no_fallback, NULL}, /* parseFunction will never be called */
2105     {"integer", k_type_integer, parseInteger},
2106     {"array", k_type_array, parseArray},
2107     {"alias", k_type_alias, parseAlias},
2108     {"intvector", k_type_intvector, parseIntVector},
2109     {"import", k_type_import, parseImport},
2110     {"include", k_type_include, parseInclude},
2111     {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules},
2112     {"process(collation)", k_type_plugin_collation, NULL /* not implemented yet */},
2113     {"process(transliterator)", k_type_plugin_transliterator, parseTransliterator},
2114     {"process(dependency)", k_type_plugin_dependency, parseDependency},
2115     {"reserved", NULL, NULL}
2116 };
2117 
initParser()2118 void initParser()
2119 {
2120     U_STRING_INIT(k_type_string,    "string",    6);
2121     U_STRING_INIT(k_type_binary,    "binary",    6);
2122     U_STRING_INIT(k_type_bin,       "bin",       3);
2123     U_STRING_INIT(k_type_table,     "table",     5);
2124     U_STRING_INIT(k_type_table_no_fallback,     "table(nofallback)",         17);
2125     U_STRING_INIT(k_type_int,       "int",       3);
2126     U_STRING_INIT(k_type_integer,   "integer",   7);
2127     U_STRING_INIT(k_type_array,     "array",     5);
2128     U_STRING_INIT(k_type_alias,     "alias",     5);
2129     U_STRING_INIT(k_type_intvector, "intvector", 9);
2130     U_STRING_INIT(k_type_import,    "import",    6);
2131     U_STRING_INIT(k_type_include,   "include",   7);
2132 
2133     U_STRING_INIT(k_type_plugin_uca_rules,      "process(uca_rules)",        18);
2134     U_STRING_INIT(k_type_plugin_collation,      "process(collation)",        18);
2135     U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)",   23);
2136     U_STRING_INIT(k_type_plugin_dependency,     "process(dependency)",       19);
2137 }
2138 
isTable(enum EResourceType type)2139 static inline UBool isTable(enum EResourceType type) {
2140     return (UBool)(type==RESTYPE_TABLE || type==RESTYPE_TABLE_NO_FALLBACK);
2141 }
2142 
2143 static enum EResourceType
parseResourceType(ParseState * state,UErrorCode * status)2144 parseResourceType(ParseState* state, UErrorCode *status)
2145 {
2146     struct UString        *tokenValue;
2147     struct UString        comment;
2148     enum   EResourceType  result = RESTYPE_UNKNOWN;
2149     uint32_t              line=0;
2150     ustr_init(&comment);
2151     expect(state, TOK_STRING, &tokenValue, &comment, &line, status);
2152 
2153     if (U_FAILURE(*status))
2154     {
2155         return RESTYPE_UNKNOWN;
2156     }
2157 
2158     *status = U_ZERO_ERROR;
2159 
2160     /* Search for normal types */
2161     result=RESTYPE_UNKNOWN;
2162     while ((result=(EResourceType)(result+1)) < RESTYPE_RESERVED) {
2163         if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) {
2164             break;
2165         }
2166     }
2167     /* Now search for the aliases */
2168     if (u_strcmp(tokenValue->fChars, k_type_int) == 0) {
2169         result = RESTYPE_INTEGER;
2170     }
2171     else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) {
2172         result = RESTYPE_BINARY;
2173     }
2174     else if (result == RESTYPE_RESERVED) {
2175         char tokenBuffer[1024];
2176         u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer));
2177         tokenBuffer[sizeof(tokenBuffer) - 1] = 0;
2178         *status = U_INVALID_FORMAT_ERROR;
2179         error(line, "unknown resource type '%s'", tokenBuffer);
2180     }
2181 
2182     return result;
2183 }
2184 
2185 /* parse a non-top-level resource */
2186 static struct SResource *
parseResource(ParseState * state,char * tag,const struct UString * comment,UErrorCode * status)2187 parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status)
2188 {
2189     enum   ETokenType      token;
2190     enum   EResourceType  resType = RESTYPE_UNKNOWN;
2191     ParseResourceFunction *parseFunction = NULL;
2192     struct UString        *tokenValue;
2193     uint32_t                 startline;
2194     uint32_t                 line;
2195 
2196 
2197     token = getToken(state, &tokenValue, NULL, &startline, status);
2198 
2199     if(isVerbose()){
2200         printf(" resource %s at line %i \n",  (tag == NULL) ? "(null)" : tag, (int)startline);
2201     }
2202 
2203     /* name . [ ':' type ] '{' resource '}' */
2204     /* This function parses from the colon onwards.  If the colon is present, parse the
2205     type then try to parse a resource of that type.  If there is no explicit type,
2206     work it out using the lookahead tokens. */
2207     switch (token)
2208     {
2209     case TOK_EOF:
2210         *status = U_INVALID_FORMAT_ERROR;
2211         error(startline, "Unexpected EOF encountered");
2212         return NULL;
2213 
2214     case TOK_ERROR:
2215         *status = U_INVALID_FORMAT_ERROR;
2216         return NULL;
2217 
2218     case TOK_COLON:
2219         resType = parseResourceType(state, status);
2220         expect(state, TOK_OPEN_BRACE, &tokenValue, NULL, &startline, status);
2221 
2222         if (U_FAILURE(*status))
2223         {
2224             return NULL;
2225         }
2226 
2227         break;
2228 
2229     case TOK_OPEN_BRACE:
2230         break;
2231 
2232     default:
2233         *status = U_INVALID_FORMAT_ERROR;
2234         error(startline, "syntax error while reading a resource, expected '{' or ':'");
2235         return NULL;
2236     }
2237 
2238 
2239     if (resType == RESTYPE_UNKNOWN)
2240     {
2241         /* No explicit type, so try to work it out.  At this point, we've read the first '{'.
2242         We could have any of the following:
2243         { {         => array (nested)
2244         { :/}       => array
2245         { string ,  => string array
2246 
2247         { string {  => table
2248 
2249         { string :/{    => table
2250         { string }      => string
2251         */
2252 
2253         token = peekToken(state, 0, NULL, &line, NULL,status);
2254 
2255         if (U_FAILURE(*status))
2256         {
2257             return NULL;
2258         }
2259 
2260         if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE )
2261         {
2262             resType = RESTYPE_ARRAY;
2263         }
2264         else if (token == TOK_STRING)
2265         {
2266             token = peekToken(state, 1, NULL, &line, NULL, status);
2267 
2268             if (U_FAILURE(*status))
2269             {
2270                 return NULL;
2271             }
2272 
2273             switch (token)
2274             {
2275             case TOK_COMMA:         resType = RESTYPE_ARRAY;  break;
2276             case TOK_OPEN_BRACE:    resType = RESTYPE_TABLE;  break;
2277             case TOK_CLOSE_BRACE:   resType = RESTYPE_STRING; break;
2278             case TOK_COLON:         resType = RESTYPE_TABLE;  break;
2279             default:
2280                 *status = U_INVALID_FORMAT_ERROR;
2281                 error(line, "Unexpected token after string, expected ',', '{' or '}'");
2282                 return NULL;
2283             }
2284         }
2285         else
2286         {
2287             *status = U_INVALID_FORMAT_ERROR;
2288             error(line, "Unexpected token after '{'");
2289             return NULL;
2290         }
2291 
2292         /* printf("Type guessed as %s\n", resourceNames[resType]); */
2293     } else if(resType == RESTYPE_TABLE_NO_FALLBACK) {
2294         *status = U_INVALID_FORMAT_ERROR;
2295         error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars);
2296         return NULL;
2297     }
2298 
2299 
2300     /* We should now know what we need to parse next, so call the appropriate parser
2301     function and return. */
2302     parseFunction = gResourceTypes[resType].parseFunction;
2303     if (parseFunction != NULL) {
2304         return parseFunction(state, tag, startline, comment, status);
2305     }
2306     else {
2307         *status = U_INTERNAL_PROGRAM_ERROR;
2308         error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars);
2309     }
2310 
2311     return NULL;
2312 }
2313 
2314 /* parse the top-level resource */
2315 struct SRBRoot *
parse(UCHARBUF * buf,const char * inputDir,const char * outputDir,const char * filename,UBool makeBinaryCollation,UBool omitCollationRules,UBool icu4xMode,UErrorCode * status)2316 parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
2317       UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status)
2318 {
2319     struct UString    *tokenValue;
2320     struct UString    comment;
2321     uint32_t           line;
2322     enum EResourceType bundleType;
2323     enum ETokenType    token;
2324     ParseState state;
2325     uint32_t i;
2326 
2327 
2328     for (i = 0; i < MAX_LOOKAHEAD + 1; i++)
2329     {
2330         ustr_init(&state.lookahead[i].value);
2331         ustr_init(&state.lookahead[i].comment);
2332     }
2333 
2334     initLookahead(&state, buf, status);
2335 
2336     state.inputdir       = inputDir;
2337     state.inputdirLength = (state.inputdir != NULL) ? (uint32_t)uprv_strlen(state.inputdir) : 0;
2338     state.outputdir       = outputDir;
2339     state.outputdirLength = (state.outputdir != NULL) ? (uint32_t)uprv_strlen(state.outputdir) : 0;
2340     state.filename = filename;
2341     state.makeBinaryCollation = makeBinaryCollation;
2342     state.omitCollationRules = omitCollationRules;
2343     state.icu4xMode = icu4xMode;
2344 
2345     ustr_init(&comment);
2346     expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status);
2347 
2348     state.bundle = new SRBRoot(&comment, false, *status);
2349 
2350     if (state.bundle == NULL || U_FAILURE(*status))
2351     {
2352         delete state.bundle;
2353 
2354         return NULL;
2355     }
2356 
2357 
2358     state.bundle->setLocale(tokenValue->fChars, *status);
2359 
2360     /* The following code is to make Empty bundle work no matter with :table specifer or not */
2361     token = getToken(&state, NULL, NULL, &line, status);
2362     if(token==TOK_COLON) {
2363         *status=U_ZERO_ERROR;
2364         bundleType=parseResourceType(&state, status);
2365 
2366         if(isTable(bundleType))
2367         {
2368             expect(&state, TOK_OPEN_BRACE, NULL, NULL, &line, status);
2369         }
2370         else
2371         {
2372             *status=U_PARSE_ERROR;
2373              error(line, "parse error. Stopped parsing with %s", u_errorName(*status));
2374         }
2375     }
2376     else
2377     {
2378         /* not a colon */
2379         if(token==TOK_OPEN_BRACE)
2380         {
2381             *status=U_ZERO_ERROR;
2382             bundleType=RESTYPE_TABLE;
2383         }
2384         else
2385         {
2386             /* neither colon nor open brace */
2387             *status=U_PARSE_ERROR;
2388             bundleType=RESTYPE_UNKNOWN;
2389             error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status));
2390         }
2391     }
2392 
2393     if (U_FAILURE(*status))
2394     {
2395         delete state.bundle;
2396         return NULL;
2397     }
2398 
2399     if(bundleType==RESTYPE_TABLE_NO_FALLBACK) {
2400         /*
2401          * Parse a top-level table with the table(nofallback) declaration.
2402          * This is the same as a regular table, but also sets the
2403          * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] .
2404          */
2405         state.bundle->fNoFallback=true;
2406     }
2407     /* top-level tables need not handle special table names like "collations" */
2408     assert(!state.bundle->fIsPoolBundle);
2409     assert(state.bundle->fRoot->fType == URES_TABLE);
2410     TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot);
2411     realParseTable(&state, rootTable, NULL, line, status);
2412     if(dependencyArray!=NULL){
2413         rootTable->add(dependencyArray, 0, *status);
2414         dependencyArray = NULL;
2415     }
2416    if (U_FAILURE(*status))
2417     {
2418         delete state.bundle;
2419         res_close(dependencyArray);
2420         return NULL;
2421     }
2422 
2423     if (getToken(&state, NULL, NULL, &line, status) != TOK_EOF)
2424     {
2425         warning(line, "extraneous text after resource bundle (perhaps unmatched braces)");
2426         if(isStrict()){
2427             *status = U_INVALID_FORMAT_ERROR;
2428             return NULL;
2429         }
2430     }
2431 
2432     cleanupLookahead(&state);
2433     ustr_deinit(&comment);
2434     return state.bundle;
2435 }
2436