1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1998-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File parse.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 05/26/99 stephen Creation.
17 * 02/25/00 weiv Overhaul to write udata
18 * 5/10/01 Ram removed ustdio dependency
19 * 06/10/2001 Dominic Ludlam <dom@recoil.org> Rewritten
20 *******************************************************************************
21 */
22
23 // Safer use of UnicodeString.
24 #include <cstdint>
25 #include "unicode/umachine.h"
26 #ifndef UNISTR_FROM_CHAR_EXPLICIT
27 # define UNISTR_FROM_CHAR_EXPLICIT explicit
28 #endif
29
30 // Less important, but still a good idea.
31 #ifndef UNISTR_FROM_STRING_EXPLICIT
32 # define UNISTR_FROM_STRING_EXPLICIT explicit
33 #endif
34
35 #include <assert.h>
36 #include "parse.h"
37 #include "errmsg.h"
38 #include "uhash.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uinvchar.h"
42 #include "read.h"
43 #include "ustr.h"
44 #include "reslist.h"
45 #include "rbt_pars.h"
46 #include "genrb.h"
47 #include "unicode/normalizer2.h"
48 #include "unicode/stringpiece.h"
49 #include "unicode/unistr.h"
50 #include "unicode/ustring.h"
51 #include "unicode/uscript.h"
52 #include "unicode/utf16.h"
53 #include "unicode/putil.h"
54 #include "charstr.h"
55 #include "collationbuilder.h"
56 #include "collationdata.h"
57 #include "collationdatareader.h"
58 #include "collationdatawriter.h"
59 #include "collationfastlatinbuilder.h"
60 #include "collationinfo.h"
61 #include "collationroot.h"
62 #include "collationruleparser.h"
63 #include "collationtailoring.h"
64 #include <stdio.h>
65 #include "writesrc.h"
66
67 /* Number of tokens to read ahead of the current stream position */
68 #define MAX_LOOKAHEAD 3
69
70 #define CR 0x000D
71 #define LF 0x000A
72 #define SPACE 0x0020
73 #define TAB 0x0009
74 #define ESCAPE 0x005C
75 #define HASH 0x0023
76 #define QUOTE 0x0027
77 #define ZERO 0x0030
78 #define STARTCOMMAND 0x005B
79 #define ENDCOMMAND 0x005D
80 #define OPENSQBRACKET 0x005B
81 #define CLOSESQBRACKET 0x005D
82
83 #define ICU4X_DIACRITIC_BASE 0x0300
84 #define ICU4X_DIACRITIC_LIMIT 0x034F
85
86 using icu::CharString;
87 using icu::LocalMemory;
88 using icu::LocalPointer;
89 using icu::LocalUCHARBUFPointer;
90 using icu::StringPiece;
91 using icu::UnicodeString;
92
93 struct Lookahead
94 {
95 enum ETokenType type;
96 struct UString value;
97 struct UString comment;
98 uint32_t line;
99 };
100
101 /* keep in sync with token defines in read.h */
102 const char *tokenNames[TOK_TOKEN_COUNT] =
103 {
104 "string", /* A string token, such as "MonthNames" */
105 "'{'", /* An opening brace character */
106 "'}'", /* A closing brace character */
107 "','", /* A comma */
108 "':'", /* A colon */
109
110 "<end of file>", /* End of the file has been reached successfully */
111 "<end of line>"
112 };
113
114 /* Just to store "TRUE" */
115 //static const UChar trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000};
116
117 typedef struct {
118 struct Lookahead lookahead[MAX_LOOKAHEAD + 1];
119 uint32_t lookaheadPosition;
120 UCHARBUF *buffer;
121 struct SRBRoot *bundle;
122 const char *inputdir;
123 uint32_t inputdirLength;
124 const char *outputdir;
125 uint32_t outputdirLength;
126 const char *filename;
127 UBool makeBinaryCollation;
128 UBool omitCollationRules;
129 UBool icu4xMode;
130 } ParseState;
131
132 typedef struct SResource *
133 ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status);
134
135 static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status);
136
137 /* The nature of the lookahead buffer:
138 There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer. This provides
139 MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value.
140 When getToken is called, the current pointer is moved to the next slot and the
141 old slot is filled with the next token from the reader by calling getNextToken.
142 The token values are stored in the slot, which means that token values don't
143 survive a call to getToken, ie.
144
145 UString *value;
146
147 getToken(&value, NULL, status);
148 getToken(NULL, NULL, status); bad - value is now a different string
149 */
150 static void
initLookahead(ParseState * state,UCHARBUF * buf,UErrorCode * status)151 initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status)
152 {
153 static uint32_t initTypeStrings = 0;
154 uint32_t i;
155
156 if (!initTypeStrings)
157 {
158 initTypeStrings = 1;
159 }
160
161 state->lookaheadPosition = 0;
162 state->buffer = buf;
163
164 resetLineNumber();
165
166 for (i = 0; i < MAX_LOOKAHEAD; i++)
167 {
168 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
169 if (U_FAILURE(*status))
170 {
171 return;
172 }
173 }
174
175 *status = U_ZERO_ERROR;
176 }
177
178 static void
cleanupLookahead(ParseState * state)179 cleanupLookahead(ParseState* state)
180 {
181 uint32_t i;
182 for (i = 0; i <= MAX_LOOKAHEAD; i++)
183 {
184 ustr_deinit(&state->lookahead[i].value);
185 ustr_deinit(&state->lookahead[i].comment);
186 }
187
188 }
189
190 static enum ETokenType
getToken(ParseState * state,struct UString ** tokenValue,struct UString * comment,uint32_t * linenumber,UErrorCode * status)191 getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status)
192 {
193 enum ETokenType result;
194 uint32_t i;
195
196 result = state->lookahead[state->lookaheadPosition].type;
197
198 if (tokenValue != NULL)
199 {
200 *tokenValue = &state->lookahead[state->lookaheadPosition].value;
201 }
202
203 if (linenumber != NULL)
204 {
205 *linenumber = state->lookahead[state->lookaheadPosition].line;
206 }
207
208 if (comment != NULL)
209 {
210 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
211 }
212
213 i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1);
214 state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1);
215 ustr_setlen(&state->lookahead[i].comment, 0, status);
216 ustr_setlen(&state->lookahead[i].value, 0, status);
217 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
218
219 /* printf("getToken, returning %s\n", tokenNames[result]); */
220
221 return result;
222 }
223
224 static enum ETokenType
peekToken(ParseState * state,uint32_t lookaheadCount,struct UString ** tokenValue,uint32_t * linenumber,struct UString * comment,UErrorCode * status)225 peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status)
226 {
227 uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1);
228
229 if (U_FAILURE(*status))
230 {
231 return TOK_ERROR;
232 }
233
234 if (lookaheadCount >= MAX_LOOKAHEAD)
235 {
236 *status = U_INTERNAL_PROGRAM_ERROR;
237 return TOK_ERROR;
238 }
239
240 if (tokenValue != NULL)
241 {
242 *tokenValue = &state->lookahead[i].value;
243 }
244
245 if (linenumber != NULL)
246 {
247 *linenumber = state->lookahead[i].line;
248 }
249
250 if(comment != NULL){
251 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
252 }
253
254 return state->lookahead[i].type;
255 }
256
257 static void
expect(ParseState * state,enum ETokenType expectedToken,struct UString ** tokenValue,struct UString * comment,uint32_t * linenumber,UErrorCode * status)258 expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status)
259 {
260 uint32_t line;
261
262 enum ETokenType token = getToken(state, tokenValue, comment, &line, status);
263
264 if (linenumber != NULL)
265 {
266 *linenumber = line;
267 }
268
269 if (U_FAILURE(*status))
270 {
271 return;
272 }
273
274 if (token != expectedToken)
275 {
276 *status = U_INVALID_FORMAT_ERROR;
277 error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]);
278 }
279 else
280 {
281 *status = U_ZERO_ERROR;
282 }
283 }
284
getInvariantString(ParseState * state,uint32_t * line,struct UString * comment,int32_t & stringLength,UErrorCode * status)285 static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment,
286 int32_t &stringLength, UErrorCode *status)
287 {
288 struct UString *tokenValue;
289 char *result;
290
291 expect(state, TOK_STRING, &tokenValue, comment, line, status);
292
293 if (U_FAILURE(*status))
294 {
295 return NULL;
296 }
297
298 if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) {
299 *status = U_INVALID_FORMAT_ERROR;
300 error(*line, "invariant characters required for table keys, binary data, etc.");
301 return NULL;
302 }
303
304 result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1));
305
306 if (result == NULL)
307 {
308 *status = U_MEMORY_ALLOCATION_ERROR;
309 return NULL;
310 }
311
312 u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1);
313 stringLength = tokenValue->fLength;
314 return result;
315 }
316
317 static struct SResource *
parseUCARules(ParseState * state,char * tag,uint32_t startline,const struct UString *,UErrorCode * status)318 parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
319 {
320 struct SResource *result = NULL;
321 struct UString *tokenValue;
322 FileStream *file = NULL;
323 char filename[256] = { '\0' };
324 char cs[128] = { '\0' };
325 uint32_t line;
326 UBool quoted = false;
327 UCHARBUF *ucbuf=NULL;
328 UChar32 c = 0;
329 const char* cp = NULL;
330 UChar *pTarget = NULL;
331 UChar *target = NULL;
332 UChar *targetLimit = NULL;
333 int32_t size = 0;
334
335 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
336
337 if(isVerbose()){
338 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
339 }
340
341 if (U_FAILURE(*status))
342 {
343 return NULL;
344 }
345 /* make the filename including the directory */
346 if (state->inputdir != NULL)
347 {
348 uprv_strcat(filename, state->inputdir);
349
350 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
351 {
352 uprv_strcat(filename, U_FILE_SEP_STRING);
353 }
354 }
355
356 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
357
358 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
359
360 if (U_FAILURE(*status))
361 {
362 return NULL;
363 }
364 uprv_strcat(filename, cs);
365
366 if(state->omitCollationRules) {
367 return res_none();
368 }
369
370 ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status);
371
372 if (U_FAILURE(*status)) {
373 error(line, "An error occurred while opening the input file %s\n", filename);
374 return NULL;
375 }
376
377 /* We allocate more space than actually required
378 * since the actual size needed for storing UChars
379 * is not known in UTF-8 byte stream
380 */
381 size = ucbuf_size(ucbuf) + 1;
382 pTarget = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * size);
383 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
384 target = pTarget;
385 targetLimit = pTarget+size;
386
387 /* read the rules into the buffer */
388 while (target < targetLimit)
389 {
390 c = ucbuf_getc(ucbuf, status);
391 if(c == QUOTE) {
392 quoted = (UBool)!quoted;
393 }
394 /* weiv (06/26/2002): adding the following:
395 * - preserving spaces in commands [...]
396 * - # comments until the end of line
397 */
398 if (c == STARTCOMMAND && !quoted)
399 {
400 /* preserve commands
401 * closing bracket will be handled by the
402 * append at the end of the loop
403 */
404 while(c != ENDCOMMAND) {
405 U_APPEND_CHAR32_ONLY(c, target);
406 c = ucbuf_getc(ucbuf, status);
407 }
408 }
409 else if (c == HASH && !quoted) {
410 /* skip comments */
411 while(c != CR && c != LF) {
412 c = ucbuf_getc(ucbuf, status);
413 }
414 continue;
415 }
416 else if (c == ESCAPE)
417 {
418 c = unescape(ucbuf, status);
419
420 if (c == (UChar32)U_ERR)
421 {
422 uprv_free(pTarget);
423 T_FileStream_close(file);
424 return NULL;
425 }
426 }
427 else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF))
428 {
429 /* ignore spaces carriage returns
430 * and line feed unless in the form \uXXXX
431 */
432 continue;
433 }
434
435 /* Append UChar * after dissembling if c > 0xffff*/
436 if (c != (UChar32)U_EOF)
437 {
438 U_APPEND_CHAR32_ONLY(c, target);
439 }
440 else
441 {
442 break;
443 }
444 }
445
446 /* terminate the string */
447 if(target < targetLimit){
448 *target = 0x0000;
449 }
450
451 result = string_open(state->bundle, tag, pTarget, (int32_t)(target - pTarget), NULL, status);
452
453
454 ucbuf_close(ucbuf);
455 uprv_free(pTarget);
456 T_FileStream_close(file);
457
458 return result;
459 }
460
461 static struct SResource *
parseTransliterator(ParseState * state,char * tag,uint32_t startline,const struct UString *,UErrorCode * status)462 parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
463 {
464 struct SResource *result = NULL;
465 struct UString *tokenValue;
466 FileStream *file = NULL;
467 char filename[256] = { '\0' };
468 char cs[128] = { '\0' };
469 uint32_t line;
470 UCHARBUF *ucbuf=NULL;
471 const char* cp = NULL;
472 UChar *pTarget = NULL;
473 const UChar *pSource = NULL;
474 int32_t size = 0;
475
476 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
477
478 if(isVerbose()){
479 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
480 }
481
482 if (U_FAILURE(*status))
483 {
484 return NULL;
485 }
486 /* make the filename including the directory */
487 if (state->inputdir != NULL)
488 {
489 uprv_strcat(filename, state->inputdir);
490
491 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
492 {
493 uprv_strcat(filename, U_FILE_SEP_STRING);
494 }
495 }
496
497 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
498
499 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
500
501 if (U_FAILURE(*status))
502 {
503 return NULL;
504 }
505 uprv_strcat(filename, cs);
506
507
508 ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status);
509
510 if (U_FAILURE(*status)) {
511 error(line, "An error occurred while opening the input file %s\n", filename);
512 return NULL;
513 }
514
515 /* We allocate more space than actually required
516 * since the actual size needed for storing UChars
517 * is not known in UTF-8 byte stream
518 */
519 pSource = ucbuf_getBuffer(ucbuf, &size, status);
520 pTarget = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (size + 1));
521 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
522
523 #if !UCONFIG_NO_TRANSLITERATION
524 size = utrans_stripRules(pSource, size, pTarget, status);
525 #else
526 size = 0;
527 fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n");
528 #endif
529 result = string_open(state->bundle, tag, pTarget, size, NULL, status);
530
531 ucbuf_close(ucbuf);
532 uprv_free(pTarget);
533 T_FileStream_close(file);
534
535 return result;
536 }
537 static ArrayResource* dependencyArray = NULL;
538
539 static struct SResource *
parseDependency(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)540 parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
541 {
542 struct SResource *result = NULL;
543 struct SResource *elem = NULL;
544 struct UString *tokenValue;
545 uint32_t line;
546 char filename[256] = { '\0' };
547 char cs[128] = { '\0' };
548
549 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
550
551 if(isVerbose()){
552 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
553 }
554
555 if (U_FAILURE(*status))
556 {
557 return NULL;
558 }
559 /* make the filename including the directory */
560 if (state->outputdir != NULL)
561 {
562 uprv_strcat(filename, state->outputdir);
563
564 if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR)
565 {
566 uprv_strcat(filename, U_FILE_SEP_STRING);
567 }
568 }
569
570 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
571
572 if (U_FAILURE(*status))
573 {
574 return NULL;
575 }
576 uprv_strcat(filename, cs);
577 if(!T_FileStream_file_exists(filename)){
578 if(isStrict()){
579 error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
580 }else{
581 warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
582 }
583 }
584 if(dependencyArray==NULL){
585 dependencyArray = array_open(state->bundle, "%%DEPENDENCY", NULL, status);
586 }
587 if(tag!=NULL){
588 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
589 }
590 elem = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, comment, status);
591
592 dependencyArray->add(elem);
593
594 if (U_FAILURE(*status))
595 {
596 return NULL;
597 }
598 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
599 return result;
600 }
601 static struct SResource *
parseString(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)602 parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
603 {
604 struct UString *tokenValue;
605 struct SResource *result = NULL;
606
607 /* if (tag != NULL && uprv_strcmp(tag, "%%UCARULES") == 0)
608 {
609 return parseUCARules(tag, startline, status);
610 }*/
611 if(isVerbose()){
612 printf(" string %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
613 }
614 expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
615
616 if (U_SUCCESS(*status))
617 {
618 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
619 doesn't survive expect either) */
620
621 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
622 if(U_SUCCESS(*status) && result) {
623 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
624
625 if (U_FAILURE(*status))
626 {
627 res_close(result);
628 return NULL;
629 }
630 }
631 }
632
633 return result;
634 }
635
636 static struct SResource *
parseAlias(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)637 parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
638 {
639 struct UString *tokenValue;
640 struct SResource *result = NULL;
641
642 expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
643
644 if(isVerbose()){
645 printf(" alias %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
646 }
647
648 if (U_SUCCESS(*status))
649 {
650 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
651 doesn't survive expect either) */
652
653 result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
654
655 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
656
657 if (U_FAILURE(*status))
658 {
659 res_close(result);
660 return NULL;
661 }
662 }
663
664 return result;
665 }
666
667 #if !UCONFIG_NO_COLLATION
668
669 namespace {
670
resLookup(struct SResource * res,const char * key)671 static struct SResource* resLookup(struct SResource* res, const char* key){
672 if (res == res_none() || !res->isTable()) {
673 return NULL;
674 }
675
676 TableResource *list = static_cast<TableResource *>(res);
677 SResource *current = list->fFirst;
678 while (current != NULL) {
679 if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) {
680 return current;
681 }
682 current = current->fNext;
683 }
684 return NULL;
685 }
686
687 class GenrbImporter : public icu::CollationRuleParser::Importer {
688 public:
GenrbImporter(const char * in,const char * out)689 GenrbImporter(const char *in, const char *out) : inputDir(in), outputDir(out) {}
690 virtual ~GenrbImporter();
691 virtual void getRules(
692 const char *localeID, const char *collationType,
693 UnicodeString &rules,
694 const char *&errorReason, UErrorCode &errorCode) override;
695
696 private:
697 const char *inputDir;
698 const char *outputDir;
699 };
700
~GenrbImporter()701 GenrbImporter::~GenrbImporter() {}
702
703 void
getRules(const char * localeID,const char * collationType,UnicodeString & rules,const char * &,UErrorCode & errorCode)704 GenrbImporter::getRules(
705 const char *localeID, const char *collationType,
706 UnicodeString &rules,
707 const char *& /*errorReason*/, UErrorCode &errorCode) {
708 CharString filename(localeID, errorCode);
709 for(int32_t i = 0; i < filename.length(); i++){
710 if(filename[i] == '-'){
711 filename.data()[i] = '_';
712 }
713 }
714 filename.append(".txt", errorCode);
715 if (U_FAILURE(errorCode)) {
716 return;
717 }
718 CharString inputDirBuf;
719 CharString openFileName;
720 if(inputDir == NULL) {
721 const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR);
722 if (filenameBegin != NULL) {
723 /*
724 * When a filename ../../../data/root.txt is specified,
725 * we presume that the input directory is ../../../data
726 * This is very important when the resource file includes
727 * another file, like UCARules.txt or thaidict.brk.
728 */
729 StringPiece dir = filename.toStringPiece();
730 const char *filenameLimit = filename.data() + filename.length();
731 dir.remove_suffix((int32_t)(filenameLimit - filenameBegin));
732 inputDirBuf.append(dir, errorCode);
733 inputDir = inputDirBuf.data();
734 }
735 }else{
736 int32_t dirlen = (int32_t)uprv_strlen(inputDir);
737
738 if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) {
739 /*
740 * append the input dir to openFileName if the first char in
741 * filename is not file separator char and the last char input directory is not '.'.
742 * This is to support :
743 * genrb -s. /home/icu/data
744 * genrb -s. icu/data
745 * The user cannot mix notations like
746 * genrb -s. /icu/data --- the absolute path specified. -s redundant
747 * user should use
748 * genrb -s. icu/data --- start from CWD and look in icu/data dir
749 */
750 openFileName.append(inputDir, dirlen, errorCode);
751 if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
752 openFileName.append(U_FILE_SEP_CHAR, errorCode);
753 }
754 }
755 }
756 openFileName.append(filename, errorCode);
757 if(U_FAILURE(errorCode)) {
758 return;
759 }
760 // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data());
761 const char* cp = "";
762 LocalUCHARBUFPointer ucbuf(
763 ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode));
764 if(errorCode == U_FILE_ACCESS_ERROR) {
765 fprintf(stderr, "couldn't open file %s\n", openFileName.data());
766 return;
767 }
768 if (ucbuf.isNull() || U_FAILURE(errorCode)) {
769 fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode));
770 return;
771 }
772
773 /* Parse the data into an SRBRoot */
774 LocalPointer<SRBRoot> data(
775 parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode));
776 if (U_FAILURE(errorCode)) {
777 return;
778 }
779
780 struct SResource *root = data->fRoot;
781 struct SResource *collations = resLookup(root, "collations");
782 if (collations != NULL) {
783 struct SResource *collation = resLookup(collations, collationType);
784 if (collation != NULL) {
785 struct SResource *sequence = resLookup(collation, "Sequence");
786 if (sequence != NULL && sequence->isString()) {
787 // No string pointer aliasing so that we need not hold onto the resource bundle.
788 StringResource *sr = static_cast<StringResource *>(sequence);
789 rules = sr->fString;
790 }
791 }
792 }
793 }
794
795 // Quick-and-dirty escaping function.
796 // Assumes that we are on an ASCII-based platform.
797 static void
escape(const UChar * s,char * buffer)798 escape(const UChar *s, char *buffer) {
799 int32_t length = u_strlen(s);
800 int32_t i = 0;
801 for (;;) {
802 UChar32 c;
803 U16_NEXT(s, i, length, c);
804 if (c == 0) {
805 *buffer = 0;
806 return;
807 } else if (0x20 <= c && c <= 0x7e) {
808 // printable ASCII
809 *buffer++ = (char)c; // assumes ASCII-based platform
810 } else {
811 buffer += sprintf(buffer, "\\u%04X", (int)c);
812 }
813 }
814 }
815
816 } // namespace
817
818 static FILE*
openTOML(const char * outputdir,const char * name,const char * collationType,const char * structType,UErrorCode * status)819 openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) {
820 CharString baseName;
821 baseName.append(name, *status);
822 baseName.append("_", *status);
823 baseName.append(collationType, *status);
824 baseName.append("_", *status);
825 baseName.append(structType, *status);
826
827 CharString outFileName;
828 if (outputdir && *outputdir) {
829 outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status);
830 }
831 outFileName.append(baseName, *status);
832 outFileName.append(".toml", *status);
833 if (U_FAILURE(*status)) {
834 return NULL;
835 }
836
837 FILE* f = fopen(outFileName.data(), "w");
838 if (!f) {
839 *status = U_FILE_ACCESS_ERROR;
840 return NULL;
841 }
842 usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X");
843
844 return f;
845 }
846
847 static void
writeCollationMetadataTOML(const char * outputdir,const char * name,const char * collationType,const uint32_t metadataBits,UErrorCode * status)848 writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) {
849 FILE* f = openTOML(outputdir, name, collationType, "meta", status);
850 if (!f) {
851 return;
852 }
853 // printf("writeCollationMetadataTOML %s %s\n", name, collationType);
854 fprintf(f, "bits = 0x%X\n", metadataBits);
855 fclose(f);
856 }
857
858 static UChar32
writeCollationDiacriticsTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UErrorCode * status)859 writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
860 UChar32 limit = ICU4X_DIACRITIC_LIMIT;
861 FILE* f = openTOML(outputdir, name, collationType, "dia", status);
862 if (!f) {
863 return limit;
864 }
865 // printf("writeCollationDiacriticsTOML %s %s\n", name, collationType);
866 uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE];
867 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
868 uint16_t secondary = 0;
869 uint32_t ce32 = data->getCE32(c);
870 if (ce32 == icu::Collation::FALLBACK_CE32) {
871 ce32 = data->base->getCE32(c);
872 }
873 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
874 // These never occur in NFD data
875 } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) {
876 if (uprv_strcmp(name, "root") == 0) {
877 printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c);
878 fclose(f);
879 *status = U_INTERNAL_PROGRAM_ERROR;
880 return limit;
881 }
882 limit = c;
883 break;
884 } else {
885 uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32));
886 if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) {
887 // Not a CE where only the secondary weight differs from the expected
888 // pattern.
889 limit = c;
890 break;
891 }
892 secondary = uint16_t(ce >> 16);
893 }
894 secondaries[c - ICU4X_DIACRITIC_BASE] = secondary;
895
896 }
897 usrc_writeArray(f, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n");
898 fclose(f);
899 return limit;
900 }
901
902 static void
writeCollationReorderingTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationSettings * settings,UErrorCode * status)903 writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) {
904 FILE* f = openTOML(outputdir, name, collationType, "reord", status);
905 if (!f) {
906 return;
907 }
908 // printf("writeCollationReorderingTOML %s %s\n", name, collationType);
909 fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder);
910 usrc_writeArray(f, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n");
911 usrc_writeArray(f, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n");
912 fclose(f);
913 }
914
915
916 static void
writeCollationJamoTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UErrorCode * status)917 writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
918 FILE* f = openTOML(outputdir, name, collationType, "jamo", status);
919 if (!f) {
920 printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType);
921 return;
922 }
923 uint32_t jamo[0x1200-0x1100];
924 for (UChar32 c = 0x1100; c < 0x1200; ++c) {
925 uint32_t ce32 = data->getCE32(c);
926 if (ce32 == icu::Collation::FALLBACK_CE32) {
927 ce32 = data->base->getCE32(c);
928 }
929 // Can't reject complex CE32s, because search collations have expansions.
930 // These expansions refer to the tailoring, which foils the reuse of the
931 // these jamo tables.
932 // XXX Figure out what to do. Perhaps instead of having Latin mini expansions,
933 // there should be Hangul mini expansions.
934 // XXX in any case, validate that modern jamo are self-contained.
935 jamo[c - 0x1100] = ce32;
936
937 }
938 usrc_writeArray(f, "ce32s = [\n ", jamo, 32, 0x1200-0x1100, " ", "\n]\n");
939 fclose(f);
940 }
941
942 static UBool
convertTrie(const void * context,UChar32 start,UChar32 end,uint32_t value)943 convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) {
944 if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) {
945 // Range entirely in conjoining jamo block.
946 return true;
947 }
948 icu::IcuToolErrorCode status("genrb: convertTrie");
949 umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status);
950 return !U_FAILURE(*status);
951 }
952
953 static void
writeCollationDataTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UBool root,UChar32 diacriticLimit,UErrorCode * status)954 writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) {
955 FILE* f = openTOML(outputdir, name, collationType, "data", status);
956 if (!f) {
957 return;
958 }
959 // printf("writeCollationDataTOML %s %s\n", name, collationType);
960
961 icu::UnicodeSet tailoringSet;
962
963 if (data->base) {
964 tailoringSet.addAll(*(data->unsafeBackwardSet));
965 tailoringSet.removeAll(*(data->base->unsafeBackwardSet));
966 } else {
967 tailoringSet.addAll(*(data->unsafeBackwardSet));
968 }
969
970 // Use the same value for out-of-range and default in the hope of not having to allocate
971 // different blocks, since ICU4X never does out-of-range queries.
972 uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
973 icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
974
975 utrie2_enum(data->trie, NULL, &convertTrie, builder.getAlias());
976
977 // If the diacritic table was cut short, copy CE32s between the lowered
978 // limit and the max limit from the root to the tailoring. As of June 2022,
979 // no collation in CLDR needs this.
980 for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) {
981 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
982 // These never occur in NFD data.
983 continue;
984 }
985 uint32_t ce32 = data->getCE32(c);
986 if (ce32 == icu::Collation::FALLBACK_CE32) {
987 ce32 = data->base->getCE32(c);
988 umutablecptrie_set(builder.getAlias(), c, ce32, status);
989 }
990 }
991
992 // Ensure that the range covered by the diacritic table isn't duplicated
993 // in the trie.
994 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) {
995 if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
996 umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
997 }
998 }
999
1000 icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1001 builder.getAlias(),
1002 UCPTRIE_TYPE_SMALL,
1003 UCPTRIE_VALUE_BITS_32,
1004 status));
1005 usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n");
1006 usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n");
1007 usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n");
1008 fprintf(f, "[trie]\n");
1009 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1010
1011 fclose(f);
1012 }
1013
1014 static void
writeCollationSpecialPrimariesTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,UErrorCode * status)1015 writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
1016 FILE* f = openTOML(outputdir, name, collationType, "prim", status);
1017 if (!f) {
1018 return;
1019 }
1020 // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType);
1021
1022 uint16_t lastPrimaries[4];
1023 for (int32_t i = 0; i < 4; ++i) {
1024 // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one
1025 // back to get a value that fits in 16 bits.
1026 lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
1027 }
1028
1029 uint32_t numericPrimary = data->numericPrimary;
1030 if (numericPrimary & 0xFFFFFF) {
1031 printf("Lower 24 bits set in numeric primary");
1032 *status = U_INTERNAL_PROGRAM_ERROR;
1033 return;
1034 }
1035
1036 usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n");
1037 fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24);
1038 fclose(f);
1039 }
1040
1041 static void
writeCollationTOML(const char * outputdir,const char * name,const char * collationType,const icu::CollationData * data,const icu::CollationSettings * settings,UErrorCode * status)1042 writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
1043 UBool tailored = false;
1044 UBool tailoredDiacritics = false;
1045 UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0);
1046 UBool reordering = false;
1047 UBool isRoot = uprv_strcmp(name, "root") == 0;
1048 UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT;
1049 if (!data->base && isRoot) {
1050 diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
1051 if (U_FAILURE(*status)) {
1052 return;
1053 }
1054 writeCollationJamoTOML(outputdir, name, collationType, data, status);
1055 if (U_FAILURE(*status)) {
1056 return;
1057 }
1058 writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status);
1059 if (U_FAILURE(*status)) {
1060 return;
1061 }
1062 } else if (data->base && !lithuanianDotAbove) {
1063 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
1064 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
1065 // These never occur in NFD data.
1066 continue;
1067 }
1068 uint32_t ce32 = data->getCE32(c);
1069 if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) {
1070 tailoredDiacritics = true;
1071 diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
1072 if (U_FAILURE(*status)) {
1073 return;
1074 }
1075 break;
1076 }
1077 }
1078 }
1079
1080 if (settings->hasReordering()) {
1081 reordering = true;
1082 // Note: There are duplicate reorderings. Expecting the ICU4X provider
1083 // to take care of deduplication.
1084 writeCollationReorderingTOML(outputdir, name, collationType, settings, status);
1085 if (U_FAILURE(*status)) {
1086 return;
1087 }
1088 }
1089
1090 // Write collation data if either base is non-null or the name is root.
1091 // Languages that only reorder scripts are otherwise root-like and have
1092 // null base.
1093 if (data->base || isRoot) {
1094 tailored = !isRoot;
1095 writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status);
1096 if (U_FAILURE(*status)) {
1097 return;
1098 }
1099 }
1100
1101 uint32_t maxVariable = (uint32_t)settings->getMaxVariable();
1102 if (maxVariable >= 4) {
1103 printf("Max variable out of range");
1104 *status = U_INTERNAL_PROGRAM_ERROR;
1105 return;
1106 }
1107
1108 uint32_t metadataBits = maxVariable;
1109 if (tailored) {
1110 metadataBits |= (1 << 3);
1111 }
1112 if (tailoredDiacritics) {
1113 metadataBits |= (1 << 4);
1114 }
1115 if (reordering) {
1116 metadataBits |= (1 << 5);
1117 }
1118 if (lithuanianDotAbove) {
1119 metadataBits |= (1 << 6);
1120 }
1121 if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) {
1122 metadataBits |= (1 << 7);
1123 }
1124 if (settings->getAlternateHandling() == UCOL_SHIFTED) {
1125 metadataBits |= (1 << 8);
1126 }
1127 switch (settings->getCaseFirst()) {
1128 case UCOL_OFF:
1129 break;
1130 case UCOL_UPPER_FIRST:
1131 metadataBits |= (1 << 9);
1132 metadataBits |= (1 << 10);
1133 break;
1134 case UCOL_LOWER_FIRST:
1135 metadataBits |= (1 << 9);
1136 break;
1137 default:
1138 *status = U_INTERNAL_PROGRAM_ERROR;
1139 return;
1140 }
1141
1142 writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status);
1143 }
1144
1145 #endif // !UCONFIG_NO_COLLATION
1146
1147 static TableResource *
addCollation(ParseState * state,TableResource * result,const char * collationType,uint32_t startline,UErrorCode * status)1148 addCollation(ParseState* state, TableResource *result, const char *collationType,
1149 uint32_t startline, UErrorCode *status)
1150 {
1151 // TODO: Use LocalPointer for result, or make caller close it when there is a failure.
1152 struct SResource *member = NULL;
1153 struct UString *tokenValue;
1154 struct UString comment;
1155 enum ETokenType token;
1156 char subtag[1024];
1157 UnicodeString rules;
1158 UBool haveRules = false;
1159 UVersionInfo version;
1160 uint32_t line;
1161
1162 /* '{' . (name resource)* '}' */
1163 version[0]=0; version[1]=0; version[2]=0; version[3]=0;
1164
1165 for (;;)
1166 {
1167 ustr_init(&comment);
1168 token = getToken(state, &tokenValue, &comment, &line, status);
1169
1170 if (token == TOK_CLOSE_BRACE)
1171 {
1172 break;
1173 }
1174
1175 if (token != TOK_STRING)
1176 {
1177 res_close(result);
1178 *status = U_INVALID_FORMAT_ERROR;
1179
1180 if (token == TOK_EOF)
1181 {
1182 error(startline, "unterminated table");
1183 }
1184 else
1185 {
1186 error(line, "Unexpected token %s", tokenNames[token]);
1187 }
1188
1189 return NULL;
1190 }
1191
1192 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1193
1194 if (U_FAILURE(*status))
1195 {
1196 res_close(result);
1197 return NULL;
1198 }
1199
1200 member = parseResource(state, subtag, NULL, status);
1201
1202 if (U_FAILURE(*status))
1203 {
1204 res_close(result);
1205 return NULL;
1206 }
1207 if (result == NULL)
1208 {
1209 // Ignore the parsed resources, continue parsing.
1210 }
1211 else if (uprv_strcmp(subtag, "Version") == 0 && member->isString())
1212 {
1213 StringResource *sr = static_cast<StringResource *>(member);
1214 char ver[40];
1215 int32_t length = sr->length();
1216
1217 if (length >= UPRV_LENGTHOF(ver))
1218 {
1219 length = UPRV_LENGTHOF(ver) - 1;
1220 }
1221
1222 sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV);
1223 u_versionFromString(version, ver);
1224
1225 result->add(member, line, *status);
1226 member = NULL;
1227 }
1228 else if(uprv_strcmp(subtag, "%%CollationBin")==0)
1229 {
1230 /* discard duplicate %%CollationBin if any*/
1231 }
1232 else if (uprv_strcmp(subtag, "Sequence") == 0 && member->isString())
1233 {
1234 StringResource *sr = static_cast<StringResource *>(member);
1235 rules = sr->fString;
1236 haveRules = true;
1237 // Defer building the collator until we have seen
1238 // all sub-elements of the collation table, including the Version.
1239 /* in order to achieve smaller data files, we can direct genrb */
1240 /* to omit collation rules */
1241 if(!state->omitCollationRules) {
1242 result->add(member, line, *status);
1243 member = NULL;
1244 }
1245 }
1246 else // Just copy non-special items.
1247 {
1248 result->add(member, line, *status);
1249 member = NULL;
1250 }
1251 res_close(member); // TODO: use LocalPointer
1252 if (U_FAILURE(*status))
1253 {
1254 res_close(result);
1255 return NULL;
1256 }
1257 }
1258
1259 if (!haveRules) { return result; }
1260
1261 #if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO
1262 warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h");
1263 (void)collationType;
1264 #else
1265 // CLDR ticket #3949, ICU ticket #8082:
1266 // Do not build collation binary data for for-import-only "private" collation rule strings.
1267 if (uprv_strncmp(collationType, "private-", 8) == 0) {
1268 if(isVerbose()) {
1269 printf("Not building %s~%s collation binary\n", state->filename, collationType);
1270 }
1271 return result;
1272 }
1273
1274 if(!state->makeBinaryCollation) {
1275 if(isVerbose()) {
1276 printf("Not building %s~%s collation binary\n", state->filename, collationType);
1277 }
1278 return result;
1279 }
1280 UErrorCode intStatus = U_ZERO_ERROR;
1281 UParseError parseError;
1282 uprv_memset(&parseError, 0, sizeof(parseError));
1283 GenrbImporter importer(state->inputdir, state->outputdir);
1284 const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus);
1285 if(U_FAILURE(intStatus)) {
1286 error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus));
1287 res_close(result);
1288 return NULL; // TODO: use LocalUResourceBundlePointer for result
1289 }
1290 icu::CollationBuilder builder(base, state->icu4xMode, intStatus);
1291 if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) {
1292 builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X
1293 }
1294 LocalPointer<icu::CollationTailoring> t(
1295 builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
1296 if(U_FAILURE(intStatus)) {
1297 const char *reason = builder.getErrorReason();
1298 if(reason == NULL) { reason = ""; }
1299 error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s %s",
1300 state->filename, collationType,
1301 (long)parseError.offset, u_errorName(intStatus), reason);
1302 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1303 // Print pre- and post-context.
1304 char preBuffer[100], postBuffer[100];
1305 escape(parseError.preContext, preBuffer);
1306 escape(parseError.postContext, postBuffer);
1307 error(line, " error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer);
1308 }
1309 if(isStrict() || t.isNull()) {
1310 *status = intStatus;
1311 res_close(result);
1312 return NULL;
1313 }
1314 }
1315 if (state->icu4xMode) {
1316 char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1));
1317 if (nameWithoutSuffix == NULL) {
1318 *status = U_MEMORY_ALLOCATION_ERROR;
1319 res_close(result);
1320 return NULL;
1321 }
1322 uprv_strcpy(nameWithoutSuffix, state->filename);
1323 *uprv_strrchr(nameWithoutSuffix, '.') = 0;
1324
1325 writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status);
1326 uprv_free(nameWithoutSuffix);
1327 }
1328 icu::LocalMemory<uint8_t> buffer;
1329 int32_t capacity = 100000;
1330 uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
1331 if(dest == NULL) {
1332 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1333 (long)capacity);
1334 *status = U_MEMORY_ALLOCATION_ERROR;
1335 res_close(result);
1336 return NULL;
1337 }
1338 int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1];
1339 int32_t totalSize = icu::CollationDataWriter::writeTailoring(
1340 *t, *t->settings, indexes, dest, capacity, intStatus);
1341 if(intStatus == U_BUFFER_OVERFLOW_ERROR) {
1342 intStatus = U_ZERO_ERROR;
1343 capacity = totalSize;
1344 dest = buffer.allocateInsteadAndCopy(capacity);
1345 if(dest == NULL) {
1346 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1347 (long)capacity);
1348 *status = U_MEMORY_ALLOCATION_ERROR;
1349 res_close(result);
1350 return NULL;
1351 }
1352 totalSize = icu::CollationDataWriter::writeTailoring(
1353 *t, *t->settings, indexes, dest, capacity, intStatus);
1354 }
1355 if(U_FAILURE(intStatus)) {
1356 fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n",
1357 u_errorName(intStatus));
1358 res_close(result);
1359 return NULL;
1360 }
1361 if(isVerbose()) {
1362 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1363 icu::CollationInfo::printSizes(totalSize, indexes);
1364 if(t->settings->hasReordering()) {
1365 printf("%s~%s collation reordering ranges:\n", state->filename, collationType);
1366 icu::CollationInfo::printReorderRanges(
1367 *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength);
1368 }
1369 #if 0 // debugging output
1370 } else {
1371 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1372 icu::CollationInfo::printSizes(totalSize, indexes);
1373 #endif
1374 }
1375 struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, NULL, NULL, status);
1376 result->add(collationBin, line, *status);
1377 if (U_FAILURE(*status)) {
1378 res_close(result);
1379 return NULL;
1380 }
1381 #endif
1382 return result;
1383 }
1384
1385 static UBool
keepCollationType(const char *)1386 keepCollationType(const char * /*type*/) {
1387 return true;
1388 }
1389
1390 static struct SResource *
parseCollationElements(ParseState * state,char * tag,uint32_t startline,UBool newCollation,UErrorCode * status)1391 parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status)
1392 {
1393 TableResource *result = NULL;
1394 struct SResource *member = NULL;
1395 struct UString *tokenValue;
1396 struct UString comment;
1397 enum ETokenType token;
1398 char subtag[1024], typeKeyword[1024];
1399 uint32_t line;
1400
1401 result = table_open(state->bundle, tag, NULL, status);
1402
1403 if (result == NULL || U_FAILURE(*status))
1404 {
1405 return NULL;
1406 }
1407 if(isVerbose()){
1408 printf(" collation elements %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1409 }
1410 if(!newCollation) {
1411 return addCollation(state, result, "(no type)", startline, status);
1412 }
1413 else {
1414 for(;;) {
1415 ustr_init(&comment);
1416 token = getToken(state, &tokenValue, &comment, &line, status);
1417
1418 if (token == TOK_CLOSE_BRACE)
1419 {
1420 return result;
1421 }
1422
1423 if (token != TOK_STRING)
1424 {
1425 res_close(result);
1426 *status = U_INVALID_FORMAT_ERROR;
1427
1428 if (token == TOK_EOF)
1429 {
1430 error(startline, "unterminated table");
1431 }
1432 else
1433 {
1434 error(line, "Unexpected token %s", tokenNames[token]);
1435 }
1436
1437 return NULL;
1438 }
1439
1440 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1441
1442 if (U_FAILURE(*status))
1443 {
1444 res_close(result);
1445 return NULL;
1446 }
1447
1448 if (uprv_strcmp(subtag, "default") == 0)
1449 {
1450 member = parseResource(state, subtag, NULL, status);
1451
1452 if (U_FAILURE(*status))
1453 {
1454 res_close(result);
1455 return NULL;
1456 }
1457
1458 result->add(member, line, *status);
1459 }
1460 else
1461 {
1462 token = peekToken(state, 0, &tokenValue, &line, &comment, status);
1463 /* this probably needs to be refactored or recursively use the parser */
1464 /* first we assume that our collation table won't have the explicit type */
1465 /* then, we cannot handle aliases */
1466 if(token == TOK_OPEN_BRACE) {
1467 token = getToken(state, &tokenValue, &comment, &line, status);
1468 TableResource *collationRes;
1469 if (keepCollationType(subtag)) {
1470 collationRes = table_open(state->bundle, subtag, NULL, status);
1471 } else {
1472 collationRes = NULL;
1473 }
1474 // need to parse the collation data regardless
1475 collationRes = addCollation(state, collationRes, subtag, startline, status);
1476 if (collationRes != NULL) {
1477 result->add(collationRes, startline, *status);
1478 }
1479 } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */
1480 /* we could have a table too */
1481 token = peekToken(state, 1, &tokenValue, &line, &comment, status);
1482 u_UCharsToChars(tokenValue->fChars, typeKeyword, u_strlen(tokenValue->fChars) + 1);
1483 if(uprv_strcmp(typeKeyword, "alias") == 0) {
1484 member = parseResource(state, subtag, NULL, status);
1485 if (U_FAILURE(*status))
1486 {
1487 res_close(result);
1488 return NULL;
1489 }
1490
1491 result->add(member, line, *status);
1492 } else {
1493 res_close(result);
1494 *status = U_INVALID_FORMAT_ERROR;
1495 return NULL;
1496 }
1497 } else {
1498 res_close(result);
1499 *status = U_INVALID_FORMAT_ERROR;
1500 return NULL;
1501 }
1502 }
1503
1504 /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/
1505
1506 /*expect(TOK_CLOSE_BRACE, NULL, NULL, status);*/
1507
1508 if (U_FAILURE(*status))
1509 {
1510 res_close(result);
1511 return NULL;
1512 }
1513 }
1514 }
1515 }
1516
1517 /* Necessary, because CollationElements requires the bundle->fRoot member to be present which,
1518 if this weren't special-cased, wouldn't be set until the entire file had been processed. */
1519 static struct SResource *
realParseTable(ParseState * state,TableResource * table,char * tag,uint32_t startline,UErrorCode * status)1520 realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status)
1521 {
1522 struct SResource *member = NULL;
1523 struct UString *tokenValue=NULL;
1524 struct UString comment;
1525 enum ETokenType token;
1526 char subtag[1024];
1527 uint32_t line;
1528 UBool readToken = false;
1529
1530 /* '{' . (name resource)* '}' */
1531
1532 if(isVerbose()){
1533 printf(" parsing table %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1534 }
1535 for (;;)
1536 {
1537 ustr_init(&comment);
1538 token = getToken(state, &tokenValue, &comment, &line, status);
1539
1540 if (token == TOK_CLOSE_BRACE)
1541 {
1542 if (!readToken && isVerbose()) {
1543 warning(startline, "Encountered empty table");
1544 }
1545 return table;
1546 }
1547
1548 if (token != TOK_STRING)
1549 {
1550 *status = U_INVALID_FORMAT_ERROR;
1551
1552 if (token == TOK_EOF)
1553 {
1554 error(startline, "unterminated table");
1555 }
1556 else
1557 {
1558 error(line, "unexpected token %s", tokenNames[token]);
1559 }
1560
1561 return NULL;
1562 }
1563
1564 if(uprv_isInvariantUString(tokenValue->fChars, -1)) {
1565 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1566 } else {
1567 *status = U_INVALID_FORMAT_ERROR;
1568 error(line, "invariant characters required for table keys");
1569 return NULL;
1570 }
1571
1572 if (U_FAILURE(*status))
1573 {
1574 error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status));
1575 return NULL;
1576 }
1577
1578 member = parseResource(state, subtag, &comment, status);
1579
1580 if (member == NULL || U_FAILURE(*status))
1581 {
1582 error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status));
1583 return NULL;
1584 }
1585
1586 table->add(member, line, *status);
1587
1588 if (U_FAILURE(*status))
1589 {
1590 error(line, "parse error. Stopped parsing table with %s", u_errorName(*status));
1591 return NULL;
1592 }
1593 readToken = true;
1594 ustr_deinit(&comment);
1595 }
1596
1597 /* not reached */
1598 /* A compiler warning will appear if all paths don't contain a return statement. */
1599 /* *status = U_INTERNAL_PROGRAM_ERROR;
1600 return NULL;*/
1601 }
1602
1603 static struct SResource *
parseTable(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1604 parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1605 {
1606 if (tag != NULL && uprv_strcmp(tag, "CollationElements") == 0)
1607 {
1608 return parseCollationElements(state, tag, startline, false, status);
1609 }
1610 if (tag != NULL && uprv_strcmp(tag, "collations") == 0)
1611 {
1612 return parseCollationElements(state, tag, startline, true, status);
1613 }
1614 if(isVerbose()){
1615 printf(" table %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1616 }
1617
1618 TableResource *result = table_open(state->bundle, tag, comment, status);
1619
1620 if (result == NULL || U_FAILURE(*status))
1621 {
1622 return NULL;
1623 }
1624 return realParseTable(state, result, tag, startline, status);
1625 }
1626
1627 static struct SResource *
parseArray(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1628 parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1629 {
1630 struct SResource *member = NULL;
1631 struct UString *tokenValue;
1632 struct UString memberComments;
1633 enum ETokenType token;
1634 UBool readToken = false;
1635
1636 ArrayResource *result = array_open(state->bundle, tag, comment, status);
1637
1638 if (result == NULL || U_FAILURE(*status))
1639 {
1640 return NULL;
1641 }
1642 if(isVerbose()){
1643 printf(" array %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1644 }
1645
1646 ustr_init(&memberComments);
1647
1648 /* '{' . resource [','] '}' */
1649 for (;;)
1650 {
1651 /* reset length */
1652 ustr_setlen(&memberComments, 0, status);
1653
1654 /* check for end of array, but don't consume next token unless it really is the end */
1655 token = peekToken(state, 0, &tokenValue, NULL, &memberComments, status);
1656
1657
1658 if (token == TOK_CLOSE_BRACE)
1659 {
1660 getToken(state, NULL, NULL, NULL, status);
1661 if (!readToken) {
1662 warning(startline, "Encountered empty array");
1663 }
1664 break;
1665 }
1666
1667 if (token == TOK_EOF)
1668 {
1669 res_close(result);
1670 *status = U_INVALID_FORMAT_ERROR;
1671 error(startline, "unterminated array");
1672 return NULL;
1673 }
1674
1675 /* string arrays are a special case */
1676 if (token == TOK_STRING)
1677 {
1678 getToken(state, &tokenValue, &memberComments, NULL, status);
1679 member = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, &memberComments, status);
1680 }
1681 else
1682 {
1683 member = parseResource(state, NULL, &memberComments, status);
1684 }
1685
1686 if (member == NULL || U_FAILURE(*status))
1687 {
1688 res_close(result);
1689 return NULL;
1690 }
1691
1692 result->add(member);
1693
1694 /* eat optional comma if present */
1695 token = peekToken(state, 0, NULL, NULL, NULL, status);
1696
1697 if (token == TOK_COMMA)
1698 {
1699 getToken(state, NULL, NULL, NULL, status);
1700 }
1701
1702 if (U_FAILURE(*status))
1703 {
1704 res_close(result);
1705 return NULL;
1706 }
1707 readToken = true;
1708 }
1709
1710 ustr_deinit(&memberComments);
1711 return result;
1712 }
1713
1714 static struct SResource *
parseIntVector(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1715 parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1716 {
1717 enum ETokenType token;
1718 char *string;
1719 int32_t value;
1720 UBool readToken = false;
1721 char *stopstring;
1722 struct UString memberComments;
1723
1724 IntVectorResource *result = intvector_open(state->bundle, tag, comment, status);
1725
1726 if (result == NULL || U_FAILURE(*status))
1727 {
1728 return NULL;
1729 }
1730
1731 if(isVerbose()){
1732 printf(" vector %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1733 }
1734 ustr_init(&memberComments);
1735 /* '{' . string [','] '}' */
1736 for (;;)
1737 {
1738 ustr_setlen(&memberComments, 0, status);
1739
1740 /* check for end of array, but don't consume next token unless it really is the end */
1741 token = peekToken(state, 0, NULL, NULL,&memberComments, status);
1742
1743 if (token == TOK_CLOSE_BRACE)
1744 {
1745 /* it's the end, consume the close brace */
1746 getToken(state, NULL, NULL, NULL, status);
1747 if (!readToken) {
1748 warning(startline, "Encountered empty int vector");
1749 }
1750 ustr_deinit(&memberComments);
1751 return result;
1752 }
1753
1754 int32_t stringLength;
1755 string = getInvariantString(state, NULL, NULL, stringLength, status);
1756
1757 if (U_FAILURE(*status))
1758 {
1759 res_close(result);
1760 return NULL;
1761 }
1762
1763 /* For handling illegal char in the Intvector */
1764 value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/
1765 int32_t len = (int32_t)(stopstring-string);
1766
1767 if(len==stringLength)
1768 {
1769 result->add(value, *status);
1770 uprv_free(string);
1771 token = peekToken(state, 0, NULL, NULL, NULL, status);
1772 }
1773 else
1774 {
1775 uprv_free(string);
1776 *status=U_INVALID_CHAR_FOUND;
1777 }
1778
1779 if (U_FAILURE(*status))
1780 {
1781 res_close(result);
1782 return NULL;
1783 }
1784
1785 /* the comma is optional (even though it is required to prevent the reader from concatenating
1786 consecutive entries) so that a missing comma on the last entry isn't an error */
1787 if (token == TOK_COMMA)
1788 {
1789 getToken(state, NULL, NULL, NULL, status);
1790 }
1791 readToken = true;
1792 }
1793
1794 /* not reached */
1795 /* A compiler warning will appear if all paths don't contain a return statement. */
1796 /* intvector_close(result, status);
1797 *status = U_INTERNAL_PROGRAM_ERROR;
1798 return NULL;*/
1799 }
1800
1801 static struct SResource *
parseBinary(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1802 parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1803 {
1804 uint32_t line;
1805 int32_t stringLength;
1806 LocalMemory<char> string(getInvariantString(state, &line, NULL, stringLength, status));
1807 if (string.isNull() || U_FAILURE(*status))
1808 {
1809 return NULL;
1810 }
1811
1812 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1813 if (U_FAILURE(*status))
1814 {
1815 return NULL;
1816 }
1817
1818 if(isVerbose()){
1819 printf(" binary %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1820 }
1821
1822 LocalMemory<uint8_t> value;
1823 int32_t count = 0;
1824 if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == NULL)
1825 {
1826 *status = U_MEMORY_ALLOCATION_ERROR;
1827 return NULL;
1828 }
1829
1830 char toConv[3] = {'\0', '\0', '\0'};
1831 for (int32_t i = 0; i < stringLength;)
1832 {
1833 // Skip spaces (which may have been line endings).
1834 char c0 = string[i++];
1835 if (c0 == ' ') { continue; }
1836 if (i == stringLength) {
1837 *status=U_INVALID_CHAR_FOUND;
1838 error(line, "Encountered invalid binary value (odd number of hex digits)");
1839 return NULL;
1840 }
1841 toConv[0] = c0;
1842 toConv[1] = string[i++];
1843
1844 char *stopstring;
1845 value[count++] = (uint8_t) uprv_strtoul(toConv, &stopstring, 16);
1846 uint32_t len=(uint32_t)(stopstring-toConv);
1847
1848 if(len!=2)
1849 {
1850 *status=U_INVALID_CHAR_FOUND;
1851 error(line, "Encountered invalid binary value (not all pairs of hex digits)");
1852 return NULL;
1853 }
1854 }
1855
1856 if (count == 0) {
1857 warning(startline, "Encountered empty binary value");
1858 return bin_open(state->bundle, tag, 0, NULL, "", comment, status);
1859 } else {
1860 return bin_open(state->bundle, tag, count, value.getAlias(), NULL, comment, status);
1861 }
1862 }
1863
1864 static struct SResource *
parseInteger(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1865 parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1866 {
1867 struct SResource *result = NULL;
1868 int32_t value;
1869 char *string;
1870 char *stopstring;
1871
1872 int32_t stringLength;
1873 string = getInvariantString(state, NULL, NULL, stringLength, status);
1874
1875 if (string == NULL || U_FAILURE(*status))
1876 {
1877 return NULL;
1878 }
1879
1880 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1881
1882 if (U_FAILURE(*status))
1883 {
1884 uprv_free(string);
1885 return NULL;
1886 }
1887
1888 if(isVerbose()){
1889 printf(" integer %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1890 }
1891
1892 if (stringLength == 0)
1893 {
1894 warning(startline, "Encountered empty integer. Default value is 0.");
1895 }
1896
1897 /* Allow integer support for hexdecimal, octal digit and decimal*/
1898 /* and handle illegal char in the integer*/
1899 value = uprv_strtoul(string, &stopstring, 0);
1900 int32_t len = (int32_t)(stopstring-string);
1901 if(len==stringLength)
1902 {
1903 result = int_open(state->bundle, tag, value, comment, status);
1904 }
1905 else
1906 {
1907 *status=U_INVALID_CHAR_FOUND;
1908 }
1909 uprv_free(string);
1910
1911 return result;
1912 }
1913
1914 static struct SResource *
parseImport(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1915 parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1916 {
1917 uint32_t line;
1918 int32_t stringLength;
1919 LocalMemory<char> filename(getInvariantString(state, &line, NULL, stringLength, status));
1920 if (U_FAILURE(*status))
1921 {
1922 return NULL;
1923 }
1924
1925 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1926
1927 if (U_FAILURE(*status))
1928 {
1929 return NULL;
1930 }
1931
1932 if(isVerbose()){
1933 printf(" import %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1934 }
1935
1936 /* Open the input file for reading */
1937 CharString fullname;
1938 if (state->inputdir != NULL) {
1939 fullname.append(state->inputdir, *status);
1940 }
1941 fullname.appendPathPart(filename.getAlias(), *status);
1942 if (U_FAILURE(*status)) {
1943 return NULL;
1944 }
1945
1946 FileStream *file = T_FileStream_open(fullname.data(), "rb");
1947 if (file == NULL)
1948 {
1949 error(line, "couldn't open input file %s", filename.getAlias());
1950 *status = U_FILE_ACCESS_ERROR;
1951 return NULL;
1952 }
1953
1954 int32_t len = T_FileStream_size(file);
1955 LocalMemory<uint8_t> data;
1956 if(data.allocateInsteadAndCopy(len) == NULL)
1957 {
1958 *status = U_MEMORY_ALLOCATION_ERROR;
1959 T_FileStream_close (file);
1960 return NULL;
1961 }
1962
1963 /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len);
1964 T_FileStream_close (file);
1965
1966 return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status);
1967 }
1968
1969 static struct SResource *
parseInclude(ParseState * state,char * tag,uint32_t startline,const struct UString * comment,UErrorCode * status)1970 parseInclude(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1971 {
1972 struct SResource *result;
1973 int32_t len=0;
1974 char *filename;
1975 uint32_t line;
1976 UChar *pTarget = NULL;
1977
1978 UCHARBUF *ucbuf;
1979 char *fullname = NULL;
1980 const char* cp = NULL;
1981 const UChar* uBuffer = NULL;
1982
1983 int32_t stringLength;
1984 filename = getInvariantString(state, &line, NULL, stringLength, status);
1985
1986 if (U_FAILURE(*status))
1987 {
1988 return NULL;
1989 }
1990
1991 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1992
1993 if (U_FAILURE(*status))
1994 {
1995 uprv_free(filename);
1996 return NULL;
1997 }
1998
1999 if(isVerbose()){
2000 printf(" include %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
2001 }
2002
2003 fullname = (char *) uprv_malloc(state->inputdirLength + stringLength + 2);
2004 /* test for NULL */
2005 if(fullname == NULL)
2006 {
2007 *status = U_MEMORY_ALLOCATION_ERROR;
2008 uprv_free(filename);
2009 return NULL;
2010 }
2011
2012 if(state->inputdir!=NULL){
2013 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
2014 {
2015
2016 uprv_strcpy(fullname, state->inputdir);
2017
2018 fullname[state->inputdirLength] = U_FILE_SEP_CHAR;
2019 fullname[state->inputdirLength + 1] = '\0';
2020
2021 uprv_strcat(fullname, filename);
2022 }
2023 else
2024 {
2025 uprv_strcpy(fullname, state->inputdir);
2026 uprv_strcat(fullname, filename);
2027 }
2028 }else{
2029 uprv_strcpy(fullname,filename);
2030 }
2031
2032 ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),false,status);
2033
2034 if (U_FAILURE(*status)) {
2035 error(line, "couldn't open input file %s\n", filename);
2036 return NULL;
2037 }
2038
2039 uBuffer = ucbuf_getBuffer(ucbuf,&len,status);
2040 result = string_open(state->bundle, tag, uBuffer, len, comment, status);
2041
2042 ucbuf_close(ucbuf);
2043
2044 uprv_free(pTarget);
2045
2046 uprv_free(filename);
2047 uprv_free(fullname);
2048
2049 return result;
2050 }
2051
2052
2053
2054
2055
/*
 * UChar versions of the resource type keywords that may follow the ':' in a
 * resource definition. Each name declared here is given its value by the
 * matching U_STRING_INIT in initParser(); the last macro argument is the
 * number of characters in the literal.
 *
 * k_type_bin and k_type_int have no row of their own in gResourceTypes:
 * parseResourceType() treats them as aliases for "binary" and "integer".
 */
U_STRING_DECL(k_type_string, "string", 6);
U_STRING_DECL(k_type_binary, "binary", 6);
U_STRING_DECL(k_type_bin, "bin", 3);
U_STRING_DECL(k_type_table, "table", 5);
U_STRING_DECL(k_type_table_no_fallback, "table(nofallback)", 17);
U_STRING_DECL(k_type_int, "int", 3);
U_STRING_DECL(k_type_integer, "integer", 7);
U_STRING_DECL(k_type_array, "array", 5);
U_STRING_DECL(k_type_alias, "alias", 5);
U_STRING_DECL(k_type_intvector, "intvector", 9);
U_STRING_DECL(k_type_import, "import", 6);
U_STRING_DECL(k_type_include, "include", 7);

/* Various non-standard processing plugins that create one or more special resources. */
U_STRING_DECL(k_type_plugin_uca_rules, "process(uca_rules)", 18);
U_STRING_DECL(k_type_plugin_collation, "process(collation)", 18);
U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)", 23);
U_STRING_DECL(k_type_plugin_dependency, "process(dependency)", 19);
2074
/*
 * Resource types recognised by the parser.
 *
 * The enumerators are used directly as indexes into gResourceTypes, so the
 * order here must stay in step with the rows of that table.
 * RESTYPE_UNKNOWN means "no explicit type given"; RESTYPE_RESERVED is the
 * end marker that stops the keyword search in parseResourceType().
 */
typedef enum EResourceType
{
    RESTYPE_UNKNOWN,
    RESTYPE_STRING,
    RESTYPE_BINARY,
    RESTYPE_TABLE,
    RESTYPE_TABLE_NO_FALLBACK,  /* only accepted for the top-level bundle */
    RESTYPE_INTEGER,
    RESTYPE_ARRAY,
    RESTYPE_ALIAS,
    RESTYPE_INTVECTOR,
    RESTYPE_IMPORT,
    RESTYPE_INCLUDE,
    RESTYPE_PROCESS_UCA_RULES,
    RESTYPE_PROCESS_COLLATION,
    RESTYPE_PROCESS_TRANSLITERATOR,
    RESTYPE_PROCESS_DEPENDENCY,
    RESTYPE_RESERVED
} EResourceType;
2094
/*
 * Per-type lookup table, indexed by EResourceType (one row per enumerator,
 * in the same order).
 *
 *  - nameChars:     printable name, used in error messages
 *  - nameUChars:    keyword compared against the type token by
 *                   parseResourceType(); NULL for the two marker rows
 *  - parseFunction: parser invoked by parseResource() for this type; a NULL
 *                   entry makes parseResource() report an internal error
 */
static struct {
    const char *nameChars;   /* only used for debugging */
    const UChar *nameUChars;
    ParseResourceFunction *parseFunction;
} gResourceTypes[] = {
    {"Unknown", NULL, NULL},
    {"string", k_type_string, parseString},
    {"binary", k_type_binary, parseBinary},
    {"table", k_type_table, parseTable},
    {"table(nofallback)", k_type_table_no_fallback, NULL}, /* parseFunction will never be called */
    {"integer", k_type_integer, parseInteger},
    {"array", k_type_array, parseArray},
    {"alias", k_type_alias, parseAlias},
    {"intvector", k_type_intvector, parseIntVector},
    {"import", k_type_import, parseImport},
    {"include", k_type_include, parseInclude},
    {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules},
    {"process(collation)", k_type_plugin_collation, NULL /* not implemented yet */},
    {"process(transliterator)", k_type_plugin_transliterator, parseTransliterator},
    {"process(dependency)", k_type_plugin_dependency, parseDependency},
    {"reserved", NULL, NULL}
};
2117
/*
 * Gives every type keyword declared with U_STRING_DECL its value by running
 * the matching U_STRING_INIT (same literal, same length).
 * NOTE(review): presumably this has to run before the first call to
 * parse(), since parseResourceType() compares tokens against these
 * strings - confirm with the callers.
 */
void initParser()
{
    U_STRING_INIT(k_type_string, "string", 6);
    U_STRING_INIT(k_type_binary, "binary", 6);
    U_STRING_INIT(k_type_bin, "bin", 3);
    U_STRING_INIT(k_type_table, "table", 5);
    U_STRING_INIT(k_type_table_no_fallback, "table(nofallback)", 17);
    U_STRING_INIT(k_type_int, "int", 3);
    U_STRING_INIT(k_type_integer, "integer", 7);
    U_STRING_INIT(k_type_array, "array", 5);
    U_STRING_INIT(k_type_alias, "alias", 5);
    U_STRING_INIT(k_type_intvector, "intvector", 9);
    U_STRING_INIT(k_type_import, "import", 6);
    U_STRING_INIT(k_type_include, "include", 7);

    /* process(...) plugin keywords */
    U_STRING_INIT(k_type_plugin_uca_rules, "process(uca_rules)", 18);
    U_STRING_INIT(k_type_plugin_collation, "process(collation)", 18);
    U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)", 23);
    U_STRING_INIT(k_type_plugin_dependency, "process(dependency)", 19);
}
2138
isTable(enum EResourceType type)2139 static inline UBool isTable(enum EResourceType type) {
2140 return (UBool)(type==RESTYPE_TABLE || type==RESTYPE_TABLE_NO_FALLBACK);
2141 }
2142
2143 static enum EResourceType
parseResourceType(ParseState * state,UErrorCode * status)2144 parseResourceType(ParseState* state, UErrorCode *status)
2145 {
2146 struct UString *tokenValue;
2147 struct UString comment;
2148 enum EResourceType result = RESTYPE_UNKNOWN;
2149 uint32_t line=0;
2150 ustr_init(&comment);
2151 expect(state, TOK_STRING, &tokenValue, &comment, &line, status);
2152
2153 if (U_FAILURE(*status))
2154 {
2155 return RESTYPE_UNKNOWN;
2156 }
2157
2158 *status = U_ZERO_ERROR;
2159
2160 /* Search for normal types */
2161 result=RESTYPE_UNKNOWN;
2162 while ((result=(EResourceType)(result+1)) < RESTYPE_RESERVED) {
2163 if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) {
2164 break;
2165 }
2166 }
2167 /* Now search for the aliases */
2168 if (u_strcmp(tokenValue->fChars, k_type_int) == 0) {
2169 result = RESTYPE_INTEGER;
2170 }
2171 else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) {
2172 result = RESTYPE_BINARY;
2173 }
2174 else if (result == RESTYPE_RESERVED) {
2175 char tokenBuffer[1024];
2176 u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer));
2177 tokenBuffer[sizeof(tokenBuffer) - 1] = 0;
2178 *status = U_INVALID_FORMAT_ERROR;
2179 error(line, "unknown resource type '%s'", tokenBuffer);
2180 }
2181
2182 return result;
2183 }
2184
2185 /* parse a non-top-level resource */
2186 static struct SResource *
parseResource(ParseState * state,char * tag,const struct UString * comment,UErrorCode * status)2187 parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status)
2188 {
2189 enum ETokenType token;
2190 enum EResourceType resType = RESTYPE_UNKNOWN;
2191 ParseResourceFunction *parseFunction = NULL;
2192 struct UString *tokenValue;
2193 uint32_t startline;
2194 uint32_t line;
2195
2196
2197 token = getToken(state, &tokenValue, NULL, &startline, status);
2198
2199 if(isVerbose()){
2200 printf(" resource %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
2201 }
2202
2203 /* name . [ ':' type ] '{' resource '}' */
2204 /* This function parses from the colon onwards. If the colon is present, parse the
2205 type then try to parse a resource of that type. If there is no explicit type,
2206 work it out using the lookahead tokens. */
2207 switch (token)
2208 {
2209 case TOK_EOF:
2210 *status = U_INVALID_FORMAT_ERROR;
2211 error(startline, "Unexpected EOF encountered");
2212 return NULL;
2213
2214 case TOK_ERROR:
2215 *status = U_INVALID_FORMAT_ERROR;
2216 return NULL;
2217
2218 case TOK_COLON:
2219 resType = parseResourceType(state, status);
2220 expect(state, TOK_OPEN_BRACE, &tokenValue, NULL, &startline, status);
2221
2222 if (U_FAILURE(*status))
2223 {
2224 return NULL;
2225 }
2226
2227 break;
2228
2229 case TOK_OPEN_BRACE:
2230 break;
2231
2232 default:
2233 *status = U_INVALID_FORMAT_ERROR;
2234 error(startline, "syntax error while reading a resource, expected '{' or ':'");
2235 return NULL;
2236 }
2237
2238
2239 if (resType == RESTYPE_UNKNOWN)
2240 {
2241 /* No explicit type, so try to work it out. At this point, we've read the first '{'.
2242 We could have any of the following:
2243 { { => array (nested)
2244 { :/} => array
2245 { string , => string array
2246
2247 { string { => table
2248
2249 { string :/{ => table
2250 { string } => string
2251 */
2252
2253 token = peekToken(state, 0, NULL, &line, NULL,status);
2254
2255 if (U_FAILURE(*status))
2256 {
2257 return NULL;
2258 }
2259
2260 if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE )
2261 {
2262 resType = RESTYPE_ARRAY;
2263 }
2264 else if (token == TOK_STRING)
2265 {
2266 token = peekToken(state, 1, NULL, &line, NULL, status);
2267
2268 if (U_FAILURE(*status))
2269 {
2270 return NULL;
2271 }
2272
2273 switch (token)
2274 {
2275 case TOK_COMMA: resType = RESTYPE_ARRAY; break;
2276 case TOK_OPEN_BRACE: resType = RESTYPE_TABLE; break;
2277 case TOK_CLOSE_BRACE: resType = RESTYPE_STRING; break;
2278 case TOK_COLON: resType = RESTYPE_TABLE; break;
2279 default:
2280 *status = U_INVALID_FORMAT_ERROR;
2281 error(line, "Unexpected token after string, expected ',', '{' or '}'");
2282 return NULL;
2283 }
2284 }
2285 else
2286 {
2287 *status = U_INVALID_FORMAT_ERROR;
2288 error(line, "Unexpected token after '{'");
2289 return NULL;
2290 }
2291
2292 /* printf("Type guessed as %s\n", resourceNames[resType]); */
2293 } else if(resType == RESTYPE_TABLE_NO_FALLBACK) {
2294 *status = U_INVALID_FORMAT_ERROR;
2295 error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars);
2296 return NULL;
2297 }
2298
2299
2300 /* We should now know what we need to parse next, so call the appropriate parser
2301 function and return. */
2302 parseFunction = gResourceTypes[resType].parseFunction;
2303 if (parseFunction != NULL) {
2304 return parseFunction(state, tag, startline, comment, status);
2305 }
2306 else {
2307 *status = U_INTERNAL_PROGRAM_ERROR;
2308 error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars);
2309 }
2310
2311 return NULL;
2312 }
2313
2314 /* parse the top-level resource */
2315 struct SRBRoot *
parse(UCHARBUF * buf,const char * inputDir,const char * outputDir,const char * filename,UBool makeBinaryCollation,UBool omitCollationRules,UBool icu4xMode,UErrorCode * status)2316 parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
2317 UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status)
2318 {
2319 struct UString *tokenValue;
2320 struct UString comment;
2321 uint32_t line;
2322 enum EResourceType bundleType;
2323 enum ETokenType token;
2324 ParseState state;
2325 uint32_t i;
2326
2327
2328 for (i = 0; i < MAX_LOOKAHEAD + 1; i++)
2329 {
2330 ustr_init(&state.lookahead[i].value);
2331 ustr_init(&state.lookahead[i].comment);
2332 }
2333
2334 initLookahead(&state, buf, status);
2335
2336 state.inputdir = inputDir;
2337 state.inputdirLength = (state.inputdir != NULL) ? (uint32_t)uprv_strlen(state.inputdir) : 0;
2338 state.outputdir = outputDir;
2339 state.outputdirLength = (state.outputdir != NULL) ? (uint32_t)uprv_strlen(state.outputdir) : 0;
2340 state.filename = filename;
2341 state.makeBinaryCollation = makeBinaryCollation;
2342 state.omitCollationRules = omitCollationRules;
2343 state.icu4xMode = icu4xMode;
2344
2345 ustr_init(&comment);
2346 expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status);
2347
2348 state.bundle = new SRBRoot(&comment, false, *status);
2349
2350 if (state.bundle == NULL || U_FAILURE(*status))
2351 {
2352 delete state.bundle;
2353
2354 return NULL;
2355 }
2356
2357
2358 state.bundle->setLocale(tokenValue->fChars, *status);
2359
2360 /* The following code is to make Empty bundle work no matter with :table specifer or not */
2361 token = getToken(&state, NULL, NULL, &line, status);
2362 if(token==TOK_COLON) {
2363 *status=U_ZERO_ERROR;
2364 bundleType=parseResourceType(&state, status);
2365
2366 if(isTable(bundleType))
2367 {
2368 expect(&state, TOK_OPEN_BRACE, NULL, NULL, &line, status);
2369 }
2370 else
2371 {
2372 *status=U_PARSE_ERROR;
2373 error(line, "parse error. Stopped parsing with %s", u_errorName(*status));
2374 }
2375 }
2376 else
2377 {
2378 /* not a colon */
2379 if(token==TOK_OPEN_BRACE)
2380 {
2381 *status=U_ZERO_ERROR;
2382 bundleType=RESTYPE_TABLE;
2383 }
2384 else
2385 {
2386 /* neither colon nor open brace */
2387 *status=U_PARSE_ERROR;
2388 bundleType=RESTYPE_UNKNOWN;
2389 error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status));
2390 }
2391 }
2392
2393 if (U_FAILURE(*status))
2394 {
2395 delete state.bundle;
2396 return NULL;
2397 }
2398
2399 if(bundleType==RESTYPE_TABLE_NO_FALLBACK) {
2400 /*
2401 * Parse a top-level table with the table(nofallback) declaration.
2402 * This is the same as a regular table, but also sets the
2403 * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] .
2404 */
2405 state.bundle->fNoFallback=true;
2406 }
2407 /* top-level tables need not handle special table names like "collations" */
2408 assert(!state.bundle->fIsPoolBundle);
2409 assert(state.bundle->fRoot->fType == URES_TABLE);
2410 TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot);
2411 realParseTable(&state, rootTable, NULL, line, status);
2412 if(dependencyArray!=NULL){
2413 rootTable->add(dependencyArray, 0, *status);
2414 dependencyArray = NULL;
2415 }
2416 if (U_FAILURE(*status))
2417 {
2418 delete state.bundle;
2419 res_close(dependencyArray);
2420 return NULL;
2421 }
2422
2423 if (getToken(&state, NULL, NULL, &line, status) != TOK_EOF)
2424 {
2425 warning(line, "extraneous text after resource bundle (perhaps unmatched braces)");
2426 if(isStrict()){
2427 *status = U_INVALID_FORMAT_ERROR;
2428 return NULL;
2429 }
2430 }
2431
2432 cleanupLookahead(&state);
2433 ustr_deinit(&comment);
2434 return state.bundle;
2435 }
2436