1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2011-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ppucd.cpp
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2011dec11
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/uchar.h"
19 #include "charstr.h"
20 #include "cstring.h"
21 #include "ppucd.h"
22 #include "uassert.h"
23 #include "uparse.h"
24
25 #include <stdio.h>
26 #include <string.h>
27
28 U_NAMESPACE_BEGIN
29
~PropertyNames()30 PropertyNames::~PropertyNames() {}
31
32 int32_t
getPropertyEnum(const char * name) const33 PropertyNames::getPropertyEnum(const char *name) const {
34 return u_getPropertyEnum(name);
35 }
36
37 int32_t
getPropertyValueEnum(int32_t property,const char * name) const38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
39 return u_getPropertyValueEnum((UProperty)property, name);
40 }
41
UniProps()42 UniProps::UniProps()
43 : start(U_SENTINEL), end(U_SENTINEL),
44 bmg(U_SENTINEL), bpb(U_SENTINEL),
45 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
46 digitValue(-1), numericValue(NULL),
47 name(NULL), nameAlias(NULL) {
48 memset(binProps, 0, sizeof(binProps));
49 memset(intProps, 0, sizeof(intProps));
50 memset(age, 0, 4);
51 }
52
~UniProps()53 UniProps::~UniProps() {}
54
// Out-of-class definition of the static constant that readLine() uses
// as the wrap-around limit when cycling through the line buffers.
const int32_t PreparsedUCD::kNumLineBuffers;
56
PreparsedUCD(const char * filename,UErrorCode & errorCode)57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
58 : icuPnames(new PropertyNames()), pnames(icuPnames),
59 file(NULL),
60 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
61 lineNumber(0),
62 lineType(NO_LINE),
63 fieldLimit(NULL), lineLimit(NULL) {
64 if(U_FAILURE(errorCode)) { return; }
65
66 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
67 filename=NULL;
68 file=stdin;
69 } else {
70 file=fopen(filename, "r");
71 }
72 if(file==NULL) {
73 perror("error opening preparsed UCD");
74 fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
75 errorCode=U_FILE_ACCESS_ERROR;
76 return;
77 }
78
79 memset(ucdVersion, 0, 4);
80 lines[0][0]=0;
81 }
82
~PreparsedUCD()83 PreparsedUCD::~PreparsedUCD() {
84 if(file!=stdin) {
85 fclose(file);
86 }
87 delete icuPnames;
88 }
89
// First-field keywords of preparsed UCD lines, indexed by LineType value
// (same order as the LineType values).
// The first two slots are NULL placeholders for line types without a keyword;
// readLine() starts its keyword search at EMPTY_LINE+1 and so is not expected
// to compare against them (presumably they are NO_LINE and EMPTY_LINE).
static const char *lineTypeStrings[]={
    NULL, NULL,
    "ucd",
    "property", "binary", "value",
    "defaults", "block", "cp", "unassigned",
    "algnamesrange"
};
104
105 PreparsedUCD::LineType
readLine(UErrorCode & errorCode)106 PreparsedUCD::readLine(UErrorCode &errorCode) {
107 if(U_FAILURE(errorCode)) { return NO_LINE; }
108 // Select the next available line buffer.
109 while(!isLineBufferAvailable(lineIndex)) {
110 ++lineIndex;
111 if (lineIndex == kNumLineBuffers) {
112 lineIndex = 0;
113 }
114 }
115 char *line=lines[lineIndex];
116 *line=0;
117 lineLimit=fieldLimit=line;
118 lineType=NO_LINE;
119 char *result=fgets(line, sizeof(lines[0]), file);
120 if(result==NULL) {
121 if(ferror(file)) {
122 perror("error reading preparsed UCD");
123 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
124 errorCode=U_FILE_ACCESS_ERROR;
125 }
126 return NO_LINE;
127 }
128 ++lineNumber;
129 if(*line=='#') {
130 fieldLimit=strchr(line, 0);
131 return lineType=EMPTY_LINE;
132 }
133 // Remove trailing /r/n.
134 char c;
135 char *limit=strchr(line, 0);
136 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
137 // Remove trailing white space.
138 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
139 *limit=0;
140 lineLimit=limit;
141 if(line==limit) {
142 fieldLimit=limit;
143 return lineType=EMPTY_LINE;
144 }
145 // Split by ';'.
146 char *semi=line;
147 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
148 fieldLimit=strchr(line, 0);
149 // Determine the line type.
150 int32_t type;
151 for(type=EMPTY_LINE+1;; ++type) {
152 if(type==LINE_TYPE_COUNT) {
153 fprintf(stderr,
154 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
155 line, (long)lineNumber);
156 errorCode=U_PARSE_ERROR;
157 return NO_LINE;
158 }
159 if(0==strcmp(line, lineTypeStrings[type])) {
160 break;
161 }
162 }
163 lineType=(LineType)type;
164 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
165 u_versionFromString(ucdVersion, fieldLimit+1);
166 }
167 return lineType;
168 }
169
170 const char *
firstField()171 PreparsedUCD::firstField() {
172 char *field=lines[lineIndex];
173 fieldLimit=strchr(field, 0);
174 return field;
175 }
176
177 const char *
nextField()178 PreparsedUCD::nextField() {
179 if(fieldLimit==lineLimit) { return NULL; }
180 char *field=fieldLimit+1;
181 fieldLimit=strchr(field, 0);
182 return field;
183 }
184
185 const UniProps *
getProps(UnicodeSet & newValues,UErrorCode & errorCode)186 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
187 if(U_FAILURE(errorCode)) { return NULL; }
188 newValues.clear();
189 if(!lineHasPropertyValues()) {
190 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
191 return NULL;
192 }
193 firstField();
194 const char *field=nextField();
195 if(field==NULL) {
196 // No range field after the type.
197 fprintf(stderr,
198 "error in preparsed UCD: missing default/block/cp range field "
199 "(no second field) on line %ld\n",
200 (long)lineNumber);
201 errorCode=U_PARSE_ERROR;
202 return NULL;
203 }
204 UChar32 start, end;
205 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
206 UniProps *props;
207 UBool insideBlock=FALSE; // TRUE if cp or unassigned range inside the block range.
208 switch(lineType) {
209 case DEFAULTS_LINE:
210 // Should occur before any block/cp/unassigned line.
211 if(blockLineIndex>=0) {
212 fprintf(stderr,
213 "error in preparsed UCD: default line %ld after one or more block lines\n",
214 (long)lineNumber);
215 errorCode=U_PARSE_ERROR;
216 return NULL;
217 }
218 if(defaultLineIndex>=0) {
219 fprintf(stderr,
220 "error in preparsed UCD: second line with default properties on line %ld\n",
221 (long)lineNumber);
222 errorCode=U_PARSE_ERROR;
223 return NULL;
224 }
225 if(start!=0 || end!=0x10ffff) {
226 fprintf(stderr,
227 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
228 field, (long)lineNumber);
229 errorCode=U_PARSE_ERROR;
230 return NULL;
231 }
232 props=&defaultProps;
233 defaultLineIndex=lineIndex;
234 break;
235 case BLOCK_LINE:
236 blockProps=defaultProps; // Block inherits default properties.
237 props=&blockProps;
238 blockLineIndex=lineIndex;
239 break;
240 case CP_LINE:
241 case UNASSIGNED_LINE:
242 if(blockProps.start<=start && end<=blockProps.end) {
243 insideBlock=TRUE;
244 if(lineType==CP_LINE) {
245 // Code point range fully inside the last block inherits the block properties.
246 cpProps=blockProps;
247 } else {
248 // Unassigned line inside the block is based on default properties
249 // which override block properties.
250 cpProps=defaultProps;
251 newValues=blockValues;
252 // Except, it inherits the one blk=Block property.
253 int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
254 cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
255 newValues.remove((UChar32)UCHAR_BLOCK);
256 }
257 } else if(start>blockProps.end || end<blockProps.start) {
258 // Code point range fully outside the last block inherits the default properties.
259 cpProps=defaultProps;
260 } else {
261 // Code point range partially overlapping with the last block is illegal.
262 fprintf(stderr,
263 "error in preparsed UCD: cp range %s on line %ld only "
264 "partially overlaps with block range %04lX..%04lX\n",
265 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
266 errorCode=U_PARSE_ERROR;
267 return NULL;
268 }
269 props=&cpProps;
270 break;
271 default:
272 // Will not occur because of the range check above.
273 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
274 return NULL;
275 }
276 props->start=start;
277 props->end=end;
278 while((field=nextField())!=NULL) {
279 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
280 }
281 if(lineType==BLOCK_LINE) {
282 blockValues=newValues;
283 } else if(lineType==UNASSIGNED_LINE && insideBlock) {
284 // Unset newValues for values that are the same as the block values.
285 for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
286 if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
287 newValues.remove(prop);
288 }
289 }
290 for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
291 int32_t index=prop-UCHAR_INT_START;
292 if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
293 newValues.remove(prop);
294 }
295 }
296 }
297 return props;
298 }
299
// Property names that are specific to the preparsed UCD file format.
// parseProperty() searches this table (comparing names with uprv_stricmp())
// when pnames->getPropertyEnum() returns a negative value for a name,
// and maps a match to the corresponding PPUCD_* constant.
static const struct {
    const char *name;  // Property name as it appears before '=' in a field.
    int32_t prop;      // PPUCD_* constant used as the property number.
} ppucdProperties[]={
    { "Name_Alias", PPUCD_NAME_ALIAS },
    { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
    { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
};
308
309 // Returns TRUE for "ok to continue parsing fields".
310 UBool
parseProperty(UniProps & props,const char * field,UnicodeSet & newValues,UErrorCode & errorCode)311 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
312 UErrorCode &errorCode) {
313 CharString pBuffer;
314 const char *p=field;
315 const char *v=strchr(p, '=');
316 int binaryValue;
317 if(*p=='-') {
318 if(v!=NULL) {
319 fprintf(stderr,
320 "error in preparsed UCD: mix of binary-property-no and "
321 "enum-property syntax '%s' on line %ld\n",
322 field, (long)lineNumber);
323 errorCode=U_PARSE_ERROR;
324 return FALSE;
325 }
326 binaryValue=0;
327 ++p;
328 } else if(v==NULL) {
329 binaryValue=1;
330 } else {
331 binaryValue=-1;
332 // Copy out the property name rather than modifying the field (writing a NUL).
333 pBuffer.append(p, (int32_t)(v-p), errorCode);
334 p=pBuffer.data();
335 ++v;
336 }
337 int32_t prop=pnames->getPropertyEnum(p);
338 if(prop<0) {
339 for(int32_t i=0;; ++i) {
340 if(i==UPRV_LENGTHOF(ppucdProperties)) {
341 // Ignore unknown property names.
342 return TRUE;
343 }
344 if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
345 prop=ppucdProperties[i].prop;
346 U_ASSERT(prop>=0);
347 break;
348 }
349 }
350 }
351 if(prop<UCHAR_BINARY_LIMIT) {
352 if(binaryValue>=0) {
353 props.binProps[prop]=(UBool)binaryValue;
354 } else {
355 // No binary value for a binary property.
356 fprintf(stderr,
357 "error in preparsed UCD: enum-property syntax '%s' "
358 "for binary property on line %ld\n",
359 field, (long)lineNumber);
360 errorCode=U_PARSE_ERROR;
361 }
362 } else if(binaryValue>=0) {
363 // Binary value for a non-binary property.
364 fprintf(stderr,
365 "error in preparsed UCD: binary-property syntax '%s' "
366 "for non-binary property on line %ld\n",
367 field, (long)lineNumber);
368 errorCode=U_PARSE_ERROR;
369 } else if (prop < UCHAR_INT_START) {
370 fprintf(stderr,
371 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
372 prop, (long)lineNumber);
373 errorCode=U_PARSE_ERROR;
374 } else if(prop<UCHAR_INT_LIMIT) {
375 int32_t value=pnames->getPropertyValueEnum(prop, v);
376 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
377 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
378 char *end;
379 unsigned long ccc=uprv_strtoul(v, &end, 10);
380 if(v<end && *end==0 && ccc<=254) {
381 value=(int32_t)ccc;
382 }
383 }
384 if(value==UCHAR_INVALID_CODE) {
385 fprintf(stderr,
386 "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
387 field, (long)lineNumber);
388 errorCode=U_PARSE_ERROR;
389 } else {
390 props.intProps[prop-UCHAR_INT_START]=value;
391 }
392 } else if(*v=='<') {
393 // Do not parse default values like <code point>, just set null values.
394 switch(prop) {
395 case UCHAR_BIDI_MIRRORING_GLYPH:
396 props.bmg=U_SENTINEL;
397 break;
398 case UCHAR_BIDI_PAIRED_BRACKET:
399 props.bpb=U_SENTINEL;
400 break;
401 case UCHAR_SIMPLE_CASE_FOLDING:
402 props.scf=U_SENTINEL;
403 break;
404 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
405 props.slc=U_SENTINEL;
406 break;
407 case UCHAR_SIMPLE_TITLECASE_MAPPING:
408 props.stc=U_SENTINEL;
409 break;
410 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
411 props.suc=U_SENTINEL;
412 break;
413 case UCHAR_CASE_FOLDING:
414 props.cf.remove();
415 break;
416 case UCHAR_LOWERCASE_MAPPING:
417 props.lc.remove();
418 break;
419 case UCHAR_TITLECASE_MAPPING:
420 props.tc.remove();
421 break;
422 case UCHAR_UPPERCASE_MAPPING:
423 props.uc.remove();
424 break;
425 case UCHAR_SCRIPT_EXTENSIONS:
426 props.scx.clear();
427 break;
428 default:
429 fprintf(stderr,
430 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
431 field, (long)lineNumber);
432 errorCode=U_PARSE_ERROR;
433 }
434 } else {
435 char c;
436 switch(prop) {
437 case UCHAR_NUMERIC_VALUE:
438 props.numericValue=v;
439 c=*v;
440 if('0'<=c && c<='9' && v[1]==0) {
441 props.digitValue=c-'0';
442 } else {
443 props.digitValue=-1;
444 }
445 break;
446 case UCHAR_NAME:
447 props.name=v;
448 break;
449 case UCHAR_AGE:
450 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
451 break;
452 case UCHAR_BIDI_MIRRORING_GLYPH:
453 props.bmg=parseCodePoint(v, errorCode);
454 break;
455 case UCHAR_BIDI_PAIRED_BRACKET:
456 props.bpb=parseCodePoint(v, errorCode);
457 break;
458 case UCHAR_SIMPLE_CASE_FOLDING:
459 props.scf=parseCodePoint(v, errorCode);
460 break;
461 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
462 props.slc=parseCodePoint(v, errorCode);
463 break;
464 case UCHAR_SIMPLE_TITLECASE_MAPPING:
465 props.stc=parseCodePoint(v, errorCode);
466 break;
467 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
468 props.suc=parseCodePoint(v, errorCode);
469 break;
470 case UCHAR_CASE_FOLDING:
471 parseString(v, props.cf, errorCode);
472 break;
473 case UCHAR_LOWERCASE_MAPPING:
474 parseString(v, props.lc, errorCode);
475 break;
476 case UCHAR_TITLECASE_MAPPING:
477 parseString(v, props.tc, errorCode);
478 break;
479 case UCHAR_UPPERCASE_MAPPING:
480 parseString(v, props.uc, errorCode);
481 break;
482 case PPUCD_NAME_ALIAS:
483 props.nameAlias=v;
484 break;
485 case PPUCD_CONDITIONAL_CASE_MAPPINGS:
486 case PPUCD_TURKIC_CASE_FOLDING:
487 // No need to parse their values: They are hardcoded in the runtime library.
488 break;
489 case UCHAR_SCRIPT_EXTENSIONS:
490 parseScriptExtensions(v, props.scx, errorCode);
491 break;
492 default:
493 // Ignore unhandled properties.
494 return TRUE;
495 }
496 }
497 if(U_SUCCESS(errorCode)) {
498 newValues.add((UChar32)prop);
499 return TRUE;
500 } else {
501 return FALSE;
502 }
503 }
504
505 UBool
getRangeForAlgNames(UChar32 & start,UChar32 & end,UErrorCode & errorCode)506 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
507 if(U_FAILURE(errorCode)) { return FALSE; }
508 if(lineType!=ALG_NAMES_RANGE_LINE) {
509 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
510 return FALSE;
511 }
512 firstField();
513 const char *field=nextField();
514 if(field==NULL) {
515 // No range field after the type.
516 fprintf(stderr,
517 "error in preparsed UCD: missing algnamesrange range field "
518 "(no second field) on line %ld\n",
519 (long)lineNumber);
520 errorCode=U_PARSE_ERROR;
521 return FALSE;
522 }
523 return parseCodePointRange(field, start, end, errorCode);
524 }
525
526 UChar32
parseCodePoint(const char * s,UErrorCode & errorCode)527 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
528 char *end;
529 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
530 if(end<=s || *end!=0 || value>=0x110000) {
531 fprintf(stderr,
532 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
533 s, (long)lineNumber);
534 errorCode=U_PARSE_ERROR;
535 return U_SENTINEL;
536 }
537 return (UChar32)value;
538 }
539
540 UBool
parseCodePointRange(const char * s,UChar32 & start,UChar32 & end,UErrorCode & errorCode)541 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
542 uint32_t st, e;
543 u_parseCodePointRange(s, &st, &e, &errorCode);
544 if(U_FAILURE(errorCode)) {
545 fprintf(stderr,
546 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
547 s, (long)lineNumber);
548 return FALSE;
549 }
550 start=(UChar32)st;
551 end=(UChar32)e;
552 return TRUE;
553 }
554
555 void
parseString(const char * s,UnicodeString & uni,UErrorCode & errorCode)556 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
557 UChar *buffer=toUCharPtr(uni.getBuffer(-1));
558 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
559 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
560 errorCode=U_ZERO_ERROR;
561 uni.releaseBuffer(0);
562 buffer=toUCharPtr(uni.getBuffer(length));
563 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
564 }
565 uni.releaseBuffer(length);
566 if(U_FAILURE(errorCode)) {
567 fprintf(stderr,
568 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
569 s, (long)lineNumber);
570 }
571 }
572
573 void
parseScriptExtensions(const char * s,UnicodeSet & scx,UErrorCode & errorCode)574 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
575 if(U_FAILURE(errorCode)) { return; }
576 scx.clear();
577 CharString scString;
578 for(;;) {
579 const char *scs;
580 const char *scLimit=strchr(s, ' ');
581 if(scLimit!=NULL) {
582 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
583 if(U_FAILURE(errorCode)) { return; }
584 } else {
585 scs=s;
586 }
587 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
588 if(script==UCHAR_INVALID_CODE) {
589 fprintf(stderr,
590 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
591 scs, (long)lineNumber);
592 errorCode=U_PARSE_ERROR;
593 return;
594 } else if(scx.contains(script)) {
595 fprintf(stderr,
596 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
597 scs, (long)lineNumber);
598 errorCode=U_PARSE_ERROR;
599 return;
600 } else {
601 scx.add(script);
602 }
603 if(scLimit!=NULL) {
604 s=scLimit+1;
605 } else {
606 break;
607 }
608 }
609 if(scx.isEmpty()) {
610 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
611 errorCode=U_PARSE_ERROR;
612 }
613 }
614
615 U_NAMESPACE_END
616