1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2011-2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ppucd.cpp
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2011dec11
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/uchar.h"
19 #include "charstr.h"
20 #include "cstring.h"
21 #include "ppucd.h"
22 #include "uassert.h"
23 #include "uparse.h"
24
25 #include <stdio.h>
26 #include <string.h>
27
28 U_NAMESPACE_BEGIN
29
~PropertyNames()30 PropertyNames::~PropertyNames() {}
31
// TODO: Create a concrete subclass for the default PropertyNames implementation
// using the ICU library built-in property names API & data.
// Currently only the genprops tool uses PreparsedUCD, and provides its own
// PropertyNames implementation using its just-built property names data and its own code.
// At some point, we should use PreparsedUCD in tests, and then we will need the
// default implementation somewhere.
#if 0
// Disabled default implementation: each lookup is forwarded to the
// corresponding ICU library property names function.
int32_t
PropertyNames::getPropertyEnum(const char *name) const {
    const int32_t propertyEnum=u_getPropertyEnum(name);
    return propertyEnum;
}

int32_t
PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
    const UProperty which=static_cast<UProperty>(property);
    return u_getPropertyValueEnum(which, name);
}
#endif
49
UniProps()50 UniProps::UniProps()
51 : start(U_SENTINEL), end(U_SENTINEL),
52 bmg(U_SENTINEL), bpb(U_SENTINEL),
53 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
54 digitValue(-1), numericValue(NULL),
55 name(NULL), nameAlias(NULL) {
56 memset(binProps, 0, sizeof(binProps));
57 memset(intProps, 0, sizeof(intProps));
58 memset(age, 0, 4);
59 }
60
~UniProps()61 UniProps::~UniProps() {}
62
63 const int32_t PreparsedUCD::kNumLineBuffers;
64
PreparsedUCD(const char * filename,UErrorCode & errorCode)65 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
66 : pnames(nullptr),
67 file(NULL),
68 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
69 lineNumber(0),
70 lineType(NO_LINE),
71 fieldLimit(NULL), lineLimit(NULL) {
72 if(U_FAILURE(errorCode)) { return; }
73
74 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
75 filename=NULL;
76 file=stdin;
77 } else {
78 file=fopen(filename, "r");
79 }
80 if(file==NULL) {
81 perror("error opening preparsed UCD");
82 fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
83 errorCode=U_FILE_ACCESS_ERROR;
84 return;
85 }
86
87 memset(ucdVersion, 0, 4);
88 lines[0][0]=0;
89 }
90
~PreparsedUCD()91 PreparsedUCD::~PreparsedUCD() {
92 if(file!=stdin) {
93 fclose(file);
94 }
95 }
96
// First-field keywords of the preparsed UCD lines, in the
// same order as the LineType values.
// The first two entries have no keyword.
static const char *const lineTypeStrings[]={
    nullptr,
    nullptr,
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "unassigned",
    "algnamesrange"
};
111
112 PreparsedUCD::LineType
readLine(UErrorCode & errorCode)113 PreparsedUCD::readLine(UErrorCode &errorCode) {
114 if(U_FAILURE(errorCode)) { return NO_LINE; }
115 // Select the next available line buffer.
116 while(!isLineBufferAvailable(lineIndex)) {
117 ++lineIndex;
118 if (lineIndex == kNumLineBuffers) {
119 lineIndex = 0;
120 }
121 }
122 char *line=lines[lineIndex];
123 *line=0;
124 lineLimit=fieldLimit=line;
125 lineType=NO_LINE;
126 char *result=fgets(line, sizeof(lines[0]), file);
127 if(result==NULL) {
128 if(ferror(file)) {
129 perror("error reading preparsed UCD");
130 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
131 errorCode=U_FILE_ACCESS_ERROR;
132 }
133 return NO_LINE;
134 }
135 ++lineNumber;
136 if(*line=='#') {
137 fieldLimit=strchr(line, 0);
138 return lineType=EMPTY_LINE;
139 }
140 // Remove trailing /r/n.
141 char c;
142 char *limit=strchr(line, 0);
143 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
144 // Remove trailing white space.
145 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
146 *limit=0;
147 lineLimit=limit;
148 if(line==limit) {
149 fieldLimit=limit;
150 return lineType=EMPTY_LINE;
151 }
152 // Split by ';'.
153 char *semi=line;
154 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
155 fieldLimit=strchr(line, 0);
156 // Determine the line type.
157 int32_t type;
158 for(type=EMPTY_LINE+1;; ++type) {
159 if(type==LINE_TYPE_COUNT) {
160 fprintf(stderr,
161 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
162 line, (long)lineNumber);
163 errorCode=U_PARSE_ERROR;
164 return NO_LINE;
165 }
166 if(0==strcmp(line, lineTypeStrings[type])) {
167 break;
168 }
169 }
170 lineType=(LineType)type;
171 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
172 u_versionFromString(ucdVersion, fieldLimit+1);
173 }
174 return lineType;
175 }
176
177 const char *
firstField()178 PreparsedUCD::firstField() {
179 char *field=lines[lineIndex];
180 fieldLimit=strchr(field, 0);
181 return field;
182 }
183
184 const char *
nextField()185 PreparsedUCD::nextField() {
186 if(fieldLimit==lineLimit) { return NULL; }
187 char *field=fieldLimit+1;
188 fieldLimit=strchr(field, 0);
189 return field;
190 }
191
192 const UniProps *
getProps(UnicodeSet & newValues,UErrorCode & errorCode)193 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
194 if(U_FAILURE(errorCode)) { return NULL; }
195 newValues.clear();
196 if(!lineHasPropertyValues()) {
197 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
198 return NULL;
199 }
200 firstField();
201 const char *field=nextField();
202 if(field==NULL) {
203 // No range field after the type.
204 fprintf(stderr,
205 "error in preparsed UCD: missing default/block/cp range field "
206 "(no second field) on line %ld\n",
207 (long)lineNumber);
208 errorCode=U_PARSE_ERROR;
209 return NULL;
210 }
211 UChar32 start, end;
212 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
213 UniProps *props;
214 UBool insideBlock=false; // true if cp or unassigned range inside the block range.
215 switch(lineType) {
216 case DEFAULTS_LINE:
217 // Should occur before any block/cp/unassigned line.
218 if(blockLineIndex>=0) {
219 fprintf(stderr,
220 "error in preparsed UCD: default line %ld after one or more block lines\n",
221 (long)lineNumber);
222 errorCode=U_PARSE_ERROR;
223 return NULL;
224 }
225 if(defaultLineIndex>=0) {
226 fprintf(stderr,
227 "error in preparsed UCD: second line with default properties on line %ld\n",
228 (long)lineNumber);
229 errorCode=U_PARSE_ERROR;
230 return NULL;
231 }
232 if(start!=0 || end!=0x10ffff) {
233 fprintf(stderr,
234 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
235 field, (long)lineNumber);
236 errorCode=U_PARSE_ERROR;
237 return NULL;
238 }
239 props=&defaultProps;
240 defaultLineIndex=lineIndex;
241 break;
242 case BLOCK_LINE:
243 blockProps=defaultProps; // Block inherits default properties.
244 props=&blockProps;
245 blockLineIndex=lineIndex;
246 break;
247 case CP_LINE:
248 case UNASSIGNED_LINE:
249 if(blockProps.start<=start && end<=blockProps.end) {
250 insideBlock=true;
251 if(lineType==CP_LINE) {
252 // Code point range fully inside the last block inherits the block properties.
253 cpProps=blockProps;
254 } else {
255 // Unassigned line inside the block is based on default properties
256 // which override block properties.
257 cpProps=defaultProps;
258 newValues=blockValues;
259 // Except, it inherits the one blk=Block property.
260 int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
261 cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
262 newValues.remove((UChar32)UCHAR_BLOCK);
263 }
264 } else if(start>blockProps.end || end<blockProps.start) {
265 // Code point range fully outside the last block inherits the default properties.
266 cpProps=defaultProps;
267 } else {
268 // Code point range partially overlapping with the last block is illegal.
269 fprintf(stderr,
270 "error in preparsed UCD: cp range %s on line %ld only "
271 "partially overlaps with block range %04lX..%04lX\n",
272 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
273 errorCode=U_PARSE_ERROR;
274 return NULL;
275 }
276 props=&cpProps;
277 break;
278 default:
279 // Will not occur because of the range check above.
280 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
281 return NULL;
282 }
283 props->start=start;
284 props->end=end;
285 while((field=nextField())!=NULL) {
286 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
287 }
288 if(lineType==BLOCK_LINE) {
289 blockValues=newValues;
290 } else if(lineType==UNASSIGNED_LINE && insideBlock) {
291 // Unset newValues for values that are the same as the block values.
292 for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
293 if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
294 newValues.remove(prop);
295 }
296 }
297 for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
298 int32_t index=prop-UCHAR_INT_START;
299 if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
300 newValues.remove(prop);
301 }
302 }
303 }
304 return props;
305 }
306
307 static const struct {
308 const char *name;
309 int32_t prop;
310 } ppucdProperties[]={
311 { "Name_Alias", PPUCD_NAME_ALIAS },
312 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
313 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
314 };
315
316 // Returns true for "ok to continue parsing fields".
317 UBool
parseProperty(UniProps & props,const char * field,UnicodeSet & newValues,UErrorCode & errorCode)318 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
319 UErrorCode &errorCode) {
320 CharString pBuffer;
321 const char *p=field;
322 const char *v=strchr(p, '=');
323 int binaryValue;
324 if(*p=='-') {
325 if(v!=NULL) {
326 fprintf(stderr,
327 "error in preparsed UCD: mix of binary-property-no and "
328 "enum-property syntax '%s' on line %ld\n",
329 field, (long)lineNumber);
330 errorCode=U_PARSE_ERROR;
331 return false;
332 }
333 binaryValue=0;
334 ++p;
335 } else if(v==NULL) {
336 binaryValue=1;
337 } else {
338 binaryValue=-1;
339 // Copy out the property name rather than modifying the field (writing a NUL).
340 pBuffer.append(p, (int32_t)(v-p), errorCode);
341 p=pBuffer.data();
342 ++v;
343 }
344 int32_t prop=pnames->getPropertyEnum(p);
345 if(prop<0) {
346 for(int32_t i=0;; ++i) {
347 if(i==UPRV_LENGTHOF(ppucdProperties)) {
348 // Ignore unknown property names.
349 return true;
350 }
351 if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
352 prop=ppucdProperties[i].prop;
353 U_ASSERT(prop>=0);
354 break;
355 }
356 }
357 }
358 if(prop<UCHAR_BINARY_LIMIT) {
359 if(binaryValue>=0) {
360 props.binProps[prop]=(UBool)binaryValue;
361 } else {
362 // No binary value for a binary property.
363 fprintf(stderr,
364 "error in preparsed UCD: enum-property syntax '%s' "
365 "for binary property on line %ld\n",
366 field, (long)lineNumber);
367 errorCode=U_PARSE_ERROR;
368 }
369 } else if(binaryValue>=0) {
370 // Binary value for a non-binary property.
371 fprintf(stderr,
372 "error in preparsed UCD: binary-property syntax '%s' "
373 "for non-binary property on line %ld\n",
374 field, (long)lineNumber);
375 errorCode=U_PARSE_ERROR;
376 } else if (prop < UCHAR_INT_START) {
377 fprintf(stderr,
378 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
379 prop, (long)lineNumber);
380 errorCode=U_PARSE_ERROR;
381 } else if(prop<UCHAR_INT_LIMIT) {
382 int32_t value=pnames->getPropertyValueEnum(prop, v);
383 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
384 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
385 char *end;
386 unsigned long ccc=uprv_strtoul(v, &end, 10);
387 if(v<end && *end==0 && ccc<=254) {
388 value=(int32_t)ccc;
389 }
390 }
391 if(value==UCHAR_INVALID_CODE) {
392 fprintf(stderr,
393 "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
394 field, (long)lineNumber);
395 errorCode=U_PARSE_ERROR;
396 } else {
397 props.intProps[prop-UCHAR_INT_START]=value;
398 }
399 } else if(*v=='<') {
400 // Do not parse default values like <code point>, just set null values.
401 switch(prop) {
402 case UCHAR_BIDI_MIRRORING_GLYPH:
403 props.bmg=U_SENTINEL;
404 break;
405 case UCHAR_BIDI_PAIRED_BRACKET:
406 props.bpb=U_SENTINEL;
407 break;
408 case UCHAR_SIMPLE_CASE_FOLDING:
409 props.scf=U_SENTINEL;
410 break;
411 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
412 props.slc=U_SENTINEL;
413 break;
414 case UCHAR_SIMPLE_TITLECASE_MAPPING:
415 props.stc=U_SENTINEL;
416 break;
417 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
418 props.suc=U_SENTINEL;
419 break;
420 case UCHAR_CASE_FOLDING:
421 props.cf.remove();
422 break;
423 case UCHAR_LOWERCASE_MAPPING:
424 props.lc.remove();
425 break;
426 case UCHAR_TITLECASE_MAPPING:
427 props.tc.remove();
428 break;
429 case UCHAR_UPPERCASE_MAPPING:
430 props.uc.remove();
431 break;
432 case UCHAR_SCRIPT_EXTENSIONS:
433 props.scx.clear();
434 break;
435 default:
436 fprintf(stderr,
437 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
438 field, (long)lineNumber);
439 errorCode=U_PARSE_ERROR;
440 }
441 } else {
442 char c;
443 switch(prop) {
444 case UCHAR_NUMERIC_VALUE:
445 props.numericValue=v;
446 c=*v;
447 if('0'<=c && c<='9' && v[1]==0) {
448 props.digitValue=c-'0';
449 } else {
450 props.digitValue=-1;
451 }
452 break;
453 case UCHAR_NAME:
454 props.name=v;
455 break;
456 case UCHAR_AGE:
457 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
458 break;
459 case UCHAR_BIDI_MIRRORING_GLYPH:
460 props.bmg=parseCodePoint(v, errorCode);
461 break;
462 case UCHAR_BIDI_PAIRED_BRACKET:
463 props.bpb=parseCodePoint(v, errorCode);
464 break;
465 case UCHAR_SIMPLE_CASE_FOLDING:
466 props.scf=parseCodePoint(v, errorCode);
467 break;
468 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
469 props.slc=parseCodePoint(v, errorCode);
470 break;
471 case UCHAR_SIMPLE_TITLECASE_MAPPING:
472 props.stc=parseCodePoint(v, errorCode);
473 break;
474 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
475 props.suc=parseCodePoint(v, errorCode);
476 break;
477 case UCHAR_CASE_FOLDING:
478 parseString(v, props.cf, errorCode);
479 break;
480 case UCHAR_LOWERCASE_MAPPING:
481 parseString(v, props.lc, errorCode);
482 break;
483 case UCHAR_TITLECASE_MAPPING:
484 parseString(v, props.tc, errorCode);
485 break;
486 case UCHAR_UPPERCASE_MAPPING:
487 parseString(v, props.uc, errorCode);
488 break;
489 case PPUCD_NAME_ALIAS:
490 props.nameAlias=v;
491 break;
492 case PPUCD_CONDITIONAL_CASE_MAPPINGS:
493 case PPUCD_TURKIC_CASE_FOLDING:
494 // No need to parse their values: They are hardcoded in the runtime library.
495 break;
496 case UCHAR_SCRIPT_EXTENSIONS:
497 parseScriptExtensions(v, props.scx, errorCode);
498 break;
499 default:
500 // Ignore unhandled properties.
501 return true;
502 }
503 }
504 if(U_SUCCESS(errorCode)) {
505 newValues.add((UChar32)prop);
506 return true;
507 } else {
508 return false;
509 }
510 }
511
512 UBool
getRangeForAlgNames(UChar32 & start,UChar32 & end,UErrorCode & errorCode)513 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
514 if(U_FAILURE(errorCode)) { return false; }
515 if(lineType!=ALG_NAMES_RANGE_LINE) {
516 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
517 return false;
518 }
519 firstField();
520 const char *field=nextField();
521 if(field==NULL) {
522 // No range field after the type.
523 fprintf(stderr,
524 "error in preparsed UCD: missing algnamesrange range field "
525 "(no second field) on line %ld\n",
526 (long)lineNumber);
527 errorCode=U_PARSE_ERROR;
528 return false;
529 }
530 return parseCodePointRange(field, start, end, errorCode);
531 }
532
533 UChar32
parseCodePoint(const char * s,UErrorCode & errorCode)534 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
535 char *end;
536 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
537 if(end<=s || *end!=0 || value>=0x110000) {
538 fprintf(stderr,
539 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
540 s, (long)lineNumber);
541 errorCode=U_PARSE_ERROR;
542 return U_SENTINEL;
543 }
544 return (UChar32)value;
545 }
546
547 UBool
parseCodePointRange(const char * s,UChar32 & start,UChar32 & end,UErrorCode & errorCode)548 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
549 uint32_t st, e;
550 u_parseCodePointRange(s, &st, &e, &errorCode);
551 if(U_FAILURE(errorCode)) {
552 fprintf(stderr,
553 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
554 s, (long)lineNumber);
555 return false;
556 }
557 start=(UChar32)st;
558 end=(UChar32)e;
559 return true;
560 }
561
562 void
parseString(const char * s,UnicodeString & uni,UErrorCode & errorCode)563 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
564 UChar *buffer=toUCharPtr(uni.getBuffer(-1));
565 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
566 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
567 errorCode=U_ZERO_ERROR;
568 uni.releaseBuffer(0);
569 buffer=toUCharPtr(uni.getBuffer(length));
570 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
571 }
572 uni.releaseBuffer(length);
573 if(U_FAILURE(errorCode)) {
574 fprintf(stderr,
575 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
576 s, (long)lineNumber);
577 }
578 }
579
580 void
parseScriptExtensions(const char * s,UnicodeSet & scx,UErrorCode & errorCode)581 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
582 if(U_FAILURE(errorCode)) { return; }
583 scx.clear();
584 CharString scString;
585 for(;;) {
586 const char *scs;
587 const char *scLimit=strchr(s, ' ');
588 if(scLimit!=NULL) {
589 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
590 if(U_FAILURE(errorCode)) { return; }
591 } else {
592 scs=s;
593 }
594 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
595 if(script==UCHAR_INVALID_CODE) {
596 fprintf(stderr,
597 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
598 scs, (long)lineNumber);
599 errorCode=U_PARSE_ERROR;
600 return;
601 } else if(scx.contains(script)) {
602 fprintf(stderr,
603 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
604 scs, (long)lineNumber);
605 errorCode=U_PARSE_ERROR;
606 return;
607 } else {
608 scx.add(script);
609 }
610 if(scLimit!=NULL) {
611 s=scLimit+1;
612 } else {
613 break;
614 }
615 }
616 if(scx.isEmpty()) {
617 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
618 errorCode=U_PARSE_ERROR;
619 }
620 }
621
622 U_NAMESPACE_END
623