1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2004-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: casepropsbuilder.cpp (was gencase/store.c)
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2004aug28
16 * created by: Markus W. Scherer
17 *
18 * Store Unicode case mapping properties efficiently for
19 * random access.
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include "unicode/utypes.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/uchar.h"
27 #include "unicode/udata.h"
28 #include "unicode/uniset.h"
29 #include "unicode/usetiter.h"
30 #include "unicode/ustring.h"
31 #include "cmemory.h"
32 #include "cstring.h"
33 #include "genprops.h"
34 #include "ppucd.h"
35 #include "uassert.h"
36 #include "uarrsort.h"
37 #include "ucase.h"
38 #include "unewdata.h"
39 #include "utrie2.h"
40 #include "writesrc.h"
41
42 /* Unicode case mapping properties file format ---------------------------------
43
44 The file format prepared and written here contains several data
45 structures that store indexes or data.
46
47 Before the data contents described below, there are the headers required by
48 the udata API for loading ICU data. Especially, a UDataInfo structure
49 precedes the actual data. It contains platform properties values and the
50 file format version.
51
The following is a description of format version 4.0.
53
54 Format version 1.1 adds data for case closure.
55
56 Format version 1.2 adds an exception bit for case-ignorable. Needed because
57 the Cased and Case_Ignorable properties are not disjoint.
58
59 Format version 2.0 changes from UTrie to UTrie2.
60
61 Format version 3.0 (ICU 49) shuffles the trie bits to simplify some builder and runtime code.
62 It moves the Case_Ignorable flag from sometimes-trie-bit 6, sometimes-exception-bit 11
63 to always-trie-bit 2 and adjusts the higher trie bits accordingly.
64 Exception index reduced from 12 bits to 11, simple case mapping delta reduced from 10 bits to 9.
65
66 Format version 4.0 (ICU 62) swaps trie data bits 3 and 4, exception vs. case-sensitive,
67 and when exception=1 then data bits 15..4 (not 15..5) are used for the exception index,
and the case-sensitive bit is moved into the excWord. This allows for more exception words.
69 Also, an additional optional exception slot is used for a 16-bit delta,
70 with one more excWord bit if the delta is actually negative,
71 for a reasonably compact, and compressible, encoding of simple case mappings
72 between distant blocks for Cherokee, Georgian, and similar.
73 Another excWord bit is used to indicate that the character has no simple case folding,
74 even if it has a simple lowercase mapping.
75
76 The file contains the following structures:
77
78 const int32_t indexes[i0] with values i0, i1, ...:
79 (see UCASE_IX_... constants for names of indexes)
80
81 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
82 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
83 i2 trieSize; -- size in bytes of the case mapping properties trie
84 i3 exceptionsLength; -- length in uint16_t of the exceptions array
85 i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
86
87 i5..i14 reservedIndexes; -- reserved values; 0 for now
88
89 i15 maxFullLength; -- maximum length of a full case mapping/folding string
90
91
92 Serialized trie, see utrie2.h;
93
94 const uint16_t exceptions[exceptionsLength];
95
96 const UChar unfold[unfoldLength];
97
98
99 Trie data word:
100 Bits
101 if(exception) {
102 15..4 unsigned exception index
103 } else {
104 if(not uncased) {
105 15..7 signed delta to simple case mapping code point
106 (add delta to input code point)
107 } else {
108 15..7 reserved, 0
109 }
110 6..5 0 normal character with cc=0
111 1 soft-dotted character
112 2 cc=230
113 3 other cc
114 The runtime code relies on these two bits to be adjacent with this encoding.
115 }
116 4 case-sensitive
117 3 exception
118 2 case-ignorable
119 1..0 0 uncased
120 1 lowercase
121 2 uppercase
122 3 titlecase
123 The runtime code relies on the case-ignorable and case type bits 2..0
124 to be the lowest bits with this encoding.
125
126
127 Exceptions:
128 A sub-array of the exceptions array is indexed by the exception index in a
129 trie word.
130 The sub-array consists of the following fields:
131 uint16_t excWord;
132 uint16_t optional values [];
133 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
134
135 excWord: (see UCASE_EXC_...)
136 Bits
137 15 conditional case folding
138 14 conditional special casing
139 13..12 same as non-exception trie data bits 6..5
140 moved here because the exception index needs more bits than the delta
141 0 normal character with cc=0
142 1 soft-dotted character
143 2 cc=230
144 3 other cc
145 11 same as non-exception case-sensitive bit
146 10 the delta in the optional value slot is negative
147 9 no simple case folding, even if there is a simple lowercase mapping
148 8 if set, then for each optional-value slot there are 2 uint16_t values
149 (high and low parts of 32-bit values)
150 instead of single ones
151 7.. 0 bits for which optional value is present
152
153 Optional-value slots:
154 0 lowercase mapping (code point)
155 1 case folding (code point)
156 2 uppercase mapping (code point)
157 3 titlecase mapping (code point)
158 4 delta to simple case mapping code point
159 (add delta to input code point, or subtract if excWord bit 10 is set)
160 5 reserved
161 6 closure mappings (new in format version 1.1)
162 7 there is at least one full (string) case mapping
163 the length of each is encoded in a nibble of this optional value,
164 and the strings follow this optional value in the same order:
165 lower/fold/upper/title
166
167 The optional closure mappings value is used as follows:
168 Bits 0..3 contain the length of a string of code points for case closure.
169 The string immediately follows the full case mappings, or the closure value
170 slot if there are no full case mappings.
171 Bits 4..15 are reserved and could be used in the future to indicate the
172 number of strings for case closure.
173 Complete case closure for a code point is given by the union of all simple
174 and full case mappings and foldings, plus the case closure code points
175 (and potentially, in the future, case closure strings).
176
177 For space saving, some values are not stored. Lookups are as follows:
178 - If special casing is conditional, then no full lower/upper/title mapping
179 strings are stored.
180 - If case folding is conditional, then no simple or full case foldings are
181 stored.
182 - Fall back in this order:
183 full (string) mapping -- if full mappings are used
184 simple (code point) mapping of the same type
185 simple fold->simple lower
186 simple title->simple upper
187 finally, the original code point (no mapping)
188
189 This fallback order is strict:
190 In particular, the fallback from full case folding is to simple case folding,
191 not to full lowercase mapping.
192
193 Reverse case folding data ("unfold") array: (new in format version 1.1)
194
195 This array stores some miscellaneous values followed by a table. The data maps
196 back from multi-character strings to their original code points, for use
197 in case closure.
198
199 The table contains two columns of strings.
200 The string in the first column is the case folding of each of the code points
201 in the second column. The strings are terminated with NUL or by the end of the
202 column, whichever comes first.
203
204 The miscellaneous data takes up one pseudo-row and includes:
205 - number of rows
206 - number of UChars per row
207 - number of UChars in the left (folding string) column
208
209 The table is sorted by its first column. Values in the first column are unique.
210
211 ----------------------------------------------------------------------------- */
212
213 U_NAMESPACE_USE
214
/*
 * UDataInfo cf. udata.h
 *
 * Header structure that precedes the case mapping data in the output.
 * The initializers are positional. The dataVersion given here is only a
 * default: setUnicodeVersion() overwrites it.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),  /* size */
    0,                  /* reservedWord */

    U_IS_BIG_ENDIAN,    /* isBigEndian */
    U_CHARSET_FAMILY,   /* charsetFamily */
    U_SIZEOF_UCHAR,     /* sizeofUChar */
    0,                  /* reservedByte */

    /* dataFormat="cAsE" */
    { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
    { 4, 0, 0, 0 },     /* formatVersion */
    { 11, 0, 0, 0 }     /* dataVersion */
};
230
/*
 * Builder-internal trie value layout: while data is being collected,
 * a trie value with UCASE_EXCEPTION set carries the index into excProps[]
 * in the bits selected by UGENCASE_EXC_MASK (see setProps() and makeExcProps()).
 */
#define UGENCASE_EXC_SHIFT 20
#define UGENCASE_EXC_MASK 0xfff00000

enum {
    /* number of distinct indexes that fit into UGENCASE_EXC_MASK; also the capacity of excProps[] */
    MAX_EXC_COUNT=(UGENCASE_EXC_MASK>>UGENCASE_EXC_SHIFT)+1
};
237
238 struct ExcProps {
ExcPropsExcProps239 ExcProps() :
240 delta(0), hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE),
241 hasNoSimpleCaseFolding(FALSE) {}
ExcPropsExcProps242 ExcProps(const UniProps &otherProps) :
243 props(otherProps),
244 delta(0), hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE),
245 hasNoSimpleCaseFolding(FALSE) {}
246
247 UniProps props;
248 UnicodeSet closure;
249 int32_t delta;
250 UBool hasConditionalCaseMappings;
251 UBool hasTurkicCaseFolding;
252 UBool hasNoSimpleCaseFolding;
253 };
254
/*
 * Values for the ucase.icu unfold[] data array.
 * The values are stored in ucase.icu so that the runtime code will work with
 * changing values, but they are hardcoded here for simplicity.
 * They are optimized, that is, provide for minimal table column widths,
 * for the actual Unicode data, so that the table size is minimized.
 * Future versions of Unicode may require increases of some of these values.
 */
enum {
    /* number of UChars in the left (case folding string) column of each row */
    UGENCASE_UNFOLD_STRING_WIDTH=3,
    /* number of UChars in the right (code points) column of each row */
    UGENCASE_UNFOLD_CP_WIDTH=2,
    /* total number of UChars per row */
    UGENCASE_UNFOLD_WIDTH=UGENCASE_UNFOLD_STRING_WIDTH+UGENCASE_UNFOLD_CP_WIDTH
};
268
/*
 * PropsBuilder that collects the case mapping properties of code points
 * in a UTrie2 plus side tables (exceptions, closure, "unfold" data),
 * following the file format described at the top of this file.
 */
class CasePropsBuilder : public PropsBuilder {
public:
    /* Sets up relevantProps, the unfold[] meta data row, the trie and the excProps[] array. */
    CasePropsBuilder(UErrorCode &errorCode);
    virtual ~CasePropsBuilder();

    /* Copies the version into dataInfo.dataVersion. */
    virtual void setUnicodeVersion(const UVersionInfo version);
    /* Records the case properties for the range props.start..props.end. */
    virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
    virtual void build(UErrorCode &errorCode);
    virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
    virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);

private:
    /* Turns a non-exception trie value for c into one that refers to a new excProps[] entry. */
    uint32_t makeExcProps(UChar32 c, uint32_t value, UErrorCode &errorCode);
    /* Appends one row (folding string s, code point c) to the unfold data. */
    void addUnfolding(UChar32 c, const UnicodeString &s, UErrorCode &errorCode);
    /* Sorts the unfold rows and merges rows with the same folding string. */
    void makeUnfoldData(UErrorCode &errorCode);
    /* Adds dest to the closure set of src, creating an exception for src if needed. */
    void addClosureMapping(UChar32 src, UChar32 dest, UErrorCode &errorCode);
    /* Recursive helper for makeCaseClosure(); returns TRUE if a closure mapping was added. */
    UBool addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value,
                     UErrorCode &errorCode);
    /* Adds closure mappings until no more are found. */
    void makeCaseClosure(UErrorCode &errorCode);
    int32_t makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorCode &errorCode);
    void makeExceptions(UErrorCode &errorCode);

    /* the properties that this builder encodes; setProps() ignores updates that touch none of them */
    UnicodeSet relevantProps;
    /*
     * Unicode set collecting the case-sensitive characters;
     * see uchar.h UCHAR_CASE_SENSITIVE.
     * Add code points from case mappings/foldings in
     * the root locale and with default options.
     */
    UnicodeSet caseSensitive;
    /* reverse case folding ("unfold") data */
    UnicodeString unfold;
    /* presumably the serialized exceptions words; only its length is read in the code visible here */
    UnicodeString exceptions;
    /* owned array with room for MAX_EXC_COUNT pointers; the first excPropsCount entries are set */
    ExcProps **excProps;
    int32_t excPropsCount;
    /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
    int32_t maxFullLength;
    /* opened in the constructor, closed in the destructor */
    UTrie2 *pTrie;
};
308
CasePropsBuilder(UErrorCode & errorCode)309 CasePropsBuilder::CasePropsBuilder(UErrorCode &errorCode)
310 : excProps(NULL), excPropsCount(0), maxFullLength(U16_MAX_LENGTH), pTrie(NULL) {
311 // This builder encodes the following properties.
312 relevantProps.
313 add(UCHAR_CANONICAL_COMBINING_CLASS). // 0 vs. 230 vs. other
314 add(UCHAR_SOFT_DOTTED).
315 add(UCHAR_LOWERCASE).
316 add(UCHAR_UPPERCASE).
317 add(UCHAR_CASE_IGNORABLE).
318 add(UCHAR_SIMPLE_CASE_FOLDING).
319 add(UCHAR_SIMPLE_LOWERCASE_MAPPING).
320 add(UCHAR_SIMPLE_TITLECASE_MAPPING).
321 add(UCHAR_SIMPLE_UPPERCASE_MAPPING).
322 add(UCHAR_CASE_FOLDING).
323 add(UCHAR_LOWERCASE_MAPPING).
324 add(UCHAR_TITLECASE_MAPPING).
325 add(UCHAR_UPPERCASE_MAPPING).
326 add(PPUCD_CONDITIONAL_CASE_MAPPINGS).
327 add(PPUCD_TURKIC_CASE_FOLDING);
328 // Write "unfold" meta data into the first row. Must be UGENCASE_UNFOLD_WIDTH UChars.
329 unfold.
330 append(0).
331 append((UChar)UGENCASE_UNFOLD_WIDTH).
332 append((UChar)UGENCASE_UNFOLD_STRING_WIDTH).
333 append(0).
334 append(0);
335 U_ASSERT(unfold.length()==UGENCASE_UNFOLD_WIDTH);
336 pTrie=utrie2_open(0, 0, &errorCode);
337 if(U_FAILURE(errorCode)) {
338 fprintf(stderr, "genprops error: casepropsbuilder utrie2_open() failed - %s\n",
339 u_errorName(errorCode));
340 return;
341 }
342 excProps=new ExcProps *[MAX_EXC_COUNT];
343 if(excProps==NULL) {
344 fprintf(stderr,
345 "genprops error: casepropsbuilder out of memory allocating "
346 "the array of exceptions properties\n");
347 errorCode=U_MEMORY_ALLOCATION_ERROR;
348 }
349 }
350
~CasePropsBuilder()351 CasePropsBuilder::~CasePropsBuilder() {
352 utrie2_close(pTrie);
353 for(int32_t i=0; i<excPropsCount; ++i) {
354 delete excProps[i];
355 }
356 delete[] excProps;
357 }
358
359 void
setUnicodeVersion(const UVersionInfo version)360 CasePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
361 uprv_memcpy(dataInfo.dataVersion, version, 4);
362 }
363
364 /* -------------------------------------------------------------------------- */
365
366 void
addUnfolding(UChar32 c,const UnicodeString & s,UErrorCode & errorCode)367 CasePropsBuilder::addUnfolding(UChar32 c, const UnicodeString &s, UErrorCode &errorCode) {
368 if(U_FAILURE(errorCode)) { return; }
369
370 int32_t length=s.length();
371 if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
372 fprintf(stderr, "genprops error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
373 (long)length, UGENCASE_UNFOLD_STRING_WIDTH);
374 errorCode=U_INTERNAL_PROGRAM_ERROR;
375 }
376 unfold.append(s);
377 while(length<UGENCASE_UNFOLD_STRING_WIDTH) {
378 unfold.append(0);
379 ++length;
380 }
381
382 unfold.append(c);
383 if(U16_LENGTH(c)<UGENCASE_UNFOLD_CP_WIDTH) {
384 unfold.append(0);
385 }
386
387 U_ASSERT((unfold.length()%UGENCASE_UNFOLD_WIDTH)==0);
388 }
389
390 /* store a character's properties ------------------------------------------- */
391
/*
 * Record the case properties for the code point range props.start..props.end.
 *
 * Builds the trie value (case type, flags, and either a simple mapping delta
 * or an index into excProps[]), stores it in the trie,
 * and updates caseSensitive, maxFullLength and the "unfold" data.
 * Does nothing if none of the changed properties (newValues) is relevant here.
 *
 * Sets errorCode and prints a message to stderr if
 * - a soft-dotted character has a non-zero combining class,
 * - a multi-code point range has mappings or needs an exception,
 * - there are too many exceptions or memory runs out,
 * - the trie cannot be updated.
 */
void
CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }

    UChar32 start=props.start;
    UChar32 end=props.end;

    /* default: map to self */
    int32_t delta=0;
    /* set to TRUE when the mappings cannot be expressed as one delta */
    UBool noDelta=FALSE;

    /* the case type goes into the lowest bits of the trie value */
    uint32_t type;
    if(props.binProps[UCHAR_LOWERCASE]) {
        type=UCASE_LOWER;
    } else if(props.binProps[UCHAR_UPPERCASE]) {
        type=UCASE_UPPER;
    } else if(props.getIntProp(UCHAR_GENERAL_CATEGORY)==U_TITLECASE_LETTER) {
        type=UCASE_TITLE;
    } else {
        type=UCASE_NONE;
    }
    uint32_t value=type;

    // Examine simple case mappings.
    // A negative mapping value means that there is no such mapping.
    UBool hasMapping=FALSE;
    if(props.suc>=0) {
        /* uppercase mapping as delta if the character is lowercase */
        hasMapping=TRUE;
        if(type==UCASE_LOWER) {
            delta=props.suc-start;
        } else {
            noDelta=TRUE;
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.slc>=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        hasMapping=TRUE;
        if(type>=UCASE_UPPER) {
            delta=props.slc-start;
        } else {
            noDelta=TRUE;
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.stc>=0) {
        hasMapping=TRUE;
    }
    /* a titlecase mapping that differs from the uppercase mapping needs its own slot */
    if(props.suc!=props.stc) {
        noDelta=TRUE;
        value|=UCASE_EXCEPTION;
    }

    // Simple case folding falls back to simple lowercasing.
    // If they differ, then store them separately.
    UChar32 scf=props.scf;
    if(scf>=0 && scf!=props.slc) {
        hasMapping=noDelta=TRUE;
        value|=UCASE_EXCEPTION;
    }

    // If there is no case folding but there is a lowercase mapping,
    // then set a bit for that.
    // For example: Cherokee uppercase syllables since Unicode 8.
    // (Full case folding falls back to simple case folding,
    // not to full lowercasing, so we need not also handle it specially
    // for such cases.)
    UBool hasNoSimpleCaseFolding=FALSE;
    if(scf<0 && props.slc>=0) {
        hasNoSimpleCaseFolding=TRUE;
        value|=UCASE_EXCEPTION;
    }

    if(noDelta) {
        delta=0;
    } else if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        // The case mapping delta is too big for the main data word.
        // Store it in an exceptions slot.
        value|=UCASE_EXCEPTION;
    }

    // Examine full case mappings.
    if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() ||
        newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }
    // A full case folding matters only if it differs from the simple case folding.
    if( (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
        newValues.contains(PPUCD_TURKIC_CASE_FOLDING)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }

    /* soft-dotted and combining class share two bits, see the format description */
    if(props.binProps[UCHAR_SOFT_DOTTED]) {
        value|=UCASE_SOFT_DOTTED;
    }
    int32_t cc=props.getIntProp(UCHAR_CANONICAL_COMBINING_CLASS);
    if(cc!=0) {
        if(props.binProps[UCHAR_SOFT_DOTTED]) {
            fprintf(stderr, "genprops error: a soft-dotted character has ccc!=0\n");
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if(cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }

    if(props.binProps[UCHAR_CASE_IGNORABLE]) {
        value|=UCASE_IGNORABLE;
    }

    /* mappings and exceptions are per code point; they cannot be shared by a range */
    if((hasMapping || (value&UCASE_EXCEPTION)) && start!=end) {
        fprintf(stderr,
                "genprops error: range %04lX..%04lX has case mappings "
                "or reasons for data structure exceptions\n",
                (long)start, (long)end);
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* simply store exceptions for later processing and encoding */
        if(excPropsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return;
        }
        ExcProps *newExcProps=new ExcProps(props);
        if(newExcProps==NULL) {
            fprintf(stderr,
                    "genprops error: casepropsbuilder out of memory allocating "
                    "exceptions properties\n");
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        newExcProps->props.scf=scf;
        newExcProps->delta=delta;
        newExcProps->hasConditionalCaseMappings=
            newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS) ||
            // See ICU-13416: և ligature ech-yiwn has language-specific
            // uppercase and titlecase mappings.
            start==0x0587;
        newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING);
        newExcProps->hasNoSimpleCaseFolding=hasNoSimpleCaseFolding;
        /* for now, the trie value carries the index into excProps[] in its top bits */
        value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
        excProps[excPropsCount++]=newExcProps;
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to set case mapping values: %s\n",
                u_errorName(errorCode));
        return;
    }

    if(hasMapping) {
        /* update the case-sensitive set */
        caseSensitive.add(start);
        if(scf>=0) { caseSensitive.add(scf); }
        if(props.slc>=0) { caseSensitive.add(props.slc); }
        if(props.suc>=0) { caseSensitive.add(props.suc); }
        if(props.stc>=0) { caseSensitive.add(props.stc); }
        caseSensitive.addAll(props.cf);
        caseSensitive.addAll(props.lc);
        caseSensitive.addAll(props.uc);
        caseSensitive.addAll(props.tc);

        /* update maxFullLength */
        if(props.cf.length()>maxFullLength) { maxFullLength=props.cf.length(); }
        if(props.lc.length()>maxFullLength) { maxFullLength=props.lc.length(); }
        if(props.uc.length()>maxFullLength) { maxFullLength=props.uc.length(); }
        if(props.tc.length()>maxFullLength) { maxFullLength=props.tc.length(); }
    }

    /* add the multi-character case folding to the "unfold" data */
    if(props.cf.hasMoreChar32Than(0, 0x7fffffff, 1)) {
        addUnfolding(start, props.cf, errorCode);
    }
}
581
582 uint32_t
makeExcProps(UChar32 c,uint32_t value,UErrorCode & errorCode)583 CasePropsBuilder::makeExcProps(UChar32 c, uint32_t value, UErrorCode &errorCode) {
584 if(U_FAILURE(errorCode)) { return 0; }
585 if(excPropsCount==MAX_EXC_COUNT) {
586 fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
587 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
588 return 0;
589 }
590 LocalPointer<ExcProps> newExcProps(new ExcProps);
591 if(newExcProps==NULL) {
592 fprintf(stderr,
593 "genprops error: casepropsbuilder out of memory allocating "
594 "exceptions properties\n");
595 errorCode=U_MEMORY_ALLOCATION_ERROR;
596 return 0;
597 }
598
599 if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
600 // Decode the simple case mapping.
601 UChar32 next=c+UCASE_GET_DELTA(value);
602 if(next!=c) {
603 UniProps &p=newExcProps->props;
604 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
605 p.suc=p.stc=next;
606 } else {
607 p.slc=next;
608 }
609 }
610 }
611
612 value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); // remove previous simple mapping
613 value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
614 value|=UCASE_EXCEPTION;
615 excProps[excPropsCount++]=newExcProps.orphan();
616 return value;
617 }
618
619 /* finalize reverse case folding ("unfold") data ---------------------------- */
620
621 static int32_t U_CALLCONV
compareUnfold(const void * context,const void * left,const void * right)622 compareUnfold(const void *context, const void *left, const void *right) {
623 return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH);
624 }
625
626 void
makeUnfoldData(UErrorCode & errorCode)627 CasePropsBuilder::makeUnfoldData(UErrorCode &errorCode) {
628 if(U_FAILURE(errorCode)) { return; }
629
630 UChar *p, *q;
631 int32_t i, j, k;
632
633 /* sort the data */
634 int32_t unfoldLength=unfold.length();
635 int32_t unfoldRows=unfoldLength/UGENCASE_UNFOLD_WIDTH-1;
636 UChar *unfoldBuffer=unfold.getBuffer(-1);
637 uprv_sortArray(unfoldBuffer+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
638 compareUnfold, NULL, FALSE, &errorCode);
639
640 /* make unique-string rows by merging adjacent ones' code point columns */
641
642 /* make p point to row i-1 */
643 p=unfoldBuffer+UGENCASE_UNFOLD_WIDTH;
644
645 for(i=1; i<unfoldRows;) {
646 if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
647 /* concatenate code point columns */
648 q=p+UGENCASE_UNFOLD_STRING_WIDTH;
649 for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
650 for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
651 q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
652 }
653 if(j>UGENCASE_UNFOLD_CP_WIDTH) {
654 fprintf(stderr, "genprops error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
655 (long)j, UGENCASE_UNFOLD_CP_WIDTH);
656 errorCode=U_BUFFER_OVERFLOW_ERROR;
657 return;
658 }
659
660 /* move following rows up one */
661 --unfoldRows;
662 u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
663 } else {
664 p+=UGENCASE_UNFOLD_WIDTH;
665 ++i;
666 }
667 }
668
669 unfoldBuffer[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
670
671 if(beVerbose) {
672 puts("unfold data:");
673
674 p=unfoldBuffer;
675 for(i=0; i<unfoldRows; ++i) {
676 p+=UGENCASE_UNFOLD_WIDTH;
677 printf("[%2d] %04x %04x %04x <- %04x %04x\n",
678 (int)i, p[0], p[1], p[2], p[3], p[4]);
679 }
680 }
681
682 unfold.releaseBuffer((unfoldRows+1)*UGENCASE_UNFOLD_WIDTH);
683 }
684
685 /* case closure ------------------------------------------------------------- */
686
687 void
addClosureMapping(UChar32 src,UChar32 dest,UErrorCode & errorCode)688 CasePropsBuilder::addClosureMapping(UChar32 src, UChar32 dest, UErrorCode &errorCode) {
689 if(U_FAILURE(errorCode)) { return; }
690
691 if(beVerbose) {
692 printf("add closure mapping U+%04lx->U+%04lx\n",
693 (unsigned long)src, (unsigned long)dest);
694 }
695
696 uint32_t value=utrie2_get32(pTrie, src);
697 if((value&UCASE_EXCEPTION)==0) {
698 /*
699 * decode value into p2 (enough for makeException() to work properly),
700 * add the closure mapping,
701 * and set the new exception for src
702 */
703 value=makeExcProps(src, value, errorCode);
704 utrie2_set32(pTrie, src, value, &errorCode);
705 if(U_FAILURE(errorCode)) {
706 fprintf(stderr, "genprops error: unable to set case mapping values, code: %s\n",
707 u_errorName(errorCode));
708 return;
709 }
710 }
711 excProps[value>>UGENCASE_EXC_SHIFT]->closure.add(dest);
712 }
713
714 /*
715 * Find missing case mapping relationships and add mappings for case closure.
716 * This function starts from an "original" code point and recursively
717 * finds its case mappings and the case mappings of where it maps to.
718 *
719 * The recursion depth is capped at 3 nested calls of this function.
720 * In each call, the current code point is c, and the function enumerates
721 * all of c's simple (single-code point) case mappings.
722 * prev is the code point that case-mapped to c.
723 * prev2 is the code point that case-mapped to prev.
724 *
725 * The initial function call has prev2<0, prev<0, and c==orig
726 * (marking no code points).
727 * It enumerates c's case mappings and recurses without further action.
728 *
729 * The second-level function call has prev2<0, prev==orig, and c is
730 * the destination code point of one of prev's case mappings.
731 * The function checks if any of c's case mappings go back to orig
732 * and adds a closure mapping if not.
733 * In other words, it turns a case mapping relationship of
734 * orig->c
735 * into
736 * orig<->c
737 *
738 * The third-level function call has prev2==orig, prev>=0, and c is
739 * the destination code point of one of prev's case mappings.
740 * (And prev is the destination of one of prev2's case mappings.)
741 * The function checks if any of c's case mappings go back to orig
742 * and adds a closure mapping if not.
743 * In other words, it turns case mapping relationships of
744 * orig->prev->c or orig->prev<->c
745 * into
746 * orig->prev->c->orig or orig->prev<->c->orig
747 * etc.
748 * (Graphically, this closes a triangle.)
749 *
750 * With repeated application on all code points until no more closure mappings
751 * are added, all case equivalence groups get complete mappings.
752 * That is, in each group of code points with case relationships
753 * each code point will in the end have some mapping to each other
754 * code point in the group.
755 *
756 * @return TRUE if a closure mapping was added
757 */
758 UBool
addClosure(UChar32 orig,UChar32 prev2,UChar32 prev,UChar32 c,uint32_t value,UErrorCode & errorCode)759 CasePropsBuilder::addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value,
760 UErrorCode &errorCode) {
761 if(U_FAILURE(errorCode)) { return FALSE; }
762
763 UChar32 next;
764 UBool someMappingsAdded=FALSE;
765
766 if(c!=orig) {
767 /* get the properties for c */
768 value=utrie2_get32(pTrie, c);
769 }
770 /* else if c==orig then c's value was passed in */
771
772 if(value&UCASE_EXCEPTION) {
773 UnicodeSet set;
774
775 ExcProps &ep=*excProps[value>>UGENCASE_EXC_SHIFT];
776 UniProps &p=ep.props;
777
778 /*
779 * marker for whether any of c's mappings goes to orig
780 * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
781 */
782 UBool mapsToOrig=(UBool)(c==orig);
783
784 /* collect c's case mapping destinations in set[] */
785 if((next=p.suc)>=0 && next!=c) {
786 set.add(next);
787 }
788 if((next=p.slc)>=0 && next!=c) {
789 set.add(next);
790 }
791 if(p.suc!=(next=p.stc) && next!=c) {
792 set.add(next);
793 }
794 if((next=p.scf)>=0 && next!=c) {
795 set.add(next);
796 }
797
798 /* add c's current closure mappings to set */
799 set.addAll(ep.closure);
800
801 /* process all code points to which c case-maps */
802 UnicodeSetIterator iter(set);
803 while(iter.next()) {
804 next=iter.getCodepoint(); /* next!=c */
805
806 if(next==orig) {
807 mapsToOrig=TRUE; /* remember that we map to orig */
808 } else if(prev2<0 && next!=prev) {
809 /*
810 * recurse unless
811 * we have reached maximum depth (prev2>=0) or
812 * this is a mapping to one of the previous code points (orig, prev, c)
813 */
814 someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode);
815 }
816 }
817
818 if(!mapsToOrig) {
819 addClosureMapping(c, orig, errorCode);
820 return TRUE;
821 }
822 } else {
823 if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
824 /* one simple case mapping, don't care which one */
825 next=c+UCASE_GET_DELTA(value);
826 if(next!=c) {
827 /*
828 * recurse unless
829 * we have reached maximum depth (prev2>=0) or
830 * this is a mapping to one of the previous code points (orig, prev, c)
831 */
832 if(prev2<0 && next!=orig && next!=prev) {
833 someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode);
834 }
835
836 if(c!=orig && next!=orig) {
837 /* c does not map to orig, add a closure mapping c->orig */
838 addClosureMapping(c, orig, errorCode);
839 return TRUE;
840 }
841 }
842 }
843 }
844
845 return someMappingsAdded;
846 }
847
848 void
makeCaseClosure(UErrorCode & errorCode)849 CasePropsBuilder::makeCaseClosure(UErrorCode &errorCode) {
850 if(U_FAILURE(errorCode)) { return; }
851
852 /*
853 * finalize the "unfold" data because we need to use it to add closure mappings
854 * for situations like FB05->"st"<-FB06
855 * where we would otherwise miss the FB05<->FB06 relationship
856 */
857 makeUnfoldData(errorCode);
858
859 /* use the "unfold" data to add mappings */
860
861 /* p always points to the code points; this loop ignores the strings completely */
862 const UChar *p=unfold.getBuffer()+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
863 int32_t unfoldRows=unfold.length()/UGENCASE_UNFOLD_WIDTH-1;
864
865 for(int32_t i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
866 int32_t j=0;
867 UChar32 c;
868 U16_NEXT_UNSAFE(p, j, c);
869 while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
870 UChar32 c2;
871 U16_NEXT_UNSAFE(p, j, c2);
872 addClosure(c, U_SENTINEL, c, c2, 0, errorCode);
873 }
874 }
875
876 if(beVerbose) {
877 puts("---- ---- ---- ---- (done with closures from unfolding)");
878 }
879
880 /* add further closure mappings from analyzing simple mappings */
881 UBool someMappingsAdded;
882 do {
883 someMappingsAdded=FALSE;
884
885 for(UChar32 c=0; c<=0x10ffff; ++c) {
886 uint32_t value=utrie2_get32(pTrie, c);
887 if(value!=0) {
888 someMappingsAdded|=addClosure(c, U_SENTINEL, U_SENTINEL, c, value, errorCode);
889 }
890 }
891
892 if(beVerbose && someMappingsAdded) {
893 puts("---- ---- ---- ----");
894 }
895 } while(someMappingsAdded);
896 }
897
898 /* exceptions --------------------------------------------------------------- */
899
900 static UBool
fullMappingEqualsSimple(const UnicodeString & s,UChar32 simple,UChar32 c)901 fullMappingEqualsSimple(const UnicodeString &s, UChar32 simple, UChar32 c) {
902 if(simple<=0) {
903 simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
904 }
905 return s.length()==U16_LENGTH(simple) && s.char32At(0)==simple;
906 }
907
908 int32_t
makeException(UChar32 c,uint32_t value,ExcProps & ep,UErrorCode & errorCode)909 CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorCode &errorCode) {
910 if(U_FAILURE(errorCode)) { return 0; }
911
912 /* exceptions.length() might be returned for storing in the trie word */
913 if(exceptions.length()>=UCASE_MAX_EXCEPTIONS) {
914 fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions words\n");
915 errorCode=U_BUFFER_OVERFLOW_ERROR;
916 return 0;
917 }
918
919 /* copy and shift the soft-dotted and case-sensitive bits */
920 UChar excWord=(UChar)((value&(UCASE_DOT_MASK|UCASE_SENSITIVE))<<UCASE_EXC_DOT_SHIFT);
921
922 UniProps &p=ep.props;
923
924 /* set the bits for conditional mappings */
925 if(ep.hasConditionalCaseMappings) {
926 excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
927 p.lc.remove();
928 p.uc.remove();
929 p.tc.remove();
930 }
931 if(ep.hasTurkicCaseFolding) {
932 excWord|=UCASE_EXC_CONDITIONAL_FOLD;
933 p.cf.remove();
934 }
935 if(ep.hasNoSimpleCaseFolding) {
936 excWord|=UCASE_EXC_NO_SIMPLE_CASE_FOLDING;
937 }
938
939 /* remove redundant data */
940 /* do not store full mappings if they are the same as the simple ones */
941 if(fullMappingEqualsSimple(p.lc, p.slc, c)) {
942 p.lc.remove();
943 }
944 if(fullMappingEqualsSimple(p.uc, p.suc, c)) {
945 p.uc.remove();
946 }
947 if(fullMappingEqualsSimple(p.tc, p.stc, c)) {
948 p.tc.remove();
949 }
950 if(fullMappingEqualsSimple(p.cf, p.scf, c)) {
951 p.cf.remove();
952 }
953
954 /* write the optional slots */
955 uint32_t slots[8];
956 uint32_t slotBits=0;
957 int32_t count=0;
958
959 if(ep.delta!=0) {
960 int32_t delta=ep.delta;
961 if(delta<0) {
962 excWord|=UCASE_EXC_DELTA_IS_NEGATIVE;
963 delta=-delta;
964 }
965 slots[count]=(uint32_t)delta;
966 slotBits|=slots[count];
967 ++count;
968 excWord|=U_MASK(UCASE_EXC_DELTA);
969 } else {
970 if(p.slc>=0) {
971 slots[count]=(uint32_t)p.slc;
972 slotBits|=slots[count];
973 ++count;
974 excWord|=U_MASK(UCASE_EXC_LOWER);
975 }
976 if( p.scf>=0 &&
977 (p.slc>=0 ?
978 p.scf!=p.slc :
979 p.scf!=c)) {
980 slots[count]=(uint32_t)p.scf;
981 slotBits|=slots[count];
982 ++count;
983 excWord|=U_MASK(UCASE_EXC_FOLD);
984 }
985 if(p.suc>=0) {
986 slots[count]=(uint32_t)p.suc;
987 slotBits|=slots[count];
988 ++count;
989 excWord|=U_MASK(UCASE_EXC_UPPER);
990 }
991 if(p.suc!=p.stc) {
992 if(p.stc>=0) {
993 slots[count]=(uint32_t)p.stc;
994 } else {
995 slots[count]=(uint32_t)c;
996 }
997 slotBits|=slots[count];
998 ++count;
999 excWord|=U_MASK(UCASE_EXC_TITLE);
1000 }
1001 }
1002
1003 /* length of case closure */
1004 UnicodeString closureString;
1005 if(!ep.closure.isEmpty()) {
1006 UnicodeSetIterator iter(ep.closure);
1007 while(iter.next()) { closureString.append(iter.getCodepoint()); }
1008 int32_t length=closureString.length();
1009 if(length>UCASE_CLOSURE_MAX_LENGTH) {
1010 fprintf(stderr,
1011 "genprops error: case closure for U+%04lX has length %d "
1012 "which exceeds UCASE_CLOSURE_MAX_LENGTH=%d\n",
1013 (long)c, (int)length, (int)UCASE_CLOSURE_MAX_LENGTH);
1014 errorCode=U_BUFFER_OVERFLOW_ERROR;
1015 return 0;
1016 }
1017 slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
1018 slotBits|=slots[count];
1019 ++count;
1020 excWord|=U_MASK(UCASE_EXC_CLOSURE);
1021 }
1022
1023 /* lengths of full case mapping strings, stored in the last slot */
1024 int32_t fullLengths=
1025 p.lc.length()|
1026 (p.cf.length()<<4)|
1027 (p.uc.length()<<8)|
1028 (p.tc.length()<<12);
1029 if(fullLengths!=0) {
1030 slots[count]=(uint32_t)fullLengths;
1031 slotBits|=slots[count];
1032 ++count;
1033 excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
1034 }
1035
1036 if(count==0) {
1037 /* No optional slots: Try to share excWord entries. */
1038 int32_t excIndex=exceptions.indexOf((UChar)excWord);
1039 if(excIndex>=0) {
1040 return excIndex;
1041 }
1042 /* not found */
1043 excIndex=exceptions.length();
1044 exceptions.append((UChar)excWord);
1045 return excIndex;
1046 } else {
1047 /* write slots */
1048 UnicodeString excString;
1049 excString.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */
1050
1051 if(slotBits<=0xffff) {
1052 for(int32_t i=0; i<count; ++i) {
1053 excString.append((UChar)slots[i]);
1054 }
1055 } else {
1056 excWord|=UCASE_EXC_DOUBLE_SLOTS;
1057 for(int32_t i=0; i<count; ++i) {
1058 excString.append((UChar)(slots[i]>>16));
1059 excString.append((UChar)slots[i]);
1060 }
1061 }
1062
1063 /* write the full case mapping strings */
1064 excString.append(p.lc);
1065 excString.append(p.cf);
1066 excString.append(p.uc);
1067 excString.append(p.tc);
1068
1069 /* write the closure data */
1070 excString.append(closureString);
1071
1072 /* write the main exceptions word */
1073 excString.setCharAt(0, (UChar)excWord);
1074
1075 // Try to share data.
1076 if(count==1 && ep.delta!=0) {
1077 int32_t excIndex=exceptions.indexOf(excString);
1078 if(excIndex>=0) {
1079 return excIndex;
1080 }
1081 }
1082 int32_t excIndex=exceptions.length();
1083 exceptions.append(excString);
1084 return excIndex;
1085 }
1086 }
1087
1088 void
makeExceptions(UErrorCode & errorCode)1089 CasePropsBuilder::makeExceptions(UErrorCode &errorCode) {
1090 if(U_FAILURE(errorCode)) { return; }
1091
1092 /*
1093 * Encode case-ignorable as delta==1 on uncased characters,
1094 * and with an exception bit on cased characters and characters with another exception.
1095 *
1096 * Change from temporary UGENCASE_EXC_SHIFT'ed index into excProps[]
1097 * to UCASE_EXC_SHIFT'ed index into encoded exceptions[].
1098 */
1099 for(UChar32 c=0; c<=0x10ffff; ++c) {
1100 uint32_t value=utrie2_get32(pTrie, c);
1101 if(value&UCASE_EXCEPTION) {
1102 int32_t excIndex=makeException(c, value, *excProps[value>>UGENCASE_EXC_SHIFT], errorCode);
1103 value=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|((uint32_t)excIndex<<UCASE_EXC_SHIFT);
1104 utrie2_set32(pTrie, c, value, &errorCode);
1105 }
1106 }
1107 }
1108
1109 /* generate output data ----------------------------------------------------- */
1110
1111 static int32_t indexes[UCASE_IX_TOP]={
1112 UCASE_IX_TOP, 0, 0, 0,
1113 0, 0, 0, 0,
1114 0, 0, 0, 0,
1115 0, 0, 0, 0
1116 };
1117
1118 static uint8_t trieBlock[100000];
1119 static int32_t trieSize;
1120
1121 void
build(UErrorCode & errorCode)1122 CasePropsBuilder::build(UErrorCode &errorCode) {
1123 if(!beQuiet) {
1124 puts("* ucase.icu stats *");
1125 }
1126
1127 makeCaseClosure(errorCode);
1128 if(U_FAILURE(errorCode)) { return; }
1129
1130 /*
1131 * Add one complex mapping to caseSensitive that was filtered out above:
1132 * Greek final Sigma has a conditional mapping but not locale-sensitive,
1133 * and it is taken when lowercasing just U+03A3 alone.
1134 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1135 */
1136 caseSensitive.add(0x3c2);
1137
1138 UnicodeSetIterator iter(caseSensitive);
1139 while(iter.next()) {
1140 UChar32 c=iter.getCodepoint();
1141 uint32_t value=utrie2_get32(pTrie, c);
1142 if((value&UCASE_SENSITIVE)==0) {
1143 utrie2_set32(pTrie, c, value|UCASE_SENSITIVE, &errorCode);
1144 }
1145 }
1146 if(U_FAILURE(errorCode)) {
1147 fprintf(stderr, "genprops/case error: unable to set UCASE_SENSITIVE: %s\n",
1148 u_errorName(errorCode));
1149 return;
1150 }
1151
1152 makeExceptions(errorCode);
1153 if(U_FAILURE(errorCode)) { return; }
1154
1155 utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
1156 if(U_FAILURE(errorCode)) {
1157 fprintf(stderr, "genprops/case error: utrie2_freeze() failed: %s\n",
1158 u_errorName(errorCode));
1159 return;
1160 }
1161 trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
1162 if(U_FAILURE(errorCode)) {
1163 fprintf(stderr, "genprops/case error: utrie2_serialize() failed: %s (length %ld)\n",
1164 u_errorName(errorCode), (long)trieSize);
1165 return;
1166 }
1167
1168 indexes[UCASE_IX_EXC_LENGTH]=exceptions.length();
1169 indexes[UCASE_IX_TRIE_SIZE]=trieSize;
1170 indexes[UCASE_IX_UNFOLD_LENGTH]=unfold.length();
1171 indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptions.length()+2*unfold.length();
1172
1173 indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
1174
1175 if(!beQuiet) {
1176 printf("trie size in bytes: %5d\n", (int)trieSize);
1177 printf("number of code points with exceptions: %5d\n", excPropsCount);
1178 printf("size in bytes of exceptions: %5d\n", 2*exceptions.length());
1179 printf("size in bytes of reverse foldings: %5d\n", 2*unfold.length());
1180 printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
1181 }
1182 }
1183
1184 void
writeCSourceFile(const char * path,UErrorCode & errorCode)1185 CasePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
1186 if(U_FAILURE(errorCode)) { return; }
1187
1188 FILE *f=usrc_create(path, "ucase_props_data.h", 2016,
1189 "icu/tools/unicode/c/genprops/casepropsbuilder.cpp");
1190 if(f==NULL) {
1191 errorCode=U_FILE_ACCESS_ERROR;
1192 return;
1193 }
1194 fputs("#ifdef INCLUDED_FROM_UCASE_CPP\n\n", f);
1195 usrc_writeArray(f,
1196 "static const UVersionInfo ucase_props_dataVersion={",
1197 dataInfo.dataVersion, 8, 4,
1198 "};\n\n");
1199 usrc_writeArray(f,
1200 "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
1201 indexes, 32, UCASE_IX_TOP,
1202 "};\n\n");
1203 usrc_writeUTrie2Arrays(f,
1204 "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL,
1205 pTrie,
1206 "\n};\n\n");
1207 usrc_writeArray(f,
1208 "static const uint16_t ucase_props_exceptions[%ld]={\n",
1209 exceptions.getBuffer(), 16, exceptions.length(),
1210 "\n};\n\n");
1211 usrc_writeArray(f,
1212 "static const uint16_t ucase_props_unfold[%ld]={\n",
1213 unfold.getBuffer(), 16, unfold.length(),
1214 "\n};\n\n");
1215 fputs(
1216 "static const UCaseProps ucase_props_singleton={\n"
1217 " NULL,\n"
1218 " ucase_props_indexes,\n"
1219 " ucase_props_exceptions,\n"
1220 " ucase_props_unfold,\n",
1221 f);
1222 usrc_writeUTrie2Struct(f,
1223 " {\n",
1224 pTrie, "ucase_props_trieIndex", NULL,
1225 " },\n");
1226 usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n");
1227 fputs("};\n\n"
1228 "#endif // INCLUDED_FROM_UCASE_CPP\n", f);
1229 fclose(f);
1230 }
1231
1232 void
writeBinaryData(const char * path,UBool withCopyright,UErrorCode & errorCode)1233 CasePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
1234 if(U_FAILURE(errorCode)) { return; }
1235
1236 UNewDataMemory *pData=udata_create(path, "icu", "ucase", &dataInfo,
1237 withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1238 if(U_FAILURE(errorCode)) {
1239 fprintf(stderr, "genprops: udata_create(%s, ucase.icu) failed - %s\n",
1240 path, u_errorName(errorCode));
1241 return;
1242 }
1243
1244 udata_writeBlock(pData, indexes, sizeof(indexes));
1245 udata_writeBlock(pData, trieBlock, trieSize);
1246 udata_writeBlock(pData, exceptions.getBuffer(), 2*exceptions.length());
1247 udata_writeBlock(pData, unfold.getBuffer(), 2*unfold.length());
1248
1249 /* finish up */
1250 long dataLength=udata_finish(pData, &errorCode);
1251 if(U_FAILURE(errorCode)) {
1252 fprintf(stderr, "genprops error: casepropsbuilder error %d writing the output file\n", errorCode);
1253 exit(errorCode);
1254 }
1255
1256 if(dataLength!=indexes[UCASE_IX_LENGTH]) {
1257 fprintf(stderr,
1258 "udata_finish(ucase.icu) reports %ld bytes written but should be %ld\n",
1259 dataLength, (long)indexes[UCASE_IX_LENGTH]);
1260 errorCode=U_INTERNAL_PROGRAM_ERROR;
1261 }
1262 }
1263
1264 PropsBuilder *
createCasePropsBuilder(UErrorCode & errorCode)1265 createCasePropsBuilder(UErrorCode &errorCode) {
1266 if(U_FAILURE(errorCode)) { return NULL; }
1267 PropsBuilder *pb=new CasePropsBuilder(errorCode);
1268 if(pb==NULL) {
1269 errorCode=U_MEMORY_ALLOCATION_ERROR;
1270 }
1271 return pb;
1272 }
1273
1274 /*
1275 * Hey, Emacs, please set the following:
1276 *
1277 * Local Variables:
1278 * indent-tabs-mode: nil
1279 * End:
1280 *
1281 */
1282