• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2000-2009, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  genuca.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created at the end of XX century
14 *   created by: Vladimir Weinstein
15 *
16 *   This program reads the Fractional UCA table and generates
17 *   internal format for UCA table as well as inverse UCA table.
18 *   It then writes binary files containing the data: ucadata.dat
19 *   & invuca.dat
20 *   Change history:
21 *   02/23/2001  grhoten                 Made it into a tool
22 *   02/23/2001  weiv                    Moved element & table handling code to i18n
23 *   05/09/2001  weiv                    Case bits are now in the CEs, not in front
24 */
25 
26 #include "unicode/utypes.h"
27 #include "unicode/putil.h"
28 #include "unicode/udata.h"
29 #include "unicode/uclean.h"
30 #include "ucol_imp.h"
31 #include "genuca.h"
32 #include "uoptions.h"
33 #include "toolutil.h"
34 #include "unewdata.h"
35 #include "cstring.h"
36 #include "cmemory.h"
37 
38 #include <stdio.h>
39 
40 /*
41  * Global - verbosity
42  */
43 UBool VERBOSE = FALSE;
44 
45 static UVersionInfo UCAVersion;
46 
47 #if UCONFIG_NO_COLLATION
48 
49 /* dummy UDataInfo cf. udata.h */
50 static UDataInfo dummyDataInfo = {
51     sizeof(UDataInfo),
52     0,
53 
54     U_IS_BIG_ENDIAN,
55     U_CHARSET_FAMILY,
56     U_SIZEOF_UCHAR,
57     0,
58 
59     { 0, 0, 0, 0 },                 /* dummy dataFormat */
60     { 0, 0, 0, 0 },                 /* dummy formatVersion */
61     { 0, 0, 0, 0 }                  /* dummy dataVersion */
62 };
63 
64 #else
65 
66 static const UDataInfo ucaDataInfo={
67     sizeof(UDataInfo),
68     0,
69 
70     U_IS_BIG_ENDIAN,
71     U_CHARSET_FAMILY,
72     sizeof(UChar),
73     0,
74 
75     {UCA_DATA_FORMAT_0, UCA_DATA_FORMAT_1, UCA_DATA_FORMAT_2, UCA_DATA_FORMAT_3},     /* dataFormat="UCol"            */
76     /* 03/26/2002 bumped up version since format has changed */
77     /* 09/16/2002 bumped up version since we went from UColAttributeValue */
78     /*            to int32_t in UColOptionSet */
79     /* 05/13/2003 This one also updated since we added UCA and UCD versions */
80     /*            to header */
81     /* 09/11/2003 Adding information required by data swapper */
82     {UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1, UCA_FORMAT_VERSION_2, UCA_FORMAT_VERSION_3},                 /* formatVersion                */
83     {0, 0, 0, 0}                  /* dataVersion = Unicode Version*/
84 };
85 
86 static const UDataInfo invUcaDataInfo={
87     sizeof(UDataInfo),
88     0,
89 
90     U_IS_BIG_ENDIAN,
91     U_CHARSET_FAMILY,
92     sizeof(UChar),
93     0,
94 
95     {INVUCA_DATA_FORMAT_0, INVUCA_DATA_FORMAT_1, INVUCA_DATA_FORMAT_2, INVUCA_DATA_FORMAT_3},     /* dataFormat="InvC"            */
96     /* 03/26/2002 bumped up version since format has changed */
97     /* 04/29/2003 2.1 format - we have added UCA version to header */
98     {INVUCA_FORMAT_VERSION_0, INVUCA_FORMAT_VERSION_1, INVUCA_FORMAT_VERSION_2, INVUCA_FORMAT_VERSION_3},                 /* formatVersion                */
99     {0, 0, 0, 0}                  /* dataVersion = Unicode Version*/
100 };
101 
102 UCAElements le;
103 
readElement(char ** from,char * to,char separator,UErrorCode * status)104 int32_t readElement(char **from, char *to, char separator, UErrorCode *status) {
105     if(U_FAILURE(*status)) {
106         return 0;
107     }
108     char buffer[1024];
109     int32_t i = 0;
110     while(**from != separator) {
111         if(**from != ' ') {
112             *(buffer+i++) = **from;
113         }
114         (*from)++;
115     }
116     (*from)++;
117     *(buffer + i) = 0;
118     //*to = (char *)malloc(strlen(buffer)+1);
119     strcpy(to, buffer);
120     return i/2;
121 }
122 
123 
getSingleCEValue(char * primary,char * secondary,char * tertiary,UErrorCode * status)124 uint32_t getSingleCEValue(char *primary, char *secondary, char *tertiary, UErrorCode *status) {
125     if(U_FAILURE(*status)) {
126         return 0;
127     }
128     uint32_t value = 0;
129     char primsave = '\0';
130     char secsave = '\0';
131     char tersave = '\0';
132     char *primend = primary+4;
133     if(strlen(primary) > 4) {
134         primsave = *primend;
135         *primend = '\0';
136     }
137     char *secend = secondary+2;
138     if(strlen(secondary) > 2) {
139         secsave = *secend;
140         *secend = '\0';
141     }
142     char *terend = tertiary+2;
143     if(strlen(tertiary) > 2) {
144         tersave = *terend;
145         *terend = '\0';
146     }
147     uint32_t primvalue = (uint32_t)((*primary!='\0')?strtoul(primary, &primend, 16):0);
148     uint32_t secvalue = (uint32_t)((*secondary!='\0')?strtoul(secondary, &secend, 16):0);
149     uint32_t tervalue = (uint32_t)((*tertiary!='\0')?strtoul(tertiary, &terend, 16):0);
150     if(primvalue <= 0xFF) {
151       primvalue <<= 8;
152     }
153 
154     value = ((primvalue<<UCOL_PRIMARYORDERSHIFT)&UCOL_PRIMARYORDERMASK)|
155         ((secvalue<<UCOL_SECONDARYORDERSHIFT)&UCOL_SECONDARYORDERMASK)|
156         (tervalue&UCOL_TERTIARYORDERMASK);
157 
158     if(primsave!='\0') {
159         *primend = primsave;
160     }
161     if(secsave!='\0') {
162         *secend = secsave;
163     }
164     if(tersave!='\0') {
165         *terend = tersave;
166     }
167     return value;
168 }
169 
170 static uint32_t inverseTable[0xFFFF][3];
171 static uint32_t inversePos = 0;
172 static UChar stringContinue[0xFFFF];
173 static uint32_t sContPos = 0;
174 
addNewInverse(UCAElements * element,UErrorCode * status)175 static void addNewInverse(UCAElements *element, UErrorCode *status) {
176   if(U_FAILURE(*status)) {
177     return;
178   }
179   if(VERBOSE && isContinuation(element->CEs[1])) {
180     //fprintf(stdout, "+");
181   }
182   inversePos++;
183   inverseTable[inversePos][0] = element->CEs[0];
184   if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
185     inverseTable[inversePos][1] = element->CEs[1];
186   } else {
187     inverseTable[inversePos][1] = 0;
188   }
189   if(element->cSize < 2) {
190     inverseTable[inversePos][2] = element->cPoints[0];
191   } else { /* add a new store of cruft */
192     inverseTable[inversePos][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
193     memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
194     sContPos += element->cSize+1;
195   }
196 }
197 
insertInverse(UCAElements * element,uint32_t position,UErrorCode * status)198 static void insertInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
199   if(U_FAILURE(*status)) {
200     return;
201   }
202 
203   if(VERBOSE && isContinuation(element->CEs[1])) {
204     //fprintf(stdout, "+");
205   }
206   if(position <= inversePos) {
207     /*move stuff around */
208     uint32_t amountToMove = (inversePos - position+1)*sizeof(inverseTable[0]);
209     uprv_memmove(inverseTable[position+1], inverseTable[position], amountToMove);
210   }
211   inverseTable[position][0] = element->CEs[0];
212   if(element->noOfCEs > 1 && isContinuation(element->CEs[1])) {
213     inverseTable[position][1] = element->CEs[1];
214   } else {
215     inverseTable[position][1] = 0;
216   }
217   if(element->cSize < 2) {
218     inverseTable[position][2] = element->cPoints[0];
219   } else { /* add a new store of cruft */
220     inverseTable[position][2] = ((element->cSize+1) << UCOL_INV_SHIFTVALUE) | sContPos;
221     memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
222     sContPos += element->cSize+1;
223   }
224   inversePos++;
225 }
226 
addToExistingInverse(UCAElements * element,uint32_t position,UErrorCode * status)227 static void addToExistingInverse(UCAElements *element, uint32_t position, UErrorCode *status) {
228 
229   if(U_FAILURE(*status)) {
230     return;
231   }
232 
233       if((inverseTable[position][2] & UCOL_INV_SIZEMASK) == 0) { /* single element, have to make new extension place and put both guys there */
234         stringContinue[sContPos] = (UChar)inverseTable[position][2];
235         inverseTable[position][2] = ((element->cSize+3) << UCOL_INV_SHIFTVALUE) | sContPos;
236         sContPos++;
237         stringContinue[sContPos++] = 0xFFFF;
238         memcpy(stringContinue+sContPos, element->cPoints, element->cSize*sizeof(UChar));
239         sContPos += element->cSize;
240         stringContinue[sContPos++] = 0xFFFE;
241       } else { /* adding to the already existing continuing table */
242         uint32_t contIndex = inverseTable[position][2] & UCOL_INV_OFFSETMASK;
243         uint32_t contSize = (inverseTable[position][2] & UCOL_INV_SIZEMASK) >> UCOL_INV_SHIFTVALUE;
244 
245         if(contIndex+contSize < sContPos) {
246           /*fprintf(stderr, ".", sContPos, contIndex+contSize);*/
247           memcpy(stringContinue+contIndex+contSize+element->cSize+1, stringContinue+contIndex+contSize, (element->cSize+1)*sizeof(UChar));
248         }
249 
250         stringContinue[contIndex+contSize-1] = 0xFFFF;
251         memcpy(stringContinue+contIndex+contSize, element->cPoints, element->cSize*sizeof(UChar));
252         sContPos += element->cSize+1;
253         stringContinue[contIndex+contSize+element->cSize] = 0xFFFE;
254 
255         inverseTable[position][2] = ((contSize+element->cSize+1) << UCOL_INV_SHIFTVALUE) | contIndex;
256       }
257 }
258 
259 /*
260  * Takes two CEs (lead and continuation) and
261  * compares them as CEs should be compared:
262  * primary vs. primary, secondary vs. secondary
263  * tertiary vs. tertiary
264  */
compareCEs(uint32_t * source,uint32_t * target)265 static int32_t compareCEs(uint32_t *source, uint32_t *target) {
266   uint32_t s1 = source[0], s2, t1 = target[0], t2;
267   if(isContinuation(source[1])) {
268     s2 = source[1];
269   } else {
270     s2 = 0;
271   }
272   if(isContinuation(target[1])) {
273     t2 = target[1];
274   } else {
275     t2 = 0;
276   }
277 
278   uint32_t s = 0, t = 0;
279   if(s1 == t1 && s2 == t2) {
280     return 0;
281   }
282   s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
283   t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
284   if(s < t) {
285     return -1;
286   } else if(s > t) {
287     return 1;
288   } else {
289     s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
290     t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
291     if(s < t) {
292       return -1;
293     } else if(s > t) {
294       return 1;
295     } else {
296       s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
297       t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
298       if(s < t) {
299         return -1;
300       } else {
301         return 1;
302       }
303     }
304   }
305 }
306 
addToInverse(UCAElements * element,UErrorCode * status)307 static uint32_t addToInverse(UCAElements *element, UErrorCode *status) {
308   uint32_t position = inversePos;
309   uint32_t saveElement = element->CEs[0];
310   int32_t compResult = 0;
311   element->CEs[0] &= 0xFFFFFF3F;
312   if(element->noOfCEs == 1) {
313     element->CEs[1] = 0;
314   }
315   if(inversePos == 0) {
316     inverseTable[0][0] = inverseTable[0][1] = inverseTable[0][2] = 0;
317     addNewInverse(element, status);
318   } else if(compareCEs(inverseTable[inversePos], element->CEs) > 0) {
319     while((compResult = compareCEs(inverseTable[--position], element->CEs)) > 0);
320     if(VERBOSE) { fprintf(stdout, "p:%u ", (int)position); }
321     if(compResult == 0) {
322       addToExistingInverse(element, position, status);
323     } else {
324       insertInverse(element, position+1, status);
325     }
326   } else if(compareCEs(inverseTable[inversePos], element->CEs) == 0) {
327     addToExistingInverse(element, inversePos, status);
328   } else {
329     addNewInverse(element, status);
330   }
331   element->CEs[0] = saveElement;
332   if(VERBOSE) { fprintf(stdout, "+"); }
333   return inversePos;
334 }
335 
assembleInverseTable(UErrorCode * status)336 static InverseUCATableHeader *assembleInverseTable(UErrorCode *status)
337 {
338   InverseUCATableHeader *result = NULL;
339   uint32_t headerByteSize = paddedsize(sizeof(InverseUCATableHeader));
340   uint32_t inverseTableByteSize = (inversePos+2)*sizeof(uint32_t)*3;
341   uint32_t contsByteSize = sContPos * sizeof(UChar);
342   uint32_t i = 0;
343 
344   result = (InverseUCATableHeader *)uprv_malloc(headerByteSize + inverseTableByteSize + contsByteSize);
345   uprv_memset(result, 0, headerByteSize + inverseTableByteSize + contsByteSize);
346   if(result != NULL) {
347     result->byteSize = headerByteSize + inverseTableByteSize + contsByteSize;
348 
349     inversePos++;
350     inverseTable[inversePos][0] = 0xFFFFFFFF;
351     inverseTable[inversePos][1] = 0xFFFFFFFF;
352     inverseTable[inversePos][2] = 0x0000FFFF;
353     inversePos++;
354 
355     for(i = 2; i<inversePos; i++) {
356       if(compareCEs(inverseTable[i-1], inverseTable[i]) > 0) {
357         fprintf(stderr, "Error at %i: %08X & %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i][0]);
358       } else if(inverseTable[i-1][0] == inverseTable[i][0] && !(inverseTable[i-1][1] < inverseTable[i][1])) {
359         fprintf(stderr, "Continuation error at %i: %08X %08X & %08X %08X\n", (int)i, (int)inverseTable[i-1][0], (int)inverseTable[i-1][1], (int)inverseTable[i][0], (int)inverseTable[i][1]);
360       }
361     }
362 
363     result->tableSize = inversePos;
364     result->contsSize = sContPos;
365 
366     result->table = headerByteSize;
367     result->conts = headerByteSize + inverseTableByteSize;
368 
369     memcpy((uint8_t *)result + result->table, inverseTable, inverseTableByteSize);
370     memcpy((uint8_t *)result + result->conts, stringContinue, contsByteSize);
371 
372   } else {
373     *status = U_MEMORY_ALLOCATION_ERROR;
374     return NULL;
375   }
376 
377   return result;
378 }
379 
380 
writeOutInverseData(InverseUCATableHeader * data,const char * outputDir,const char * copyright,UErrorCode * status)381 static void writeOutInverseData(InverseUCATableHeader *data,
382                   const char *outputDir,
383                   const char *copyright,
384                   UErrorCode *status)
385 {
386     UNewDataMemory *pData;
387 
388     long dataLength;
389 
390     UDataInfo invUcaInfo;
391     uprv_memcpy(&invUcaInfo, &invUcaDataInfo, sizeof(UDataInfo));
392     u_getUnicodeVersion(invUcaInfo.dataVersion);
393 
394     pData=udata_create(outputDir, INVC_DATA_TYPE, INVC_DATA_NAME, &invUcaInfo,
395                        copyright, status);
396 
397     if(U_FAILURE(*status)) {
398         fprintf(stderr, "Error: unable to create %s"INVC_DATA_NAME", error %s\n", outputDir, u_errorName(*status));
399         return;
400     }
401 
402     /* write the data to the file */
403     if (VERBOSE) {
404         fprintf(stdout, "Writing out inverse UCA table: %s%c%s.%s\n", outputDir, U_FILE_SEP_CHAR,
405                                                                 INVC_DATA_NAME,
406                                                                 INVC_DATA_TYPE);
407     }
408     udata_writeBlock(pData, data, data->byteSize);
409 
410     /* finish up */
411     dataLength=udata_finish(pData, status);
412     if(U_FAILURE(*status)) {
413         fprintf(stderr, "Error: error %d writing the output file\n", *status);
414         return;
415     }
416 }
417 
418 
419 
/*
 * Converts one hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F')
 * to its numeric value. Any other character yields 0.
 */
static int32_t hex2num(char hex) {
    int32_t digit = 0;
    if('0' <= hex && hex <= '9') {
        digit = hex - '0';
    } else if('a' <= hex && hex <= 'f') {
        digit = 10 + (hex - 'a');
    } else if('A' <= hex && hex <= 'F') {
        digit = 10 + (hex - 'A');
    }
    return digit;
}
431 
readAnElement(FILE * data,tempUCATable * t,UCAConstants * consts,UErrorCode * status)432 UCAElements *readAnElement(FILE *data, tempUCATable *t, UCAConstants *consts, UErrorCode *status) {
433     char buffer[2048], primary[100], secondary[100], tertiary[100];
434     UBool detectedContraction;
435     int32_t i = 0;
436     unsigned int theValue;
437     char *pointer = NULL;
438     char *commentStart = NULL;
439     char *startCodePoint = NULL;
440     char *endCodePoint = NULL;
441     char *spacePointer = NULL;
442     char *dashPointer = NULL;
443     char *result = fgets(buffer, 2048, data);
444     int32_t buflen = (int32_t)uprv_strlen(buffer);
445     if(U_FAILURE(*status)) {
446         return 0;
447     }
448     *primary = *secondary = *tertiary = '\0';
449     if(result == NULL) {
450         if(feof(data)) {
451             return NULL;
452         } else {
453             fprintf(stderr, "empty line but no EOF!\n");
454             *status = U_INVALID_FORMAT_ERROR;
455             return NULL;
456         }
457     }
458     while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
459       buffer[--buflen] = 0;
460     }
461 
462     if(buffer[0] == 0 || buffer[0] == '#') {
463         return NULL; // just a comment, skip whole line
464     }
465 
466     UCAElements *element = &le; //(UCAElements *)malloc(sizeof(UCAElements));
467 
468     enum ActionType {
469       READCE,
470       READHEX,
471       READUCAVERSION
472     };
473 
474     // Directives.
475     if(buffer[0] == '[') {
476       uint32_t cnt = 0;
477       static const struct {
478         char name[128];
479         uint32_t *what;
480         ActionType what_to_do;
481       } vt[]  = { {"[first tertiary ignorable",  consts->UCA_FIRST_TERTIARY_IGNORABLE,  READCE},
482                   {"[last tertiary ignorable",   consts->UCA_LAST_TERTIARY_IGNORABLE,   READCE},
483                   {"[first secondary ignorable", consts->UCA_FIRST_SECONDARY_IGNORABLE, READCE},
484                   {"[last secondary ignorable",  consts->UCA_LAST_SECONDARY_IGNORABLE,  READCE},
485                   {"[first primary ignorable",   consts->UCA_FIRST_PRIMARY_IGNORABLE,   READCE},
486                   {"[last primary ignorable",    consts->UCA_LAST_PRIMARY_IGNORABLE,    READCE},
487                   {"[first variable",            consts->UCA_FIRST_VARIABLE,            READCE},
488                   {"[last variable",             consts->UCA_LAST_VARIABLE,             READCE},
489                   {"[first regular",             consts->UCA_FIRST_NON_VARIABLE,        READCE},
490                   {"[last regular",              consts->UCA_LAST_NON_VARIABLE,         READCE},
491                   {"[first implicit",            consts->UCA_FIRST_IMPLICIT,            READCE},
492                   {"[last implicit",             consts->UCA_LAST_IMPLICIT,             READCE},
493                   {"[first trailing",            consts->UCA_FIRST_TRAILING,            READCE},
494                   {"[last trailing",             consts->UCA_LAST_TRAILING,             READCE},
495 
496                   {"[fixed top",                       &consts->UCA_PRIMARY_TOP_MIN,           READHEX},
497                   {"[fixed first implicit byte",       &consts->UCA_PRIMARY_IMPLICIT_MIN,      READHEX},
498                   {"[fixed last implicit byte",        &consts->UCA_PRIMARY_IMPLICIT_MAX,      READHEX},
499                   {"[fixed first trail byte",          &consts->UCA_PRIMARY_TRAILING_MIN,      READHEX},
500                   {"[fixed last trail byte",           &consts->UCA_PRIMARY_TRAILING_MAX,      READHEX},
501                   {"[fixed first special byte",        &consts->UCA_PRIMARY_SPECIAL_MIN,       READHEX},
502                   {"[fixed last special byte",         &consts->UCA_PRIMARY_SPECIAL_MAX,       READHEX},
503                   {"[variable top = ",                &t->options->variableTopValue,          READHEX},
504                   {"[UCA version = ",                 NULL,                          READUCAVERSION}
505       };
506       for (cnt = 0; cnt<sizeof(vt)/sizeof(vt[0]); cnt++) {
507         uint32_t vtLen = (uint32_t)uprv_strlen(vt[cnt].name);
508         if(uprv_strncmp(buffer, vt[cnt].name, vtLen) == 0) {
509             element->variableTop = TRUE;
510             if(vt[cnt].what_to_do == READHEX) {
511               if(sscanf(buffer+vtLen, "%4x", &theValue) != 1) /* read first code point */
512               {
513                   fprintf(stderr, " scanf(hex) failed on !\n ");
514               }
515               *(vt[cnt].what) = (UChar)theValue;
516               //if(cnt == 1) { // first implicit
517                 // we need to set the value for top next
518                 //uint32_t nextTop = ucol_prv_calculateImplicitPrimary(0x4E00); // CJK base
519                 //consts->UCA_NEXT_TOP_VALUE = theValue<<24 | 0x030303;
520               //}
521             } else if (vt[cnt].what_to_do == READCE) { /* vt[cnt].what_to_do == READCE */
522               // TODO: combine & clean up the two CE parsers
523               pointer = strchr(buffer+vtLen, '[');
524               if(pointer) {
525                 pointer++;
526                 element->sizePrim[0]=readElement(&pointer, primary, ',', status);
527                 element->sizeSec[0]=readElement(&pointer, secondary, ',', status);
528                 element->sizeTer[0]=readElement(&pointer, tertiary, ']', status);
529 
530                 vt[cnt].what[0] = getSingleCEValue(primary, secondary, tertiary, status);
531                 if(element->sizePrim[0] > 2 || element->sizeSec[0] > 1 || element->sizeTer[0] > 1) {
532                   uint32_t CEi = 1;
533                   uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
534                     if(2*CEi<element->sizePrim[i]) {
535                         value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
536                         value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
537                     }
538 
539                     if(2*CEi+1<element->sizePrim[i]) {
540                         value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
541                         value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
542                     }
543 
544                     if(CEi<element->sizeSec[i]) {
545                         value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
546                         value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
547                     }
548 
549                     if(CEi<element->sizeTer[i]) {
550                         value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
551                         value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
552                     }
553 
554                     CEi++;
555 
556                     vt[cnt].what[1] = value;
557                     //element->CEs[CEindex++] = value;
558                 } else {
559                   vt[cnt].what[1] = 0;
560                 }
561               } else {
562                 fprintf(stderr, "Failed to read a CE from line %s\n", buffer);
563               }
564             } else { //vt[cnt].what_to_do == READUCAVERSION
565               u_versionFromString(UCAVersion, buffer+vtLen);
566               if(VERBOSE) {
567                 fprintf(stdout, "UCA version [%hu.%hu.%hu.%hu]\n", UCAVersion[0], UCAVersion[1], UCAVersion[2], UCAVersion[3]);
568               }
569             }
570             //element->cPoints[0] = (UChar)theValue;
571             //return element;
572             return NULL;
573         }
574       }
575       fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
576       //*status = U_INVALID_FORMAT_ERROR;
577       return NULL;
578     }
579     element->variableTop = FALSE;
580 
581     startCodePoint = buffer;
582     endCodePoint = strchr(startCodePoint, ';');
583 
584     if(endCodePoint == 0) {
585         fprintf(stderr, "error - line with no code point!\n");
586         *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
587         return NULL;
588     } else {
589         *(endCodePoint) = 0;
590     }
591 
592     memset(element, 0, sizeof(*element));
593 
594     element->cPoints = element->uchars;
595 
596     spacePointer = strchr(buffer, ' ');
597     if(sscanf(buffer, "%4x", &theValue) != 1) /* read first code point */
598     {
599       fprintf(stderr, " scanf(hex) failed!\n ");
600     }
601     element->cPoints[0] = (UChar)theValue;
602 
603     if(spacePointer == 0) {
604         detectedContraction = FALSE;
605         element->cSize = 1;
606     } else {
607         dashPointer = strchr(buffer, '|');
608         if (dashPointer != NULL) {
609             // prefix characters
610             element->prefixChars[0] = (UChar)theValue;
611             element->prefixSize = 1;
612             element->prefix = element->prefixChars;
613             sscanf(dashPointer+1, "%4x", &theValue);
614             element->cPoints[0] = (UChar)theValue;
615             element->cSize = 1;
616         }
617         else {
618           // Contractions or surrogate characters.
619             i = 1;
620             detectedContraction = TRUE;
621             while(spacePointer != NULL) {
622                 sscanf(spacePointer+1, "%4x", &theValue);
623                 element->cPoints[i++] = (UChar)theValue;
624                 spacePointer = strchr(spacePointer+1, ' ');
625             }
626             element->cSize = i;
627         }
628 
629 
630         //fprintf(stderr, "Number of codepoints in contraction: %i\n", i);
631     }
632 
633     startCodePoint = endCodePoint+1;
634 
635     commentStart = strchr(startCodePoint, '#');
636     if(commentStart == NULL) {
637         commentStart = strlen(startCodePoint) + startCodePoint;
638     }
639 
640     i = 0;
641     uint32_t CEindex = 0;
642     element->noOfCEs = 0;
643     for(;;) {
644         endCodePoint = strchr(startCodePoint, ']');
645         if(endCodePoint == NULL || endCodePoint >= commentStart) {
646             break;
647         }
648         pointer = strchr(startCodePoint, '[');
649         pointer++;
650 
651         element->sizePrim[i]=readElement(&pointer, primary, ',', status);
652         element->sizeSec[i]=readElement(&pointer, secondary, ',', status);
653         element->sizeTer[i]=readElement(&pointer, tertiary, ']', status);
654 
655 
656         /* I want to get the CEs entered right here, including continuation */
657         element->CEs[CEindex++] = getSingleCEValue(primary, secondary, tertiary, status);
658 
659         uint32_t CEi = 1;
660         while(2*CEi<element->sizePrim[i] || CEi<element->sizeSec[i] || CEi<element->sizeTer[i]) {
661           uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
662             if(2*CEi<element->sizePrim[i]) {
663                 value |= ((hex2num(*(primary+4*CEi))&0xF)<<28);
664                 value |= ((hex2num(*(primary+4*CEi+1))&0xF)<<24);
665             }
666 
667             if(2*CEi+1<element->sizePrim[i]) {
668                 value |= ((hex2num(*(primary+4*CEi+2))&0xF)<<20);
669                 value |= ((hex2num(*(primary+4*CEi+3))&0xF)<<16);
670             }
671 
672             if(CEi<element->sizeSec[i]) {
673                 value |= ((hex2num(*(secondary+2*CEi))&0xF)<<12);
674                 value |= ((hex2num(*(secondary+2*CEi+1))&0xF)<<8);
675             }
676 
677             if(CEi<element->sizeTer[i]) {
678                 value |= ((hex2num(*(tertiary+2*CEi))&0x3)<<4);
679                 value |= (hex2num(*(tertiary+2*CEi+1))&0xF);
680             }
681 
682             CEi++;
683 
684             element->CEs[CEindex++] = value;
685         }
686 
687       startCodePoint = endCodePoint+1;
688       i++;
689     }
690     element->noOfCEs = CEindex;
691 #if 0
692     element->isThai = UCOL_ISTHAIPREVOWEL(element->cPoints[0]);
693 #endif
694     // we don't want any strange stuff after useful data!
695     if (pointer == NULL) {
696         /* huh? Did we get ']' without the '['? Pair your brackets! */
697         *status=U_INVALID_FORMAT_ERROR;
698     }
699     else {
700         while(pointer < commentStart)  {
701             if(*pointer != ' ' && *pointer != '\t')
702             {
703                 *status=U_INVALID_FORMAT_ERROR;
704                 break;
705             }
706             pointer++;
707         }
708     }
709     // Check for valid bytes in CE weights.
710     // TODO: Tighten this so that it allows 03 & 04 in intermediate bytes
711     // but not in final bytes.
712     // See http://bugs.icu-project.org/trac/ticket/7167
713     for (i = 0; i < (int32_t)CEindex; ++i) {
714         uint32_t value = element->CEs[i];
715         uint8_t bytes[4] = {
716             (uint8_t)(value >> 24),
717             (uint8_t)(value >> 16),
718             (uint8_t)(value >> 8),
719             (uint8_t)(value & UCOL_NEW_TERTIARYORDERMASK)
720         };
721         for (int j = 0; j < 4; ++j) {
722             uint8_t maxByte =
723                 (isContinuation(value) || j == 1) ?
724                     UCOL_BYTE_FIRST_TAILORED :
725                     UCOL_BYTE_COMMON;
726             if (0 != bytes[j] && bytes[j] < maxByte) {
727                 fprintf(stderr, "Warning: invalid UCA weight byte %02X for %s\n", bytes[j], buffer);
728                 // TODO: return NULL;
729             }
730         }
731     }
732 
733     if(U_FAILURE(*status)) {
734         fprintf(stderr, "problem putting stuff in hash table %s\n", u_errorName(*status));
735         *status = U_INTERNAL_PROGRAM_ERROR;
736         return NULL;
737     }
738 
739     return element;
740 }
741 
742 
writeOutData(UCATableHeader * data,UCAConstants * consts,UChar contractions[][3],uint32_t noOfcontractions,const char * outputDir,const char * copyright,UErrorCode * status)743 void writeOutData(UCATableHeader *data,
744                   UCAConstants *consts,
745                   UChar contractions[][3],
746                   uint32_t noOfcontractions,
747                   const char *outputDir,
748                   const char *copyright,
749                   UErrorCode *status)
750 {
751     if(U_FAILURE(*status)) {
752         return;
753     }
754 
755     uint32_t size = data->size;
756 
757     data->UCAConsts = data->size;
758     data->size += paddedsize(sizeof(UCAConstants));
759 
760     if(noOfcontractions != 0) {
761       contractions[noOfcontractions][0] = 0;
762       contractions[noOfcontractions][1] = 0;
763       contractions[noOfcontractions][2] = 0;
764       noOfcontractions++;
765 
766 
767       data->contractionUCACombos = data->size;
768       data->contractionUCACombosWidth = 3;
769       data->contractionUCACombosSize = noOfcontractions;
770       data->size += paddedsize((noOfcontractions*3*sizeof(UChar)));
771     }
772 
773     UNewDataMemory *pData;
774 
775     long dataLength;
776     UDataInfo ucaInfo;
777     uprv_memcpy(&ucaInfo, &ucaDataInfo, sizeof(UDataInfo));
778     u_getUnicodeVersion(ucaInfo.dataVersion);
779 
780     pData=udata_create(outputDir, UCA_DATA_TYPE, UCA_DATA_NAME, &ucaInfo,
781                        copyright, status);
782 
783     if(U_FAILURE(*status)) {
784         fprintf(stderr, "Error: unable to create %s"UCA_DATA_NAME", error %s\n", outputDir, u_errorName(*status));
785         return;
786     }
787 
788     /* write the data to the file */
789     if (VERBOSE) {
790         fprintf(stdout, "Writing out UCA table: %s%c%s.%s\n", outputDir,
791                                                         U_FILE_SEP_CHAR,
792                                                         U_ICUDATA_NAME "_" UCA_DATA_NAME,
793                                                         UCA_DATA_TYPE);
794     }
795     udata_writeBlock(pData, data, size);
796 
797     // output the constants here
798     udata_writeBlock(pData, consts, sizeof(UCAConstants));
799 
800     if(noOfcontractions != 0) {
801       udata_writeBlock(pData, contractions, noOfcontractions*3*sizeof(UChar));
802       udata_writePadding(pData, paddedsize((noOfcontractions*3*sizeof(UChar))) - noOfcontractions*3*sizeof(uint16_t));
803     }
804 
805     /* finish up */
806     dataLength=udata_finish(pData, status);
807     if(U_FAILURE(*status)) {
808         fprintf(stderr, "Error: error %d writing the output file\n", *status);
809         return;
810     }
811 }
812 
/*
 * Upper bound on the number of UCA contraction entries this tool can collect.
 * A newer Unicode version may require a larger value.
 */
enum { MAX_UCA_CONTRACTION_CES = 2048 };
820 
821 static int32_t
write_uca_table(const char * filename,const char * outputDir,const char * copyright,UErrorCode * status)822 write_uca_table(const char *filename,
823                 const char *outputDir,
824                 const char *copyright,
825                 UErrorCode *status)
826 {
827     FILE *data = fopen(filename, "r");
828     if(data == NULL) {
829         fprintf(stderr, "Couldn't open file: %s\n", filename);
830         return -1;
831     }
832     uint32_t line = 0;
833     UCAElements *element = NULL;
834     UChar variableTopValue = 0;
835     UCATableHeader *myD = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
836     /* test for NULL */
837     if(myD == NULL) {
838         *status = U_MEMORY_ALLOCATION_ERROR;
839         fclose(data);
840         return 0;
841     }
842     uprv_memset(myD, 0, sizeof(UCATableHeader));
843     UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
844     /* test for NULL */
845     if(opts == NULL) {
846         *status = U_MEMORY_ALLOCATION_ERROR;
847         uprv_free(myD);
848         fclose(data);
849         return 0;
850     }
851     uprv_memset(opts, 0, sizeof(UColOptionSet));
852     UChar contractionCEs[MAX_UCA_CONTRACTION_CES][3];
853     uprv_memset(contractionCEs, 0, sizeof(contractionCEs));
854     uint32_t noOfContractions = 0;
855     UCAConstants consts;
856     uprv_memset(&consts, 0, sizeof(consts));
857 #if 0
858     UCAConstants consts = {
859       UCOL_RESET_TOP_VALUE,
860       UCOL_FIRST_PRIMARY_IGNORABLE,
861       UCOL_LAST_PRIMARY_IGNORABLE,
862       UCOL_LAST_PRIMARY_IGNORABLE_CONT,
863       UCOL_FIRST_SECONDARY_IGNORABLE,
864       UCOL_LAST_SECONDARY_IGNORABLE,
865       UCOL_FIRST_TERTIARY_IGNORABLE,
866       UCOL_LAST_TERTIARY_IGNORABLE,
867       UCOL_FIRST_VARIABLE,
868       UCOL_LAST_VARIABLE,
869       UCOL_FIRST_NON_VARIABLE,
870       UCOL_LAST_NON_VARIABLE,
871 
872       UCOL_NEXT_TOP_VALUE,
873 /*
874       UCOL_NEXT_FIRST_PRIMARY_IGNORABLE,
875       UCOL_NEXT_LAST_PRIMARY_IGNORABLE,
876       UCOL_NEXT_FIRST_SECONDARY_IGNORABLE,
877       UCOL_NEXT_LAST_SECONDARY_IGNORABLE,
878       UCOL_NEXT_FIRST_TERTIARY_IGNORABLE,
879       UCOL_NEXT_LAST_TERTIARY_IGNORABLE,
880       UCOL_NEXT_FIRST_VARIABLE,
881       UCOL_NEXT_LAST_VARIABLE,
882 */
883 
884       PRIMARY_IMPLICIT_MIN,
885       PRIMARY_IMPLICIT_MAX
886     };
887 #endif
888 
889 
890     uprv_memset(inverseTable, 0xDA, sizeof(int32_t)*3*0xFFFF);
891 
892     opts->variableTopValue = variableTopValue;
893     opts->strength = UCOL_TERTIARY;
894     opts->frenchCollation = UCOL_OFF;
895     opts->alternateHandling = UCOL_NON_IGNORABLE; /* attribute for handling variable elements*/
896     opts->caseFirst = UCOL_OFF;         /* who goes first, lower case or uppercase */
897     opts->caseLevel = UCOL_OFF;         /* do we have an extra case level */
898     opts->normalizationMode = UCOL_OFF; /* attribute for normalization */
899     opts->hiraganaQ = UCOL_OFF; /* attribute for JIS X 4061, used only in Japanese */
900     opts->numericCollation = UCOL_OFF;
901     myD->jamoSpecial = FALSE;
902 
903     tempUCATable *t = uprv_uca_initTempTable(myD, opts, NULL, IMPLICIT_TAG, LEAD_SURROGATE_TAG, status);
904     if(U_FAILURE(*status))
905     {
906         fprintf(stderr, "Failed to init UCA temp table: %s\n", u_errorName(*status));
907         uprv_free(opts);
908         uprv_free(myD);
909         fclose(data);
910         return -1;
911     }
912 
913 #if 0
914     IMPLICIT_TAG = 9,
915 /*
916  *****************************************************************************************
917  * NON_CHARACTER FDD0 - FDEF, FFFE, FFFF, 1FFFE, 1FFFF, 2FFFE, 2FFFF,...e.g. **FFFE, **FFFF
918  ******************************************************************************************
919  */
920 #endif
921 
922 // * set to zero
923 struct {
924       UChar32 start;
925       UChar32 end;
926       int32_t value;
927     } ranges[] =
928     {
929 #if 0
930       {0xAC00, 0xD7AF, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) },  //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
931       {0xD800, 0xDBFF, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24)  },  //1 LEAD_SURROGATE_TAG,  /* D800-DBFF*/
932       {0xDC00, 0xDFFF, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) },  //2 TRAIL_SURROGATE DC00-DFFF
933       {0x3400, 0x4DB5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //3 CJK_IMPLICIT_TAG,   /* 0x3400-0x4DB5*/
934       {0x4E00, 0x9FA5, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //4 CJK_IMPLICIT_TAG,   /* 0x4E00-0x9FA5*/
935       {0xF900, 0xFA2D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //5 CJK_IMPLICIT_TAG,   /* 0xF900-0xFA2D*/
936       {0x20000, 0x2A6D6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //6 CJK_IMPLICIT_TAG,   /* 0x20000-0x2A6D6*/
937       {0x2F800, 0x2FA1D, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //7 CJK_IMPLICIT_TAG,   /* 0x2F800-0x2FA1D*/
938 #endif
939       {0xAC00, 0xD7B0, UCOL_SPECIAL_FLAG | (HANGUL_SYLLABLE_TAG << 24) },  //0 HANGUL_SYLLABLE_TAG,/* AC00-D7AF*/
940       //{0xD800, 0xDC00, UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG << 24)  },  //1 LEAD_SURROGATE_TAG,  /* D800-DBFF*/
941       {0xDC00, 0xE000, UCOL_SPECIAL_FLAG | (TRAIL_SURROGATE_TAG << 24) },  //2 TRAIL_SURROGATE DC00-DFFF
942       // Now directly handled in the collation code by the swapCJK function.
943       //{0x3400, 0x4DB6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //3 CJK_IMPLICIT_TAG,   /* 0x3400-0x4DB5*/
944       //{0x4E00, 0x9FA6, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //4 CJK_IMPLICIT_TAG,   /* 0x4E00-0x9FA5*/
945       //{0xF900, 0xFA2E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)    },  //5 CJK_IMPLICIT_TAG,   /* 0xF900-0xFA2D*/
946       //{0x20000, 0x2A6D7, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //6 CJK_IMPLICIT_TAG,   /* 0x20000-0x2A6D6*/
947       //{0x2F800, 0x2FA1E, UCOL_SPECIAL_FLAG | (CJK_IMPLICIT_TAG << 24)  },  //7 CJK_IMPLICIT_TAG,   /* 0x2F800-0x2FA1D*/
948     };
949     uint32_t i = 0;
950 
951     for(i = 0; i<sizeof(ranges)/sizeof(ranges[0]); i++) {
952       /*ucmpe32_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value); */
953       utrie_setRange32(t->mapping, ranges[i].start, ranges[i].end, ranges[i].value, TRUE);
954     }
955 
956 
957     int32_t surrogateCount = 0;
958     while(!feof(data)) {
959         if(U_FAILURE(*status)) {
960             fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
961                 *status, u_errorName(*status), (int)line, filename);
962             exit(*status);
963         }
964 
965         element = readAnElement(data, t, &consts, status);
966         line++;
967         if(VERBOSE) {
968           fprintf(stdout, "%u ", (int)line);
969         }
970         if(element != NULL) {
971             // we have read the line, now do something sensible with the read data!
972 
973             // Below stuff was taken care of in readAnElement
974             //if(element->variableTop == TRUE && variableTopValue == 0) {
975             //    t->options->variableTopValue = element->cPoints[0];
976             //}
977 
978             // if element is a contraction, we want to add it to contractions
979             if(element->cSize > 1 && element->cPoints[0] != 0xFDD0) { // this is a contraction
980               if(UTF_IS_LEAD(element->cPoints[0]) && UTF_IS_TRAIL(element->cPoints[1]) && element->cSize == 2) {
981                 surrogateCount++;
982               } else {
983                 if(noOfContractions>=MAX_UCA_CONTRACTION_CES) {
984                   fprintf(stderr,
985                           "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTION_CES in genuca.cpp. "
986                           "Exiting...\n",
987                           (int)MAX_UCA_CONTRACTION_CES);
988                   exit(*status);
989                 }
990                 contractionCEs[noOfContractions][0] = element->cPoints[0];
991                 contractionCEs[noOfContractions][1] = element->cPoints[1];
992                 if(element->cSize > 2) { // the third one
993                   contractionCEs[noOfContractions][2] = element->cPoints[2];
994                 } else {
995                   contractionCEs[noOfContractions][2] = 0;
996                 }
997                 noOfContractions++;
998               }
999             }
1000             else {
1001                 // TODO (claireho): does this work? Need more tests
1002                 // The following code is to handle the UCA pre-context rules
1003                 // for L/l with middle dot. We share the structures for contractionCombos.
1004                 // The format for pre-context character is
1005                 // contractionCEs[0]: codepoint in element->cPoints[0]
1006                 // contractionCEs[1]: '\0' to differentiate with contractions.
1007                 // contractionCEs[2]: prefix char
1008                 if (element->prefixSize>0) {
1009                     if(noOfContractions>=MAX_UCA_CONTRACTION_CES) {
1010                       fprintf(stderr,
1011                               "\nMore than %d contractions. Please increase MAX_UCA_CONTRACTION_CES in genuca.cpp. "
1012                               "Exiting...\n",
1013                               (int)MAX_UCA_CONTRACTION_CES);
1014                       exit(*status);
1015                     }
1016                     contractionCEs[noOfContractions][0]=element->cPoints[0];
1017                     contractionCEs[noOfContractions][1]='\0';
1018                     contractionCEs[noOfContractions][2]=element->prefixChars[0];
1019                     noOfContractions++;
1020                 }
1021 
1022             }
1023 
1024             /* we're first adding to inverse, because addAnElement will reverse the order */
1025             /* of code points and stuff... we don't want that to happen */
1026             addToInverse(element, status);
1027             if(!(element->cSize > 1 && element->cPoints[0] == 0xFDD0)) {
1028               uprv_uca_addAnElement(t, element, status);
1029             }
1030         }
1031     }
1032 
1033     if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
1034         fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
1035         uprv_uca_closeTempTable(t);
1036         uprv_free(opts);
1037         uprv_free(myD);
1038         fclose(data);
1039         return -1;
1040     }
1041 /*    {
1042         uint32_t trieWord = utrie_get32(t->mapping, 0xDC01, NULL);
1043     }*/
1044 
1045     if (VERBOSE) {
1046         fprintf(stdout, "\nLines read: %u\n", (int)line);
1047         fprintf(stdout, "Surrogate count: %i\n", (int)surrogateCount);
1048         fprintf(stdout, "Raw data breakdown:\n");
1049         /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
1050         fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions);
1051         fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize);
1052         fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position);
1053     }
1054 
1055 
1056     /* produce canonical closure for table */
1057     /* first set up constants for implicit calculation */
1058     uprv_uca_initImplicitConstants(status);
1059     /* do the closure */
1060     int32_t noOfClosures = uprv_uca_canonicalClosure(t, NULL, status);
1061     if(noOfClosures != 0) {
1062       fprintf(stderr, "Warning: %i canonical closures occured!\n", (int)noOfClosures);
1063     }
1064 
1065     /* test */
1066     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1067 
1068     if (VERBOSE) {
1069         fprintf(stdout, "Compacted data breakdown:\n");
1070         /*fprintf(stdout, "Compact array stage1 top: %i, stage2 top: %i\n", t->mapping->stage1Top, t->mapping->stage2Top);*/
1071         fprintf(stdout, "Number of contractions: %u\n", (int)noOfContractions);
1072         fprintf(stdout, "Contraction image size: %u\n", (int)t->image->contractionSize);
1073         fprintf(stdout, "Expansions size: %i\n", (int)t->expansions->position);
1074     }
1075 
1076     if(U_FAILURE(*status)) {
1077         fprintf(stderr, "Error creating table: %s\n", u_errorName(*status));
1078         uprv_uca_closeTempTable(t);
1079         uprv_free(opts);
1080         uprv_free(myD);
1081         fclose(data);
1082         return -1;
1083     }
1084 
1085     /* populate the version info struct with version info*/
1086     myData->version[0] = UCOL_BUILDER_VERSION;
1087     myData->version[1] = UCAVersion[0];
1088     myData->version[2] = UCAVersion[1];
1089     myData->version[3] = UCAVersion[2];
1090     /*TODO:The fractional rules version should be taken from FractionalUCA.txt*/
1091     // Removed this macro. Instead, we use the fields below
1092     //myD->version[1] = UCOL_FRACTIONAL_UCA_VERSION;
1093     //myD->UCAVersion = UCAVersion; // out of FractionalUCA.txt
1094     uprv_memcpy(myData->UCAVersion, UCAVersion, sizeof(UVersionInfo));
1095     u_getUnicodeVersion(myData->UCDVersion);
1096 
1097     writeOutData(myData, &consts, contractionCEs, noOfContractions, outputDir, copyright, status);
1098 
1099     InverseUCATableHeader *inverse = assembleInverseTable(status);
1100     uprv_memcpy(inverse->UCAVersion, UCAVersion, sizeof(UVersionInfo));
1101     writeOutInverseData(inverse, outputDir, copyright, status);
1102 
1103     uprv_uca_closeTempTable(t);
1104     uprv_free(myD);
1105     uprv_free(opts);
1106 
1107 
1108     uprv_free(myData);
1109     uprv_free(inverse);
1110     fclose(data);
1111 
1112     return 0;
1113 }
1114 
1115 #endif /* #if !UCONFIG_NO_COLLATION */
1116 
1117 static UOption options[]={
1118     UOPTION_HELP_H,              /* 0  Numbers for those who*/
1119     UOPTION_HELP_QUESTION_MARK,  /* 1   can't count. */
1120     UOPTION_COPYRIGHT,           /* 2 */
1121     UOPTION_VERSION,             /* 3 */
1122     UOPTION_DESTDIR,             /* 4 */
1123     UOPTION_SOURCEDIR,           /* 5 */
1124     UOPTION_VERBOSE,             /* 6 */
1125     UOPTION_ICUDATADIR           /* 7 */
1126     /* weiv can't count :))))) */
1127 };
1128 
main(int argc,char * argv[])1129 int main(int argc, char* argv[]) {
1130     UErrorCode status = U_ZERO_ERROR;
1131     const char* destdir = NULL;
1132     const char* srcDir = NULL;
1133     char filename[300];
1134     char *basename = NULL;
1135     const char *copyright = NULL;
1136     uprv_memset(&UCAVersion, 0, 4);
1137 
1138     U_MAIN_INIT_ARGS(argc, argv);
1139 
1140     /* preset then read command line options */
1141     options[4].value=u_getDataDirectory();
1142     options[5].value="";
1143     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
1144 
1145     /* error handling, printing usage message */
1146     if(argc<0) {
1147         fprintf(stderr,
1148             "error in command line argument \"%s\"\n",
1149             argv[-argc]);
1150     } else if(argc<2) {
1151         argc=-1;
1152     }
1153     if(options[0].doesOccur || options[1].doesOccur) {
1154         fprintf(stderr,
1155             "usage: %s [-options] file\n"
1156             "\tRead in UCA collation text data and write out the binary collation data\n"
1157             "options:\n"
1158             "\t-h or -? or --help  this usage text\n"
1159             "\t-V or --version     show a version message\n"
1160             "\t-c or --copyright   include a copyright notice\n"
1161             "\t-d or --destdir     destination directory, followed by the path\n"
1162             "\t-s or --sourcedir   source directory, followed by the path\n"
1163             "\t-v or --verbose     turn on verbose output\n"
1164             "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
1165             "\t                    followed by path, defaults to %s\n",
1166             argv[0], u_getDataDirectory());
1167         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
1168     }
1169     if(options[3].doesOccur) {
1170         fprintf(stdout, "genuca version %hu.%hu, ICU tool to read UCA text data and create UCA data tables for collation.\n",
1171 #if UCONFIG_NO_COLLATION
1172             0, 0
1173 #else
1174             UCA_FORMAT_VERSION_0, UCA_FORMAT_VERSION_1
1175 #endif
1176             );
1177         fprintf(stdout, U_COPYRIGHT_STRING"\n");
1178         exit(0);
1179     }
1180 
1181     /* get the options values */
1182     destdir = options[4].value;
1183     srcDir = options[5].value;
1184     VERBOSE = options[6].doesOccur;
1185 
1186     if (options[2].doesOccur) {
1187         copyright = U_COPYRIGHT_STRING;
1188     }
1189 
1190     if (options[7].doesOccur) {
1191         u_setDataDirectory(options[7].value);
1192     }
1193     /* Initialize ICU */
1194     u_init(&status);
1195     if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) {
1196         fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
1197             argv[0], u_errorName(status));
1198         exit(1);
1199     }
1200     status = U_ZERO_ERROR;
1201 
1202 
1203     /* prepare the filename beginning with the source dir */
1204     uprv_strcpy(filename, srcDir);
1205     basename=filename+uprv_strlen(filename);
1206 
1207     if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
1208         *basename++ = U_FILE_SEP_CHAR;
1209     }
1210 
1211     if(argc < 0) {
1212       uprv_strcpy(basename, "FractionalUCA.txt");
1213     } else {
1214       argv++;
1215       uprv_strcpy(basename, getLongPathname(*argv));
1216     }
1217 
1218 #if 0
1219     if(u_getCombiningClass(0x0053) == 0)
1220     {
1221         fprintf(stderr, "SEVERE ERROR: Normalization data is not functioning! Bailing out.  Was not able to load unorm.dat.\n");
1222         exit(1);
1223     }
1224 #endif
1225 
1226 #if UCONFIG_NO_COLLATION
1227 
1228     UNewDataMemory *pData;
1229     const char *msg;
1230 
1231     msg = "genuca writes dummy " UCA_DATA_NAME "." UCA_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
1232     fprintf(stderr, "%s\n", msg);
1233     pData = udata_create(destdir, UCA_DATA_TYPE, UCA_DATA_NAME, &dummyDataInfo,
1234                          NULL, &status);
1235     udata_writeBlock(pData, msg, strlen(msg));
1236     udata_finish(pData, &status);
1237 
1238     msg = "genuca writes dummy " INVC_DATA_NAME "." INVC_DATA_TYPE " because of UCONFIG_NO_COLLATION, see uconfig.h";
1239     fprintf(stderr, "%s\n", msg);
1240     pData = udata_create(destdir, INVC_DATA_TYPE, INVC_DATA_NAME, &dummyDataInfo,
1241                          NULL, &status);
1242     udata_writeBlock(pData, msg, strlen(msg));
1243     udata_finish(pData, &status);
1244 
1245     return (int)status;
1246 
1247 #else
1248 
1249     return write_uca_table(filename, destdir, copyright, &status);
1250 
1251 #endif
1252 }
1253 
1254 /*
1255  * Hey, Emacs, please set the following:
1256  *
1257  * Local Variables:
1258  * indent-tabs-mode: nil
1259  * End:
1260  *
1261  */
1262