• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2003, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File colprobe.cpp
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   03/18/2003  weiv        Creation.
17 *******************************************************************************
18 */
19 
20 #include "uoptions.h"
21 #include "unicode/ucol.h"
22 #include "unicode/ucoleitr.h"
23 #include "unicode/ures.h"
24 #include "unicode/uniset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uscript.h"
29 #include "unicode/locid.h"
30 #include "unicode/ucnv.h"
31 #include "uprops.h"
32 #include "hash.h"
33 #include "ucol_imp.h"
34 
35 #include "unicode/ustdio.h"
36 #include "unicode/utrans.h"
37 
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <fcntl.h>
42 
43 // unix tolower
44 #include <ctype.h>
45 // unix setlocale
46 #include <locale.h>
47 
48 #include "colprobe.h"
49 
50 #include "line.h"
51 #include "sortedlines.h"
52 #include "strengthprobe.h"
53 
54 void testWin(StrengthProbe &probe, UErrorCode &status) ;
55 
56 #if defined WIN32
57 #include <io.h>
58 #include <windows.h>
59 #include <sys/types.h>
60 #include <sys/stat.h>
61 #include <direct.h>
62 
createDir(const char * dirName)63 int createDir(const char* dirName) {
64   struct _stat myStat;
65   int result = _stat(dirName, &myStat);
66 
67   if(result == -1) {
68     result = _mkdir(dirName);
69     return result;
70   } else if(myStat.st_mode & _S_IFDIR) {
71     return 0;
72   } else {
73     return 1;
74   }
75 }
76 
77 //#elif defined POSIX
78 #else
79 #include <sys/stat.h>
80 #include <unistd.h>
81 
/**
 * Makes sure a directory called dirName exists (POSIX flavour).
 *
 * @param dirName path of the directory to look up or create
 * @return 0 when the path already is a directory, the result of mkdir()
 *         when stat() reports failure for the path, and 1 when the path
 *         exists but is not a directory
 */
int createDir(const char* dirName) {
  struct stat info;

  if(stat(dirName, &info) == -1) {
    // Nothing could be stat'ed under that name: try to create it with
    // read/write/search permission for user, group and others.
    return mkdir(dirName, S_IRWXU|S_IRWXG|S_IRWXO);
  }
  // The path exists; it is only acceptable when it is a directory.
  return S_ISDIR(info.st_mode) ? 0 : 1;
}
95 //
96 //  Stubs for Windows API functions when building on UNIXes.
97 //
98 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)99 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
100 //#else
101 //#error "Not POSIX or Windows. Won't work."
102 #endif
103 
104 #include "line.h"
105 
106 static UBool gVerbose = false;
107 static UBool gDebug = false;
108 static UBool gQuiet = false;
109 static UBool gExemplar = false;
110 
111 DWORD          gWinLCID;
112 int            gCount;
113 UCollator     *gCol;
114 UCollator     *gUCA;
115 UConverter    *utf8cnv;
116 CompareFn gComparer;
117 int       gRefNum;
118 UnicodeSet gExcludeSet;
119 UnicodeSet gRepertoire;
120 
121 const UChar separatorChar = 0x0030;
122 
123 UPrinter *logger;
124 UPrinter *debug;
125 UPrinter *tailoringBundle;
126 UPrinter *referenceBundle;
127 UPrinter *bundle;
128 FILE     *fTailoringDump;
129 FILE     *fDefaultDump;
130 
131 const char *progName = "colprobe";
132 
133 const char *gLocale = NULL;
134 int32_t platformIndex = -1;
135 int32_t gPlatformNo = 0;
136 int32_t gPlatformIndexes[10];
137 int32_t gLocaleNo = 0;
138 const char* gLocales[100];
139 UBool gRulesStdin = false;
140 const char *outputFormat = "HTML";
141 const char *outExtension = "html";
142 
// Symbolic indexes into options[]; each value is the position of the
// corresponding entry in that array.
enum {
  HELP1       = 0,
  HELP2       = 1,
  VERBOSE     = 2,
  QUIET       = 3,
  VERSION     = 4,
  ICUDATADIR  = 5,
  COPYRIGHT   = 6,
  LOCALE      = 7,
  PLATFORM    = 8,
  DEBUG       = 9,
  EXEMPLAR    = 10,
  RULESSTDIN  = 11,
  REFERENCE   = 12,
  EXCLUDESET  = 13,
  REPERTOIRE  = 14,
  INTERACTIVE = 15,
  PRINTREF    = 16,
  DIFF        = 17,
  OUTPUT      = 18
};
164 
165 UOption options[]={
166   /*0*/ UOPTION_HELP_H,
167   /*1*/ UOPTION_HELP_QUESTION_MARK,
168   /*2*/ UOPTION_VERBOSE,
169   /*3*/ UOPTION_QUIET,
170   /*4*/ UOPTION_VERSION,
171   /*5*/ UOPTION_ICUDATADIR,
172   /*6*/ UOPTION_COPYRIGHT,
173   /*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG),
174   /*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG),
175   /*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG),
176   /*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG),
177   /*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG),
178   /*12*/ UOPTION_DEF("ref", 'c', UOPT_REQUIRES_ARG),
179   /*13*/ UOPTION_DEF("excludeset", 'x', UOPT_REQUIRES_ARG),
180   /*14*/ UOPTION_DEF("repertoire", 't', UOPT_REQUIRES_ARG),
181   /*15*/ UOPTION_DEF("interactive", 'I', UOPT_NO_ARG),
182   /*16*/ UOPTION_DEF("printref", 0, UOPT_NO_ARG),
183   /*17*/ UOPTION_DEF("diff", 0, UOPT_NO_ARG),
184   /*18*/ UOPTION_DEF("output", 0, UOPT_REQUIRES_ARG)
185 };
186 
187 UChar compA[256];
188 UChar compB[256];
189 int32_t compALen = 0;
190 int32_t compBLen = 0;
191 
192 char compUTF8A[256];
193 char compUTF8B[256];
194 int32_t compUTF8ALen = 0;
195 int32_t compUTF8BLen = 0;
196 
UNIXstrcmp(const void * a,const void * b)197 int UNIXstrcmp(const void *a, const void *b) {
198   UErrorCode status = U_ZERO_ERROR;
199     gCount++;
200     int t;
201     compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status);
202     compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status);
203     compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, 256, compA, compALen, &status);
204     compUTF8A[compUTF8ALen] = 0;
205     compUTF8BLen = ucnv_fromUChars(utf8cnv, compUTF8B, 256, compB, compBLen, &status);
206     compUTF8B[compUTF8BLen] = 0;
207     t = strcoll(compUTF8A, compUTF8B);
208     return t;
209 }
210 
UNIXgetSortKey(const UChar * string,int32_t len,uint8_t * buffer,int32_t buffCapacity)211 int UNIXgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
212   UErrorCode status = U_ZERO_ERROR;
213   compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status);
214   compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, 256, compA, compALen, &status);
215   compUTF8A[compUTF8ALen] = 0;
216   return (strxfrm((char *)buffer, compUTF8A, buffCapacity)+1);
217 }
218 
219 #ifdef WIN32
Winstrcmp(const void * a,const void * b)220 int Winstrcmp(const void *a, const void *b) {
221   UErrorCode status = U_ZERO_ERROR;
222     gCount++;
223     int t;
224     //compALen = unorm_compose(compA, 256, (*(Line **)a)->name, (*(Line **)a)->len, false, 0, &status);
225     //compBLen = unorm_compose(compB, 256, (*(Line **)b)->name, (*(Line **)b)->len, false, 0, &status);
226     compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status);
227     compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status);
228     t = CompareStringW(gWinLCID,  SORT_STRINGSORT, //0,
229       compA, compALen,
230       compB, compBLen);
231 
232 /*
233     t = CompareStringW(gWinLCID, 0,
234       (*(Line **)a)->name, (*(Line **)a)->len,
235       (*(Line **)b)->name, (*(Line **)b)->len);
236 */
237     return t-2;
238 }
239 
WingetSortKey(const UChar * string,int32_t len,uint8_t * buffer,int32_t buffCapacity)240 int WingetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
241   UErrorCode status = U_ZERO_ERROR;
242   compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status);
243   return LCMapStringW(gWinLCID, LCMAP_SORTKEY | SORT_STRINGSORT, compA, compALen, (unsigned short *)buffer, buffCapacity);
244 }
245 
246 #if 0
247 int Winstrcmp(const void *a, const void *b) {
248   UErrorCode status = U_ZERO_ERROR;
249   uint8_t b1[256], b2[256];
250   int32_t b1Len, b2Len;
251   b1Len = WingetSortKey((*(Line **)a)->name, (*(Line **)a)->len, b1, 256);
252   b2Len = WingetSortKey((*(Line **)b)->name, (*(Line **)b)->len, b2, 256);
253 
254   b1[b1Len] = 0;
255   b2[b2Len] = 0;
256 
257   return strcmp((const char *)b1, (const char *)b2);
258 }
259 #endif
260 
261 #else
// Placeholder used when the file is not built for WIN32: performs no
// comparison and reports every pair of operands as equal.
int Winstrcmp(const void *a, const void *b) {
  // The parameters are only mentioned so that they count as used.
  (void)a;
  (void)b;
  return 0;
}
// Placeholder used when the file is not built for WIN32: writes nothing to
// the output buffer and reports a sort key length of 0.
int WingetSortKey(const UChar * /*string*/, int32_t /*len*/, uint8_t * /*buffer*/, int32_t /*buffCapacity*/) {
  const int noKey = 0;
  return noKey;
}
269 #endif
270 
ICUstrcmp(const void * a,const void * b)271 int ICUstrcmp(const void *a, const void *b) {
272     gCount++;
273     UCollationResult t;
274     t = ucol_strcoll(gCol,
275       (*(Line **)a)->name, (*(Line **)a)->len,
276       (*(Line **)b)->name, (*(Line **)b)->len);
277     if (t == UCOL_LESS) return -1;
278     if (t == UCOL_GREATER) return +1;
279     return 0;
280 }
281 
ICUgetSortKey(const UChar * string,int32_t len,uint8_t * buffer,int32_t buffCapacity)282 int ICUgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
283   return ucol_getSortKey(gCol, string, len, buffer, buffCapacity);
284 }
285 
286 struct {
287   const char* name;
288   CompareFn comparer;
289   GetSortKeyFn skgetter;
290 } platforms[] = {
291   { "icu", ICUstrcmp, ICUgetSortKey },
292   { "w2k", Winstrcmp, WingetSortKey},
293   { "winxp", Winstrcmp, WingetSortKey},
294   { "aix", UNIXstrcmp, UNIXgetSortKey},
295   { "linux", UNIXstrcmp, UNIXgetSortKey}
296 };
297 
298 
/**
 * Converts a NUL-terminated string to lower case in place, using tolower()
 * of the current C locale.
 *
 * @param string text to convert; a NULL pointer is accepted and ignored
 */
void stringToLower(char *string) {
  if(string == NULL) {
    return;
  }
  // Walk up to the terminator instead of re-evaluating strlen() on every
  // iteration.
  for(char *p = string; *p != '\0'; ++p) {
    // tolower() is only defined for EOF and unsigned char values, so plain
    // (possibly negative) char values must be converted first.
    *p = (char)tolower((unsigned char)*p);
  }
}
305 
usage(const char * name)306 void usage(const char *name) {
307   logger->log("Usage: %s --locale loc_name --platform platform\n", name);
308 }
309 
listKnownPlatforms()310 void listKnownPlatforms() {
311   uint32_t i = 0;
312   logger->log("Known platforms:\n");
313   for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
314     logger->log("\t%s\n", platforms[i]);
315   }
316 }
317 
addPlatform(const char * platform)318 void addPlatform(const char *platform) {
319   uint32_t i;
320   //stringToLower(platform);
321   int32_t oldPlatformNo = gPlatformNo;
322 
323   for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
324     if(strcmp(platform, platforms[i].name) == 0) {
325       gPlatformIndexes[gPlatformNo++] = i;
326     }
327   }
328   if(gPlatformNo == oldPlatformNo) {
329     logger->log("Unknown platform %s\n", platform);
330     listKnownPlatforms();
331   }
332 }
333 
processArgs(int argc,char * argv[],UErrorCode & status)334 void processArgs(int argc, char* argv[], UErrorCode &status)
335 {
336   int32_t i = 0;
337   U_MAIN_INIT_ARGS(argc, argv);
338 
339   argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
340 
341   if(argc < 0) {
342     logger->log("Unknown option: %s\n", argv[-argc]);
343     usage(progName);
344     return;
345   }
346 
347   if(options[0].doesOccur || options[1].doesOccur) {
348     usage(progName);
349     return;
350   }
351   if(options[VERBOSE].doesOccur) {
352     gVerbose = true;
353   }
354   if(options[DEBUG].doesOccur) {
355     gDebug = true;
356     gVerbose = true;
357   }
358   if(options[EXEMPLAR].doesOccur) {
359     gExemplar = true;
360   }
361   if(options[QUIET].doesOccur) {
362     gQuiet = true;
363   }
364 
365   // ASCII based options specified on the command line
366   // this is for testing purposes, will allow to load
367   // up ICU rules and then poke through them.
368   // In that case, we test only ICU and don't need
369   // a locale.
370   if(options[RULESSTDIN].doesOccur) {
371     gRulesStdin = true;
372     addPlatform("icu");
373     return;
374   }
375 
376   if(options[LOCALE].doesOccur) {
377     gLocale = options[LOCALE].value;
378   } else {
379     gLocale = argv[1];
380     //for(i = 1; i < argc; i++) {
381     //gLocales[gLocaleNo++] = argv[i];
382     //}
383   }
384 
385   if(options[PLATFORM].doesOccur) {
386     addPlatform(options[PLATFORM].value);
387   } else { // there is a list of platforms
388     addPlatform("icu");
389   }
390 
391   if(options[REFERENCE].doesOccur) {
392     for(i = 0; i < (int32_t)(sizeof(platforms)/sizeof(platforms[0])); i++) {
393       if(strcmp(options[REFERENCE].value, platforms[i].name) == 0) {
394         gRefNum = i;
395         break;
396       }
397     }
398     if(i == sizeof(platforms)/sizeof(platforms[0])) {
399       logger->log("Unknown reference %s!\n", options[REFERENCE].value);
400       status = U_ILLEGAL_ARGUMENT_ERROR;
401       return;
402     }
403   } else {
404     gRefNum = 0;
405   }
406 
407   if(options[EXCLUDESET].doesOccur) {
408     gExcludeSet.applyPattern(UnicodeString(options[EXCLUDESET].value), status);
409     if(U_FAILURE(status)) {
410       logger->log("Cannot construct exclude set from argument %s. Error %s\n", options[EXCLUDESET].value, u_errorName(status));
411       return;
412     } else {
413       UnicodeString pattern;
414       logger->log(gExcludeSet.toPattern(pattern, true), true);
415     }
416   }
417 
418   if(options[REPERTOIRE].doesOccur)  {
419     gRepertoire.applyPattern(UnicodeString(options[REPERTOIRE].value), status);
420     if(U_FAILURE(status)) {
421       logger->log("Cannot construct repertoire from argument %s. Error %s\n", options[REPERTOIRE].value, u_errorName(status));
422       return;
423     }
424   }
425 
426   if(options[OUTPUT].doesOccur) {
427     outputFormat = options[OUTPUT].value;
428     if(strcmp(outputFormat, "HTML") == 0) {
429       outExtension = "html";
430     } else if(strcmp(outputFormat, "XML") == 0) {
431       outExtension = "xml";
432     } else {
433       outExtension = "txt";
434     }
435   }
436 
437 }
438 
439 // Check whether upper case comes before lower case or vice-versa
440 int32_t
checkCaseOrdering(void)441 checkCaseOrdering(void) {
442   UChar stuff[][3] = {
443     { 0x0061, separatorChar, 0x0061}, //"aa",
444     { 0x0061, separatorChar, 0x0041 }, //"a\\u00E0",
445     { 0x0041, separatorChar, 0x0061 }, //"\\u00E0a",
446     { 0x0041, separatorChar, 0x0041 }, //"\\u00E0a",
447     //{ 0x00E0, separatorChar, 0x00E0 }  //"\\u00E0\\u00E0"
448   };
449   const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
450 
451   Line **sortedLines = new Line*[size];
452   Line lines[size];
453 
454   int32_t i = 0;
455   int32_t ordered = 0, reversed = 0;
456 
457   for(i = 0; i < size; i++) {
458     lines[i].setName(stuff[i], 3);
459   }
460   //setArray(sortedLines, lines, size);
461   qsort(sortedLines, size, sizeof(Line*), gComparer);
462 
463   for(i = 0; i < size; i++) {
464     if(*(sortedLines+i) == &lines[i]) {
465       ordered++;
466     }
467     if(*(sortedLines+i) == &lines[size-i-1]) {
468       reversed++;
469     }
470   }
471 
472   delete[] sortedLines;
473   if(ordered == size) {
474     return 0; // in normal order
475   } else if(reversed == size) {
476     return 1; // in reversed order
477   } else {
478     return -1; // unknown order
479   }
480 }
481 
482 void
getExemplars(const char * locale,UnicodeSet & exemplars,UErrorCode & status)483 getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) {
484   // first we fill out structures with exemplar characters.
485   UResourceBundle *res = ures_open(NULL, locale, &status);
486   UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status);
487   exemplars.clear();
488   exemplars.applyPattern(exemplarString, status);
489   ures_close(res);
490 }
491 
492 
493 void
getFileNames(const char * name,char * tailoringName,char * tailoringDumpName,char * defaultName,char * defaultDumpName,char * diffName)494 getFileNames(const char *name, char *tailoringName, char *tailoringDumpName, char *defaultName, char *defaultDumpName, char *diffName) {
495   if(tailoringName) {
496     strcpy(tailoringName, platforms[gPlatformIndexes[0]].name);
497     strcat(tailoringName, "/");
498     strcat(tailoringName, name);
499     strcat(tailoringName, "_raw.");
500     strcat(tailoringName, outExtension);
501   }
502   if(tailoringDumpName) {
503     strcpy(tailoringDumpName, platforms[gPlatformIndexes[0]].name);
504     strcat(tailoringDumpName, "/");
505     strcat(tailoringDumpName, name);
506     strcat(tailoringDumpName, ".dump");
507   }
508 
509   if(diffName) {
510     strcpy(diffName, platforms[gPlatformIndexes[0]].name);
511     strcat(diffName, "/");
512     strcat(diffName, name);
513     strcat(diffName, "_collation.");
514     strcat(diffName, outExtension);
515   }
516 
517   if(defaultName) {
518     strcpy(defaultName, platforms[gRefNum].name);
519     strcat(defaultName, "/");
520     strcat(defaultName, name);
521     strcat(defaultName, "_default_raw.");
522     strcat(defaultName, outExtension);
523   }
524 
525   if(defaultDumpName) {
526     strcpy(defaultDumpName, platforms[gRefNum].name);
527     strcat(defaultDumpName, "/");
528     strcat(defaultDumpName, name);
529     strcat(defaultDumpName, "_default.dump");
530   }
531 }
532 
533 void
setFiles(const char * name,UErrorCode & status)534 setFiles(const char *name, UErrorCode &status) {
535   if(U_FAILURE(status)) {
536     return;
537   }
538   int32_t i = 0;
539   char tailoringName[256];
540   char tailoringDumpName[256];
541   char defaultName[256];
542   char defaultDumpName[256];
543   char diffName[256];
544 
545   getFileNames(name, tailoringName, tailoringDumpName, defaultName, defaultDumpName, diffName);
546   if(options[PLATFORM].doesOccur && !options[DIFF].doesOccur) {
547     if(createDir(platforms[gPlatformIndexes[0]].name) == 0) {
548       tailoringBundle = new UPrinter(tailoringName, "en", "utf-8", NULL, false);
549       fTailoringDump = fopen(tailoringDumpName, "wb");
550     } else {
551       status = U_FILE_ACCESS_ERROR;
552       return;
553     }
554   }
555 
556   if(options[REFERENCE].doesOccur && !options[DIFF].doesOccur) {
557     if(createDir(platforms[gRefNum].name) == 0) {
558       referenceBundle = new UPrinter(defaultName, "en", "utf-8", NULL, false);
559       fDefaultDump = fopen(defaultDumpName, "wb");
560     } else {
561       status = U_FILE_ACCESS_ERROR;
562       return;
563     }
564   }
565 
566   if((options[PLATFORM].doesOccur && options[REFERENCE].doesOccur) || options[DIFF].doesOccur) {
567     if(createDir(platforms[gPlatformIndexes[0]].name) == 0) {
568       bundle = new UPrinter(diffName, "en", "utf-8", NULL, false);
569     }
570   }
571   if(options[DIFF].doesOccur) {
572     fTailoringDump = fopen(tailoringDumpName, "rb");
573     fDefaultDump = fopen(defaultDumpName, "rb");
574   }
575 }
576 
577 
578 UErrorCode status = U_ZERO_ERROR;
579 static UnicodeSet UNASSIGNED(UnicodeString("[:Cn:]"), status);
580 static UnicodeSet GENERAL_ACCENTS(UnicodeString("[[:block=Combining Diacritical Marks:]-[:Cn:]]"), status);
581 //static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]-[:L:]-[:N:]]"), status);
582 static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]]"), status);
583 static UnicodeSet ALPHABETIC(UnicodeString("[:alphabetic:]"), status);
584 //static UnicodeSet CONTROL(UnicodeString("[[:control:][\\u0000-\\u002F]]"), status);
585 static UnicodeSet BMP(UnicodeString("[\\u0000-\\uFFFF]"), status);
586 
587 static UnicodeSet CONTROL(UnicodeString("[:control:]"), status);
588 
589 UCollator *
setLocale(const char * locale,UErrorCode & status)590 setLocale(const char* locale, UErrorCode &status)
591 {
592   gWinLCID = uloc_getLCID(locale);
593   setlocale(LC_COLLATE, locale);
594 
595   if(gCol) {
596     ucol_close(gCol);
597   }
598   gCol = ucol_open(locale, &status);
599   ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
600   //ucol_setAttribute(col, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
601   //ucol_setAttribute(col, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
602 
603   return gCol;
604 }
605 
606 
607 
608 UCollator *
setReference(UErrorCode & status)609 setReference(UErrorCode &status)
610 {
611   gWinLCID = uloc_getLCID("en");
612   setlocale(LC_COLLATE, "en_US.UTF-8");
613   if(gCol) {
614     ucol_close(gCol);
615   }
616   gCol = ucol_open("root", &status);
617   ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
618   return gCol;
619 }
620 
/**
 * Interactive mode: reads standard input line by line until fgets() reports
 * end of input or an error. The lines are read and discarded; no command is
 * acted upon.
 */
void
processInteractive() {
  char command[256];

  while(fgets(command, 256, stdin) != NULL) {
    // nothing is done with the command
  }
}
628 
629 UChar probeChars[][4] = {
630   { 0x0061, 0x0062, 0x00E1, 0x0041 }, // latin with a-grave
631   { 0x0041, 0x0042, 0x00C1, 0x0061 }, // upper first
632   { 0x006E, 0x006F, 0x00F1, 0x004E }, // latin with n-tilda
633   { 0x004E, 0x004F, 0x00D1, 0x006E }, // upper first
634   { 0x0433, 0x0493, 0x0491, 0x0413 }, // Cyrillic
635   { 0x0413, 0x0492, 0x0490, 0x0433 }, // upper first
636   { 0x3045, 0x3047, 0x3094, 0x3046 }  // Hiragana/Katakana (last resort)
637 
638 };
639 
640 void
processCollator(UCollator * col,UErrorCode & status)641 processCollator(UCollator *col, UErrorCode &status) {
642   int32_t i = 0;
643   uint32_t j = 0;
644   gCol = col;
645   UChar ruleString[16384];
646   char myLoc[256];
647 
648   int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384);
649   logger->log(UnicodeString(ruleString, ruleStringLength), true);
650   const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status);
651   if(locale == NULL) {
652     locale = "en";
653   }
654   strcpy(myLoc, locale);
655   UnicodeSet exemplarUSet;
656   UnicodeSet RefRepertoire;
657 
658   UnicodeSet tailored;
659 
660   tailored = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status));
661   tailored.removeAll(CONTROL);
662 
663 
664   UnicodeString pattern;
665   int sanityResult;
666 
667   UnicodeSet hanSet;
668   UBool hanAppears = false;
669 
670   debug->log("\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[0]].name);
671   gComparer = platforms[gPlatformIndexes[0]].comparer;
672 
673   StrengthProbe probe(platforms[gPlatformIndexes[0]].comparer, platforms[gPlatformIndexes[0]].skgetter, 0x0030, probeChars[0][0], probeChars[0][1], probeChars[0][2], probeChars[0][3]);
674   sanityResult = probe.checkSanity();
675   j = 0;
676   while(sanityResult && j+1 < sizeof(probeChars)/sizeof(probeChars[0])) {
677    j++;
678    sanityResult =  probe.setProbeChars(probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]);
679   }
680   if(sanityResult) {
681     logger->log("Bad choice of probe characters! Sanity returned %i. Exiting\n", sanityResult, sanityResult);
682     return;
683   }
684   logger->log("Probe chars: %C, %C, %C, %C\n", probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]);
685 
686   debug->off();
687 
688   if(gRepertoire.size()) {
689     exemplarUSet = gRepertoire;
690   } else {
691     generateRepertoire(locale, exemplarUSet, hanAppears, status);
692   }
693   exemplarUSet.addAll(tailored);
694   hanSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
695   exemplarUSet.removeAll(hanSet);
696 
697   logger->log(exemplarUSet.toPattern(pattern, true), true);
698 
699   exemplarUSet = flatten(exemplarUSet, status);
700   logger->log(exemplarUSet.toPattern(pattern, true), true);
701 
702   if(!options[PRINTREF].doesOccur) {
703 
704     logger->log("\n*** Detecting ordering for the locale\n\n");
705 
706     debug->on();
707     SortedLines lines(exemplarUSet, gExcludeSet, probe, logger, debug);
708     lines.analyse(status);
709     lines.calculateSortKeys();
710     debug->log("\n*** Final order\n\n");
711     debug->log(lines.toPrettyString(true, true), true);
712     lines.toFile(fTailoringDump, true, status);
713     tailoringBundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, true, true, hanAppears), true);
714     //debug->off();
715 
716     if(options[REFERENCE].doesOccur) {
717       status = U_ZERO_ERROR;
718       lines.getRepertoire(RefRepertoire);
719       setReference(status);
720 
721       logger->log(exemplarUSet.toPattern(pattern, true), true);
722       logger->log(RefRepertoire.toPattern(pattern, true), true);
723 
724       StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter);
725       logger->log("\n*** Detecting ordering for reference\n\n");
726       SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug);
727       RefLines.analyse(status);
728       referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, true, true, false), true);
729       RefLines.toFile(fDefaultDump, true, status);
730 
731       lines.reduceDifference(RefLines);
732       logger->log("\n*** Final rules\n\n");
733       logger->log(lines.toPrettyString(true), true);
734       bundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, true, true, hanAppears), true);
735     }
736   } else {
737     setReference(status);
738     StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter);
739     logger->log("\n*** Detecting ordering for reference\n\n");
740     SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug);
741     RefLines.analyse(status);
742     logger->log(RefLines.toPrettyString(true), true);
743     referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, true, true, false), true);
744   }
745   if(hanAppears) {
746     // there are Han characters. This is a huge block. The best we can do is to just sort it, compare to empty
747     // and spit it out. Anything else would be a suicide (actually is - kernel just kills you :)
748     logger->log("\n*** Detecting order for Han\n");
749     debug->off();
750     setLocale(gLocale, status);
751     exemplarUSet.clear();
752     exemplarUSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
753     exemplarUSet = flatten(exemplarUSet, status);
754     SortedLines han(exemplarUSet, gExcludeSet, probe, logger, debug);
755     han.sort(true, true);
756     han.classifyRepertoire();
757     han.getBounds(status);
758     tailoringBundle->log("Han ordering:<br>\n");
759     tailoringBundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, true, false, false), true);
760     bundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, true, false, false), true);
761   }
762   ucol_close(gCol);
763 }
764 
765 void
processLocale(const char * locale,UErrorCode & status)766 processLocale(const char *locale, UErrorCode &status) {
767   setLocale(locale, status);
768   setFiles(locale, status);
769   if(U_FAILURE(status)) {
770     return;
771   }
772 
773   debug->log("Locale %s (LCID:%06X, unix:%s)\n", locale, gWinLCID, setlocale(LC_COLLATE, NULL));
774   tailoringBundle->log("// Ordering for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n",
775     locale, gWinLCID, setlocale(LC_COLLATE, NULL),
776     platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name);
777   if(options[REFERENCE].doesOccur) {
778     referenceBundle->log("// Reference for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n",
779       locale, gWinLCID, setlocale(LC_COLLATE, NULL),
780       platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name);
781   }
782 
783 
784   processCollator(gCol, status);
785 }
786 
787 
788 
789 UBool
hasCollationElements(const char * locName)790 hasCollationElements(const char *locName) {
791 
792   UErrorCode status = U_ZERO_ERROR;
793   UResourceBundle *ColEl = NULL;
794 
795   UResourceBundle *loc = ures_open(NULL, locName, &status);;
796 
797   if(U_SUCCESS(status)) {
798     status = U_ZERO_ERROR;
799     ColEl = ures_getByKey(loc, "CollationElements", ColEl, &status);
800     if(status == U_ZERO_ERROR) { /* do the test - there are real elements */
801       ures_close(ColEl);
802       ures_close(loc);
803       return true;
804     }
805     ures_close(ColEl);
806     ures_close(loc);
807   }
808   return false;
809 }
810 
811 int
main(int argc,char * argv[])812 main(int argc,
813      char* argv[])
814 {
815   UErrorCode status = U_ZERO_ERROR;
816   logger = new UPrinter(stdout, "en", "latin-1");
817   debug =  new UPrinter(stderr, "en", "latin-1");
818 
819 /*
820   USet *wsp = uprv_openRuleWhiteSpaceSet(&status);
821   uset_add(wsp, 0x0041);
822   uset_remove(wsp, 0x0041);
823   UnicodeString pat;
824   ((UnicodeSet *)wsp)->toPattern(pat, true);
825   pat.setCharAt(pat.length(), 0);
826   escapeString(pat.getBuffer(), pat.length(), log);
827   u_fflush(log);
828 */
829 
830   processArgs(argc, argv, status);
831   int32_t i = 0;
832 
833 
834 
835   if(U_FAILURE(status) || gPlatformNo == 0) {
836     return -1;
837   }
838 
839   utf8cnv = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
840   gUCA = ucol_open("root", &status);
841 
842   if(options[INTERACTIVE].doesOccur) {
843     processInteractive();
844   } else {
845     if(gRulesStdin) {
846       char buffer[1024];
847       UChar ruleBuffer[16384];
848       UChar *rules = ruleBuffer;
849       int32_t maxRuleLen = 16384;
850       int32_t rLen = 0;
851       while(fgets(buffer, 1024, stdin)) {
852         if(buffer[0] != '/' && buffer[1] != '/') {
853           rLen = u_unescape(buffer, rules, maxRuleLen);
854           rules += rLen;
855           maxRuleLen -= rLen;
856         }
857       }
858       UParseError parseError;
859       //escapeString(ruleBuffer, rules-ruleBuffer, log);//
860       debug->log("%U\n", ruleBuffer);
861 
862       UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status);
863       if(U_SUCCESS(status)) {
864         setFiles("stdinRules", status);
865         processCollator(col, status);
866       } else {
867         logger->log("Error %s\n", u_errorName(status));
868       }
869     } else if(options[DIFF].doesOccur) {
870       logger->log("Diffing two dumps\n");
871       // must have locale, platform and ref in order to be
872       // able to find dump files.
873       setFiles(gLocale, status);
874 
875       if(fTailoringDump && fDefaultDump) {
876 	    SortedLines tailoring(fTailoringDump, logger, debug, status);
877 	    logger->log(tailoring.toString(true), true);
878 	    SortedLines reference(fDefaultDump, logger, debug, status);
879 	    logger->log(reference.toString(true), true);
880 	    tailoring.reduceDifference(reference);
881 	    logger->log("\n*** Final rules\n\n");
882 	    logger->log(tailoring.toPrettyString(true), true);
883 	    //result->log(lines.toPrettyString(true), true);
884 	    bundle->log(tailoring.toOutput(outputFormat, gLocale, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, true, true, false), true);
885       }
886 
887     } else {
888       if(gLocale) {
889         processLocale(gLocale, status);
890       } else if(gLocaleNo) {
891         for(i = 0; i < gLocaleNo; i++) {
892           processLocale(gLocales[i], status);
893         }
894       } else { // do the loop through all the locales
895         int32_t noOfLoc = uloc_countAvailable();
896         const char *locName = NULL;
897         for(i = 0; i<noOfLoc; i++) {
898           status = U_ZERO_ERROR;
899           locName = uloc_getAvailable(i);
900           if(hasCollationElements(locName)) {
901             processLocale(locName, status);
902           }
903         }
904       }
905     }
906   }
907 
908 
909   ucol_close(gUCA);
910   ucnv_close(utf8cnv);
911 
912   delete logger;
913   delete debug;
914   if(tailoringBundle) {
915     delete tailoringBundle;
916   }
917   if(referenceBundle) {
918     delete referenceBundle;
919   }
920   if(bundle) {
921     delete bundle;
922   }
923   if(fTailoringDump) {
924     fclose(fTailoringDump);
925   }
926   if(fDefaultDump) {
927     fclose(fDefaultDump);
928   }
929   return 0;
930 }
931 
932 
propertyAndValueName(UProperty prop,int32_t i)933 UnicodeString propertyAndValueName(UProperty prop, int32_t i) {
934   UnicodeString result;
935   result.append(u_getPropertyName(prop, U_LONG_PROPERTY_NAME));
936   result.append("=");
937   result.append(u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME));
938 
939     //+ "(" + prop + "," + i + ") ";
940   return result;
941 }
942 
943 
generateRepertoire(const char * locale,UnicodeSet & rep,UBool & hanAppears,UErrorCode & status)944 void generateRepertoire(const char *locale, UnicodeSet &rep, UBool &hanAppears, UErrorCode &status) {
945     UnicodeString dispName;
946     debug->log("Getting repertoire for %s\n", locale);
947     tailoringBundle->log("// Scripts in repertoire: ");
948     if(options[REFERENCE].doesOccur) {
949       referenceBundle->log("// Scripts in repertoire: ");
950     }
951 	rep.clear();
952     UnicodeSet delta;
953 
954     UScriptCode script[256];
955     int32_t i = 0;
956     // now add the scripts for the locale
957     UProperty prop = UCHAR_SCRIPT;
958 	int32_t scriptLength = uscript_getCode(locale, script, 256, &status);
959     if(scriptLength) {
960 	  for (i = 0; i < scriptLength; ++i) {
961         if(script[i] == USCRIPT_HAN) {
962           hanAppears = true;
963           continue;
964         }
965         delta.applyIntPropertyValue(prop, script[i], status);
966         debug->log("Adding ");
967         debug->log(propertyAndValueName(prop, script[i]), true);
968         tailoringBundle->log("// ");
969         tailoringBundle->log(propertyAndValueName(prop, script[i]), true);
970         if(options[REFERENCE].doesOccur) {
971           referenceBundle->log("// ");
972           referenceBundle->log(propertyAndValueName(prop, script[i]), true);
973         }
974 		rep.addAll(delta);
975 	  }
976     } else {
977       delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_LATIN, status);
978       rep.addAll(delta);
979     }
980 
981     // now see which blocks those overlap, and add
982     prop = UCHAR_BLOCK;
983     int32_t min = u_getIntPropertyMinValue(prop);
984     int32_t max = u_getIntPropertyMaxValue(prop);
985     UnicodeSet checkDelta;
986     for (i = min; i <= max; ++i) {
987         // skip certain blocks
988         const char *name = u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME);
989         if (strcmp(name, "Superscripts_and_Subscripts") == 0
990         || strcmp(name, "Letterlike_Symbols") == 0
991         || strcmp(name, "Alphabetic_Presentation_Forms") == 0
992         || strcmp(name, "Halfwidth_and_Fullwidth_Forms") == 0) continue;
993 
994         delta.applyIntPropertyValue(prop, i, status).removeAll(UNASSIGNED);
995         if (!rep.containsSome(delta)) continue;
996         if (rep.containsAll(delta)) continue; // just to see what we are adding
997         debug->log("Adding ");
998         debug->log(propertyAndValueName(prop, i), true);
999         tailoringBundle->log("// ");
1000         tailoringBundle->log(propertyAndValueName(prop, i), true);
1001         if(options[REFERENCE].doesOccur) {
1002           referenceBundle->log("// ");
1003           referenceBundle->log(propertyAndValueName(prop, i), true);
1004         }
1005         rep.addAll(delta);
1006     }
1007 
1008     // add ASCII and general accents
1009     rep.addAll(GENERAL_ACCENTS).addAll(ASCII_BASE);
1010     rep.removeAll(CONTROL);
1011     //delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
1012     //rep.removeAll(delta);
1013 
1014     // now add the exemplar characters
1015     // can't get at them from Java right now
1016     tailoringBundle->log("<br>\n");
1017     if(options[REFERENCE].doesOccur) {
1018       referenceBundle->log("<br>\n");
1019     }
1020 }
1021 
// Returns the set of all code points that occur in the NFD or NFKD form of
// any element (single code point or string) of source. status is passed to
// every Normalizer::normalize() call and is not otherwise inspected here.
UnicodeSet flatten(const UnicodeSet &source, UErrorCode &status) {
    UnicodeSet flattened;
    UnicodeString nfd, nfkd, single;
    for (UnicodeSetIterator iter(source); iter.next(); ) {
        // Pick the text to normalize: the string element itself, or a
        // one-code-point string built from the current code point.
        const UnicodeString *input;
        if (iter.isString()) {
          input = &iter.getString();
        } else {
          single.setTo(iter.getCodepoint());
          input = &single;
        }
        Normalizer::normalize(*input, UNORM_NFD, 0, nfd, status);
        Normalizer::normalize(*input, UNORM_NFKD, 0, nfkd, status);
        flattened.addAll(nfd);
        flattened.addAll(nfkd);
    }
    return flattened;
}
1041 
1042 
testWin(StrengthProbe & probe,UErrorCode & status)1043 void testWin(StrengthProbe &probe, UErrorCode &status)
1044 {
1045   UnicodeSet trailings(UnicodeString("[\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651]"), status);
1046   char intChar[] = "\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651";
1047   UChar interesting[256];
1048   int32_t intLen = u_unescape(intChar, interesting, 256);
1049   UChar i = 0;
1050   UChar j = 0,  k = 0;
1051   int32_t count;
1052   Line myCh, combo, trial, inter, kLine;
1053   for(i = 0; i < intLen; i++) {
1054     inter.setTo(interesting[i]);
1055     logger->log(inter.toString(true), true);
1056     logger->log("----------------------\n");
1057     for(j = 0; j < 0xFFFF; j++) {
1058       myCh.setTo(j);
1059       if(probe.distanceFromEmptyString(myCh) == UCOL_IDENTICAL) {
1060         continue;
1061       }
1062       logger->log(myCh.toString(true));
1063       combo.setTo(j);
1064       combo.append(interesting[i]);
1065       count = 0;
1066       for(k = 0; k < 0xFFFF; k++) {
1067         kLine.setTo(k);
1068         trial.setTo(j);
1069         trial.append(k);
1070         if(probe.compare(kLine, inter) < 0) {
1071           if(probe.compare(trial, combo) >= 0) {
1072             count++;
1073           }
1074         }
1075       }
1076       logger->log("%i %i\n", count, count);
1077     }
1078   }
1079 }
1080 
1081