1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File colprobe.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 03/18/2003 weiv Creation.
17 *******************************************************************************
18 */
19
20 #include "uoptions.h"
21 #include "unicode/ucol.h"
22 #include "unicode/ucoleitr.h"
23 #include "unicode/ures.h"
24 #include "unicode/uniset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uscript.h"
29 #include "unicode/locid.h"
30 #include "unicode/ucnv.h"
31 #include "uprops.h"
32 #include "hash.h"
33 #include "ucol_imp.h"
34
35 #include "unicode/ustdio.h"
36 #include "unicode/utrans.h"
37
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <fcntl.h>
42
43 // unix tolower
44 #include <ctype.h>
45 // unix setlocale
46 #include <locale.h>
47
48 #include "colprobe.h"
49
50 #include "line.h"
51 #include "sortedlines.h"
52 #include "strengthprobe.h"
53
54 void testWin(StrengthProbe &probe, UErrorCode &status) ;
55
56 #if defined WIN32
57 #include <io.h>
58 #include <windows.h>
59 #include <sys/types.h>
60 #include <sys/stat.h>
61 #include <direct.h>
62
createDir(const char * dirName)63 int createDir(const char* dirName) {
64 struct _stat myStat;
65 int result = _stat(dirName, &myStat);
66
67 if(result == -1) {
68 result = _mkdir(dirName);
69 return result;
70 } else if(myStat.st_mode & _S_IFDIR) {
71 return 0;
72 } else {
73 return 1;
74 }
75 }
76
77 //#elif defined POSIX
78 #else
79 #include <sys/stat.h>
80 #include <unistd.h>
81
/**
 * Makes sure a directory called dirName exists (POSIX flavour).
 *
 * @param dirName path of the directory to look up or create
 * @return 0 when the path already names a directory; the result of mkdir()
 *         (requested mode rwxrwxrwx) when the path could not be stat'ed;
 *         1 when the path exists but is not a directory
 */
int createDir(const char* dirName) {
    struct stat info;

    if (stat(dirName, &info) == -1) {
        // Nothing could be stat'ed under that name: try to create it.
        return mkdir(dirName, S_IRWXU | S_IRWXG | S_IRWXO);
    }
    if (S_ISDIR(info.st_mode)) {
        return 0;
    }
    return 1;
}
95 //
96 // Stubs for Windows API functions when building on UNIXes.
97 //
98 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)99 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
100 //#else
101 //#error "Not POSIX or Windows. Won't work."
102 #endif
103
104 #include "line.h"
105
106 static UBool gVerbose = false;
107 static UBool gDebug = false;
108 static UBool gQuiet = false;
109 static UBool gExemplar = false;
110
111 DWORD gWinLCID;
112 int gCount;
113 UCollator *gCol;
114 UCollator *gUCA;
115 UConverter *utf8cnv;
116 CompareFn gComparer;
117 int gRefNum;
118 UnicodeSet gExcludeSet;
119 UnicodeSet gRepertoire;
120
121 const UChar separatorChar = 0x0030;
122
123 UPrinter *logger;
124 UPrinter *debug;
125 UPrinter *tailoringBundle;
126 UPrinter *referenceBundle;
127 UPrinter *bundle;
128 FILE *fTailoringDump;
129 FILE *fDefaultDump;
130
131 const char *progName = "colprobe";
132
133 const char *gLocale = NULL;
134 int32_t platformIndex = -1;
135 int32_t gPlatformNo = 0;
136 int32_t gPlatformIndexes[10];
137 int32_t gLocaleNo = 0;
138 const char* gLocales[100];
139 UBool gRulesStdin = false;
140 const char *outputFormat = "HTML";
141 const char *outExtension = "html";
142
143 enum {
144 HELP1,
145 HELP2,
146 VERBOSE,
147 QUIET,
148 VERSION,
149 ICUDATADIR,
150 COPYRIGHT,
151 LOCALE,
152 PLATFORM,
153 DEBUG,
154 EXEMPLAR,
155 RULESSTDIN,
156 REFERENCE,
157 EXCLUDESET,
158 REPERTOIRE,
159 INTERACTIVE,
160 PRINTREF,
161 DIFF,
162 OUTPUT
163 };
164
165 UOption options[]={
166 /*0*/ UOPTION_HELP_H,
167 /*1*/ UOPTION_HELP_QUESTION_MARK,
168 /*2*/ UOPTION_VERBOSE,
169 /*3*/ UOPTION_QUIET,
170 /*4*/ UOPTION_VERSION,
171 /*5*/ UOPTION_ICUDATADIR,
172 /*6*/ UOPTION_COPYRIGHT,
173 /*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG),
174 /*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG),
175 /*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG),
176 /*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG),
177 /*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG),
178 /*12*/ UOPTION_DEF("ref", 'c', UOPT_REQUIRES_ARG),
179 /*13*/ UOPTION_DEF("excludeset", 'x', UOPT_REQUIRES_ARG),
180 /*14*/ UOPTION_DEF("repertoire", 't', UOPT_REQUIRES_ARG),
181 /*15*/ UOPTION_DEF("interactive", 'I', UOPT_NO_ARG),
182 /*16*/ UOPTION_DEF("printref", 0, UOPT_NO_ARG),
183 /*17*/ UOPTION_DEF("diff", 0, UOPT_NO_ARG),
184 /*18*/ UOPTION_DEF("output", 0, UOPT_REQUIRES_ARG)
185 };
186
187 UChar compA[256];
188 UChar compB[256];
189 int32_t compALen = 0;
190 int32_t compBLen = 0;
191
192 char compUTF8A[256];
193 char compUTF8B[256];
194 int32_t compUTF8ALen = 0;
195 int32_t compUTF8BLen = 0;
196
UNIXstrcmp(const void * a,const void * b)197 int UNIXstrcmp(const void *a, const void *b) {
198 UErrorCode status = U_ZERO_ERROR;
199 gCount++;
200 int t;
201 compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status);
202 compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status);
203 compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, 256, compA, compALen, &status);
204 compUTF8A[compUTF8ALen] = 0;
205 compUTF8BLen = ucnv_fromUChars(utf8cnv, compUTF8B, 256, compB, compBLen, &status);
206 compUTF8B[compUTF8BLen] = 0;
207 t = strcoll(compUTF8A, compUTF8B);
208 return t;
209 }
210
UNIXgetSortKey(const UChar * string,int32_t len,uint8_t * buffer,int32_t buffCapacity)211 int UNIXgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
212 UErrorCode status = U_ZERO_ERROR;
213 compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status);
214 compUTF8ALen = ucnv_fromUChars(utf8cnv, compUTF8A, 256, compA, compALen, &status);
215 compUTF8A[compUTF8ALen] = 0;
216 return (strxfrm((char *)buffer, compUTF8A, buffCapacity)+1);
217 }
218
219 #ifdef WIN32
Winstrcmp(const void * a,const void * b)220 int Winstrcmp(const void *a, const void *b) {
221 UErrorCode status = U_ZERO_ERROR;
222 gCount++;
223 int t;
224 //compALen = unorm_compose(compA, 256, (*(Line **)a)->name, (*(Line **)a)->len, false, 0, &status);
225 //compBLen = unorm_compose(compB, 256, (*(Line **)b)->name, (*(Line **)b)->len, false, 0, &status);
226 compALen = unorm_normalize((*(Line **)a)->name, (*(Line **)a)->len, UNORM_NFC, 0, compA, 256, &status);
227 compBLen = unorm_normalize((*(Line **)b)->name, (*(Line **)b)->len, UNORM_NFC, 0, compB, 256, &status);
228 t = CompareStringW(gWinLCID, SORT_STRINGSORT, //0,
229 compA, compALen,
230 compB, compBLen);
231
232 /*
233 t = CompareStringW(gWinLCID, 0,
234 (*(Line **)a)->name, (*(Line **)a)->len,
235 (*(Line **)b)->name, (*(Line **)b)->len);
236 */
237 return t-2;
238 }
239
WingetSortKey(const UChar * string,int32_t len,uint8_t * buffer,int32_t buffCapacity)240 int WingetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
241 UErrorCode status = U_ZERO_ERROR;
242 compALen = unorm_normalize(string, len, UNORM_NFC, 0, compA, 256, &status);
243 return LCMapStringW(gWinLCID, LCMAP_SORTKEY | SORT_STRINGSORT, compA, compALen, (unsigned short *)buffer, buffCapacity);
244 }
245
246 #if 0
247 int Winstrcmp(const void *a, const void *b) {
248 UErrorCode status = U_ZERO_ERROR;
249 uint8_t b1[256], b2[256];
250 int32_t b1Len, b2Len;
251 b1Len = WingetSortKey((*(Line **)a)->name, (*(Line **)a)->len, b1, 256);
252 b2Len = WingetSortKey((*(Line **)b)->name, (*(Line **)b)->len, b2, 256);
253
254 b1[b1Len] = 0;
255 b2[b2Len] = 0;
256
257 return strcmp((const char *)b1, (const char *)b2);
258 }
259 #endif
260
261 #else
/**
 * Placeholder used when the tool is not built for Windows: there is no
 * Win32 collation to consult, so every pair compares as equal.
 *
 * @param a ignored
 * @param b ignored
 * @return always 0
 */
int Winstrcmp(const void *a, const void *b) {
    // Explicitly mark the parameters as unused instead of hiding them in an
    // empty-bodied if statement.
    (void)a;
    (void)b;
    return 0;
}
/**
 * Placeholder used when the tool is not built for Windows: no key is written.
 *
 * @return always 0
 */
int WingetSortKey(const UChar * /*string*/, int32_t /*len*/,
                  uint8_t * /*buffer*/, int32_t /*buffCapacity*/) {
    return 0;
}
269 #endif
270
ICUstrcmp(const void * a,const void * b)271 int ICUstrcmp(const void *a, const void *b) {
272 gCount++;
273 UCollationResult t;
274 t = ucol_strcoll(gCol,
275 (*(Line **)a)->name, (*(Line **)a)->len,
276 (*(Line **)b)->name, (*(Line **)b)->len);
277 if (t == UCOL_LESS) return -1;
278 if (t == UCOL_GREATER) return +1;
279 return 0;
280 }
281
ICUgetSortKey(const UChar * string,int32_t len,uint8_t * buffer,int32_t buffCapacity)282 int ICUgetSortKey(const UChar *string, int32_t len, uint8_t *buffer, int32_t buffCapacity) {
283 return ucol_getSortKey(gCol, string, len, buffer, buffCapacity);
284 }
285
286 struct {
287 const char* name;
288 CompareFn comparer;
289 GetSortKeyFn skgetter;
290 } platforms[] = {
291 { "icu", ICUstrcmp, ICUgetSortKey },
292 { "w2k", Winstrcmp, WingetSortKey},
293 { "winxp", Winstrcmp, WingetSortKey},
294 { "aix", UNIXstrcmp, UNIXgetSortKey},
295 { "linux", UNIXstrcmp, UNIXgetSortKey}
296 };
297
298
/**
 * Lower-cases a NUL-terminated string in place using tolower().
 *
 * @param string text to convert; NULL is accepted and ignored
 */
void stringToLower(char *string) {
    if (string == NULL) {
        return;
    }
    // Walk to the terminator once instead of calling strlen() on every
    // iteration. tolower() is only defined for unsigned char values and EOF,
    // so go through unsigned char to avoid passing a negative char.
    for (char *cursor = string; *cursor != '\0'; ++cursor) {
        *cursor = (char)tolower((unsigned char)*cursor);
    }
}
305
usage(const char * name)306 void usage(const char *name) {
307 logger->log("Usage: %s --locale loc_name --platform platform\n", name);
308 }
309
listKnownPlatforms()310 void listKnownPlatforms() {
311 uint32_t i = 0;
312 logger->log("Known platforms:\n");
313 for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
314 logger->log("\t%s\n", platforms[i]);
315 }
316 }
317
addPlatform(const char * platform)318 void addPlatform(const char *platform) {
319 uint32_t i;
320 //stringToLower(platform);
321 int32_t oldPlatformNo = gPlatformNo;
322
323 for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
324 if(strcmp(platform, platforms[i].name) == 0) {
325 gPlatformIndexes[gPlatformNo++] = i;
326 }
327 }
328 if(gPlatformNo == oldPlatformNo) {
329 logger->log("Unknown platform %s\n", platform);
330 listKnownPlatforms();
331 }
332 }
333
processArgs(int argc,char * argv[],UErrorCode & status)334 void processArgs(int argc, char* argv[], UErrorCode &status)
335 {
336 int32_t i = 0;
337 U_MAIN_INIT_ARGS(argc, argv);
338
339 argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
340
341 if(argc < 0) {
342 logger->log("Unknown option: %s\n", argv[-argc]);
343 usage(progName);
344 return;
345 }
346
347 if(options[0].doesOccur || options[1].doesOccur) {
348 usage(progName);
349 return;
350 }
351 if(options[VERBOSE].doesOccur) {
352 gVerbose = true;
353 }
354 if(options[DEBUG].doesOccur) {
355 gDebug = true;
356 gVerbose = true;
357 }
358 if(options[EXEMPLAR].doesOccur) {
359 gExemplar = true;
360 }
361 if(options[QUIET].doesOccur) {
362 gQuiet = true;
363 }
364
365 // ASCII based options specified on the command line
366 // this is for testing purposes, will allow to load
367 // up ICU rules and then poke through them.
368 // In that case, we test only ICU and don't need
369 // a locale.
370 if(options[RULESSTDIN].doesOccur) {
371 gRulesStdin = true;
372 addPlatform("icu");
373 return;
374 }
375
376 if(options[LOCALE].doesOccur) {
377 gLocale = options[LOCALE].value;
378 } else {
379 gLocale = argv[1];
380 //for(i = 1; i < argc; i++) {
381 //gLocales[gLocaleNo++] = argv[i];
382 //}
383 }
384
385 if(options[PLATFORM].doesOccur) {
386 addPlatform(options[PLATFORM].value);
387 } else { // there is a list of platforms
388 addPlatform("icu");
389 }
390
391 if(options[REFERENCE].doesOccur) {
392 for(i = 0; i < (int32_t)(sizeof(platforms)/sizeof(platforms[0])); i++) {
393 if(strcmp(options[REFERENCE].value, platforms[i].name) == 0) {
394 gRefNum = i;
395 break;
396 }
397 }
398 if(i == sizeof(platforms)/sizeof(platforms[0])) {
399 logger->log("Unknown reference %s!\n", options[REFERENCE].value);
400 status = U_ILLEGAL_ARGUMENT_ERROR;
401 return;
402 }
403 } else {
404 gRefNum = 0;
405 }
406
407 if(options[EXCLUDESET].doesOccur) {
408 gExcludeSet.applyPattern(UnicodeString(options[EXCLUDESET].value), status);
409 if(U_FAILURE(status)) {
410 logger->log("Cannot construct exclude set from argument %s. Error %s\n", options[EXCLUDESET].value, u_errorName(status));
411 return;
412 } else {
413 UnicodeString pattern;
414 logger->log(gExcludeSet.toPattern(pattern, true), true);
415 }
416 }
417
418 if(options[REPERTOIRE].doesOccur) {
419 gRepertoire.applyPattern(UnicodeString(options[REPERTOIRE].value), status);
420 if(U_FAILURE(status)) {
421 logger->log("Cannot construct repertoire from argument %s. Error %s\n", options[REPERTOIRE].value, u_errorName(status));
422 return;
423 }
424 }
425
426 if(options[OUTPUT].doesOccur) {
427 outputFormat = options[OUTPUT].value;
428 if(strcmp(outputFormat, "HTML") == 0) {
429 outExtension = "html";
430 } else if(strcmp(outputFormat, "XML") == 0) {
431 outExtension = "xml";
432 } else {
433 outExtension = "txt";
434 }
435 }
436
437 }
438
439 // Check whether upper case comes before lower case or vice-versa
440 int32_t
checkCaseOrdering(void)441 checkCaseOrdering(void) {
442 UChar stuff[][3] = {
443 { 0x0061, separatorChar, 0x0061}, //"aa",
444 { 0x0061, separatorChar, 0x0041 }, //"a\\u00E0",
445 { 0x0041, separatorChar, 0x0061 }, //"\\u00E0a",
446 { 0x0041, separatorChar, 0x0041 }, //"\\u00E0a",
447 //{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0"
448 };
449 const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
450
451 Line **sortedLines = new Line*[size];
452 Line lines[size];
453
454 int32_t i = 0;
455 int32_t ordered = 0, reversed = 0;
456
457 for(i = 0; i < size; i++) {
458 lines[i].setName(stuff[i], 3);
459 }
460 //setArray(sortedLines, lines, size);
461 qsort(sortedLines, size, sizeof(Line*), gComparer);
462
463 for(i = 0; i < size; i++) {
464 if(*(sortedLines+i) == &lines[i]) {
465 ordered++;
466 }
467 if(*(sortedLines+i) == &lines[size-i-1]) {
468 reversed++;
469 }
470 }
471
472 delete[] sortedLines;
473 if(ordered == size) {
474 return 0; // in normal order
475 } else if(reversed == size) {
476 return 1; // in reversed order
477 } else {
478 return -1; // unknown order
479 }
480 }
481
482 void
getExemplars(const char * locale,UnicodeSet & exemplars,UErrorCode & status)483 getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) {
484 // first we fill out structures with exemplar characters.
485 UResourceBundle *res = ures_open(NULL, locale, &status);
486 UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status);
487 exemplars.clear();
488 exemplars.applyPattern(exemplarString, status);
489 ures_close(res);
490 }
491
492
493 void
getFileNames(const char * name,char * tailoringName,char * tailoringDumpName,char * defaultName,char * defaultDumpName,char * diffName)494 getFileNames(const char *name, char *tailoringName, char *tailoringDumpName, char *defaultName, char *defaultDumpName, char *diffName) {
495 if(tailoringName) {
496 strcpy(tailoringName, platforms[gPlatformIndexes[0]].name);
497 strcat(tailoringName, "/");
498 strcat(tailoringName, name);
499 strcat(tailoringName, "_raw.");
500 strcat(tailoringName, outExtension);
501 }
502 if(tailoringDumpName) {
503 strcpy(tailoringDumpName, platforms[gPlatformIndexes[0]].name);
504 strcat(tailoringDumpName, "/");
505 strcat(tailoringDumpName, name);
506 strcat(tailoringDumpName, ".dump");
507 }
508
509 if(diffName) {
510 strcpy(diffName, platforms[gPlatformIndexes[0]].name);
511 strcat(diffName, "/");
512 strcat(diffName, name);
513 strcat(diffName, "_collation.");
514 strcat(diffName, outExtension);
515 }
516
517 if(defaultName) {
518 strcpy(defaultName, platforms[gRefNum].name);
519 strcat(defaultName, "/");
520 strcat(defaultName, name);
521 strcat(defaultName, "_default_raw.");
522 strcat(defaultName, outExtension);
523 }
524
525 if(defaultDumpName) {
526 strcpy(defaultDumpName, platforms[gRefNum].name);
527 strcat(defaultDumpName, "/");
528 strcat(defaultDumpName, name);
529 strcat(defaultDumpName, "_default.dump");
530 }
531 }
532
533 void
setFiles(const char * name,UErrorCode & status)534 setFiles(const char *name, UErrorCode &status) {
535 if(U_FAILURE(status)) {
536 return;
537 }
538 int32_t i = 0;
539 char tailoringName[256];
540 char tailoringDumpName[256];
541 char defaultName[256];
542 char defaultDumpName[256];
543 char diffName[256];
544
545 getFileNames(name, tailoringName, tailoringDumpName, defaultName, defaultDumpName, diffName);
546 if(options[PLATFORM].doesOccur && !options[DIFF].doesOccur) {
547 if(createDir(platforms[gPlatformIndexes[0]].name) == 0) {
548 tailoringBundle = new UPrinter(tailoringName, "en", "utf-8", NULL, false);
549 fTailoringDump = fopen(tailoringDumpName, "wb");
550 } else {
551 status = U_FILE_ACCESS_ERROR;
552 return;
553 }
554 }
555
556 if(options[REFERENCE].doesOccur && !options[DIFF].doesOccur) {
557 if(createDir(platforms[gRefNum].name) == 0) {
558 referenceBundle = new UPrinter(defaultName, "en", "utf-8", NULL, false);
559 fDefaultDump = fopen(defaultDumpName, "wb");
560 } else {
561 status = U_FILE_ACCESS_ERROR;
562 return;
563 }
564 }
565
566 if((options[PLATFORM].doesOccur && options[REFERENCE].doesOccur) || options[DIFF].doesOccur) {
567 if(createDir(platforms[gPlatformIndexes[0]].name) == 0) {
568 bundle = new UPrinter(diffName, "en", "utf-8", NULL, false);
569 }
570 }
571 if(options[DIFF].doesOccur) {
572 fTailoringDump = fopen(tailoringDumpName, "rb");
573 fDefaultDump = fopen(defaultDumpName, "rb");
574 }
575 }
576
577
578 UErrorCode status = U_ZERO_ERROR;
579 static UnicodeSet UNASSIGNED(UnicodeString("[:Cn:]"), status);
580 static UnicodeSet GENERAL_ACCENTS(UnicodeString("[[:block=Combining Diacritical Marks:]-[:Cn:]]"), status);
581 //static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]-[:L:]-[:N:]]"), status);
582 static UnicodeSet ASCII_BASE(UnicodeString("[[:ASCII:]]"), status);
583 static UnicodeSet ALPHABETIC(UnicodeString("[:alphabetic:]"), status);
584 //static UnicodeSet CONTROL(UnicodeString("[[:control:][\\u0000-\\u002F]]"), status);
585 static UnicodeSet BMP(UnicodeString("[\\u0000-\\uFFFF]"), status);
586
587 static UnicodeSet CONTROL(UnicodeString("[:control:]"), status);
588
589 UCollator *
setLocale(const char * locale,UErrorCode & status)590 setLocale(const char* locale, UErrorCode &status)
591 {
592 gWinLCID = uloc_getLCID(locale);
593 setlocale(LC_COLLATE, locale);
594
595 if(gCol) {
596 ucol_close(gCol);
597 }
598 gCol = ucol_open(locale, &status);
599 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
600 //ucol_setAttribute(col, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
601 //ucol_setAttribute(col, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
602
603 return gCol;
604 }
605
606
607
608 UCollator *
setReference(UErrorCode & status)609 setReference(UErrorCode &status)
610 {
611 gWinLCID = uloc_getLCID("en");
612 setlocale(LC_COLLATE, "en_US.UTF-8");
613 if(gCol) {
614 ucol_close(gCol);
615 }
616 gCol = ucol_open("root", &status);
617 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
618 return gCol;
619 }
620
/**
 * Interactive mode. Currently only drains standard input line by line; the
 * lines that are read are not acted upon.
 */
void
processInteractive() {
    char command[256];

    for(;;) {
        if(fgets(command, 256, stdin) == NULL) {
            break;
        }
        // Nothing is done with the command yet.
    }
}
628
629 UChar probeChars[][4] = {
630 { 0x0061, 0x0062, 0x00E1, 0x0041 }, // latin with a-grave
631 { 0x0041, 0x0042, 0x00C1, 0x0061 }, // upper first
632 { 0x006E, 0x006F, 0x00F1, 0x004E }, // latin with n-tilda
633 { 0x004E, 0x004F, 0x00D1, 0x006E }, // upper first
634 { 0x0433, 0x0493, 0x0491, 0x0413 }, // Cyrillic
635 { 0x0413, 0x0492, 0x0490, 0x0433 }, // upper first
636 { 0x3045, 0x3047, 0x3094, 0x3046 } // Hiragana/Katakana (last resort)
637
638 };
639
640 void
processCollator(UCollator * col,UErrorCode & status)641 processCollator(UCollator *col, UErrorCode &status) {
642 int32_t i = 0;
643 uint32_t j = 0;
644 gCol = col;
645 UChar ruleString[16384];
646 char myLoc[256];
647
648 int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384);
649 logger->log(UnicodeString(ruleString, ruleStringLength), true);
650 const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status);
651 if(locale == NULL) {
652 locale = "en";
653 }
654 strcpy(myLoc, locale);
655 UnicodeSet exemplarUSet;
656 UnicodeSet RefRepertoire;
657
658 UnicodeSet tailored;
659
660 tailored = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status));
661 tailored.removeAll(CONTROL);
662
663
664 UnicodeString pattern;
665 int sanityResult;
666
667 UnicodeSet hanSet;
668 UBool hanAppears = false;
669
670 debug->log("\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[0]].name);
671 gComparer = platforms[gPlatformIndexes[0]].comparer;
672
673 StrengthProbe probe(platforms[gPlatformIndexes[0]].comparer, platforms[gPlatformIndexes[0]].skgetter, 0x0030, probeChars[0][0], probeChars[0][1], probeChars[0][2], probeChars[0][3]);
674 sanityResult = probe.checkSanity();
675 j = 0;
676 while(sanityResult && j+1 < sizeof(probeChars)/sizeof(probeChars[0])) {
677 j++;
678 sanityResult = probe.setProbeChars(probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]);
679 }
680 if(sanityResult) {
681 logger->log("Bad choice of probe characters! Sanity returned %i. Exiting\n", sanityResult, sanityResult);
682 return;
683 }
684 logger->log("Probe chars: %C, %C, %C, %C\n", probeChars[j][0], probeChars[j][1], probeChars[j][2], probeChars[j][3]);
685
686 debug->off();
687
688 if(gRepertoire.size()) {
689 exemplarUSet = gRepertoire;
690 } else {
691 generateRepertoire(locale, exemplarUSet, hanAppears, status);
692 }
693 exemplarUSet.addAll(tailored);
694 hanSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
695 exemplarUSet.removeAll(hanSet);
696
697 logger->log(exemplarUSet.toPattern(pattern, true), true);
698
699 exemplarUSet = flatten(exemplarUSet, status);
700 logger->log(exemplarUSet.toPattern(pattern, true), true);
701
702 if(!options[PRINTREF].doesOccur) {
703
704 logger->log("\n*** Detecting ordering for the locale\n\n");
705
706 debug->on();
707 SortedLines lines(exemplarUSet, gExcludeSet, probe, logger, debug);
708 lines.analyse(status);
709 lines.calculateSortKeys();
710 debug->log("\n*** Final order\n\n");
711 debug->log(lines.toPrettyString(true, true), true);
712 lines.toFile(fTailoringDump, true, status);
713 tailoringBundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, true, true, hanAppears), true);
714 //debug->off();
715
716 if(options[REFERENCE].doesOccur) {
717 status = U_ZERO_ERROR;
718 lines.getRepertoire(RefRepertoire);
719 setReference(status);
720
721 logger->log(exemplarUSet.toPattern(pattern, true), true);
722 logger->log(RefRepertoire.toPattern(pattern, true), true);
723
724 StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter);
725 logger->log("\n*** Detecting ordering for reference\n\n");
726 SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug);
727 RefLines.analyse(status);
728 referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, true, true, false), true);
729 RefLines.toFile(fDefaultDump, true, status);
730
731 lines.reduceDifference(RefLines);
732 logger->log("\n*** Final rules\n\n");
733 logger->log(lines.toPrettyString(true), true);
734 bundle->log(lines.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, true, true, hanAppears), true);
735 }
736 } else {
737 setReference(status);
738 StrengthProbe RefProbe(platforms[gRefNum].comparer, platforms[gRefNum].skgetter);
739 logger->log("\n*** Detecting ordering for reference\n\n");
740 SortedLines RefLines(exemplarUSet, gExcludeSet, RefProbe, logger, debug);
741 RefLines.analyse(status);
742 logger->log(RefLines.toPrettyString(true), true);
743 referenceBundle->log(RefLines.toOutput(outputFormat, myLoc, platforms[gRefNum].name, NULL, true, true, false), true);
744 }
745 if(hanAppears) {
746 // there are Han characters. This is a huge block. The best we can do is to just sort it, compare to empty
747 // and spit it out. Anything else would be a suicide (actually is - kernel just kills you :)
748 logger->log("\n*** Detecting order for Han\n");
749 debug->off();
750 setLocale(gLocale, status);
751 exemplarUSet.clear();
752 exemplarUSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
753 exemplarUSet = flatten(exemplarUSet, status);
754 SortedLines han(exemplarUSet, gExcludeSet, probe, logger, debug);
755 han.sort(true, true);
756 han.classifyRepertoire();
757 han.getBounds(status);
758 tailoringBundle->log("Han ordering:<br>\n");
759 tailoringBundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, true, false, false), true);
760 bundle->log(han.toOutput(outputFormat, myLoc, platforms[gPlatformIndexes[0]].name, NULL, true, false, false), true);
761 }
762 ucol_close(gCol);
763 }
764
765 void
processLocale(const char * locale,UErrorCode & status)766 processLocale(const char *locale, UErrorCode &status) {
767 setLocale(locale, status);
768 setFiles(locale, status);
769 if(U_FAILURE(status)) {
770 return;
771 }
772
773 debug->log("Locale %s (LCID:%06X, unix:%s)\n", locale, gWinLCID, setlocale(LC_COLLATE, NULL));
774 tailoringBundle->log("// Ordering for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n",
775 locale, gWinLCID, setlocale(LC_COLLATE, NULL),
776 platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name);
777 if(options[REFERENCE].doesOccur) {
778 referenceBundle->log("// Reference for locale %s (LCID:%06X, unix:%s), platform %s reference %s<br>\n",
779 locale, gWinLCID, setlocale(LC_COLLATE, NULL),
780 platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name);
781 }
782
783
784 processCollator(gCol, status);
785 }
786
787
788
789 UBool
hasCollationElements(const char * locName)790 hasCollationElements(const char *locName) {
791
792 UErrorCode status = U_ZERO_ERROR;
793 UResourceBundle *ColEl = NULL;
794
795 UResourceBundle *loc = ures_open(NULL, locName, &status);;
796
797 if(U_SUCCESS(status)) {
798 status = U_ZERO_ERROR;
799 ColEl = ures_getByKey(loc, "CollationElements", ColEl, &status);
800 if(status == U_ZERO_ERROR) { /* do the test - there are real elements */
801 ures_close(ColEl);
802 ures_close(loc);
803 return true;
804 }
805 ures_close(ColEl);
806 ures_close(loc);
807 }
808 return false;
809 }
810
811 int
main(int argc,char * argv[])812 main(int argc,
813 char* argv[])
814 {
815 UErrorCode status = U_ZERO_ERROR;
816 logger = new UPrinter(stdout, "en", "latin-1");
817 debug = new UPrinter(stderr, "en", "latin-1");
818
819 /*
820 USet *wsp = uprv_openRuleWhiteSpaceSet(&status);
821 uset_add(wsp, 0x0041);
822 uset_remove(wsp, 0x0041);
823 UnicodeString pat;
824 ((UnicodeSet *)wsp)->toPattern(pat, true);
825 pat.setCharAt(pat.length(), 0);
826 escapeString(pat.getBuffer(), pat.length(), log);
827 u_fflush(log);
828 */
829
830 processArgs(argc, argv, status);
831 int32_t i = 0;
832
833
834
835 if(U_FAILURE(status) || gPlatformNo == 0) {
836 return -1;
837 }
838
839 utf8cnv = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
840 gUCA = ucol_open("root", &status);
841
842 if(options[INTERACTIVE].doesOccur) {
843 processInteractive();
844 } else {
845 if(gRulesStdin) {
846 char buffer[1024];
847 UChar ruleBuffer[16384];
848 UChar *rules = ruleBuffer;
849 int32_t maxRuleLen = 16384;
850 int32_t rLen = 0;
851 while(fgets(buffer, 1024, stdin)) {
852 if(buffer[0] != '/' && buffer[1] != '/') {
853 rLen = u_unescape(buffer, rules, maxRuleLen);
854 rules += rLen;
855 maxRuleLen -= rLen;
856 }
857 }
858 UParseError parseError;
859 //escapeString(ruleBuffer, rules-ruleBuffer, log);//
860 debug->log("%U\n", ruleBuffer);
861
862 UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status);
863 if(U_SUCCESS(status)) {
864 setFiles("stdinRules", status);
865 processCollator(col, status);
866 } else {
867 logger->log("Error %s\n", u_errorName(status));
868 }
869 } else if(options[DIFF].doesOccur) {
870 logger->log("Diffing two dumps\n");
871 // must have locale, platform and ref in order to be
872 // able to find dump files.
873 setFiles(gLocale, status);
874
875 if(fTailoringDump && fDefaultDump) {
876 SortedLines tailoring(fTailoringDump, logger, debug, status);
877 logger->log(tailoring.toString(true), true);
878 SortedLines reference(fDefaultDump, logger, debug, status);
879 logger->log(reference.toString(true), true);
880 tailoring.reduceDifference(reference);
881 logger->log("\n*** Final rules\n\n");
882 logger->log(tailoring.toPrettyString(true), true);
883 //result->log(lines.toPrettyString(true), true);
884 bundle->log(tailoring.toOutput(outputFormat, gLocale, platforms[gPlatformIndexes[0]].name, platforms[gRefNum].name, true, true, false), true);
885 }
886
887 } else {
888 if(gLocale) {
889 processLocale(gLocale, status);
890 } else if(gLocaleNo) {
891 for(i = 0; i < gLocaleNo; i++) {
892 processLocale(gLocales[i], status);
893 }
894 } else { // do the loop through all the locales
895 int32_t noOfLoc = uloc_countAvailable();
896 const char *locName = NULL;
897 for(i = 0; i<noOfLoc; i++) {
898 status = U_ZERO_ERROR;
899 locName = uloc_getAvailable(i);
900 if(hasCollationElements(locName)) {
901 processLocale(locName, status);
902 }
903 }
904 }
905 }
906 }
907
908
909 ucol_close(gUCA);
910 ucnv_close(utf8cnv);
911
912 delete logger;
913 delete debug;
914 if(tailoringBundle) {
915 delete tailoringBundle;
916 }
917 if(referenceBundle) {
918 delete referenceBundle;
919 }
920 if(bundle) {
921 delete bundle;
922 }
923 if(fTailoringDump) {
924 fclose(fTailoringDump);
925 }
926 if(fDefaultDump) {
927 fclose(fDefaultDump);
928 }
929 return 0;
930 }
931
932
propertyAndValueName(UProperty prop,int32_t i)933 UnicodeString propertyAndValueName(UProperty prop, int32_t i) {
934 UnicodeString result;
935 result.append(u_getPropertyName(prop, U_LONG_PROPERTY_NAME));
936 result.append("=");
937 result.append(u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME));
938
939 //+ "(" + prop + "," + i + ") ";
940 return result;
941 }
942
943
generateRepertoire(const char * locale,UnicodeSet & rep,UBool & hanAppears,UErrorCode & status)944 void generateRepertoire(const char *locale, UnicodeSet &rep, UBool &hanAppears, UErrorCode &status) {
945 UnicodeString dispName;
946 debug->log("Getting repertoire for %s\n", locale);
947 tailoringBundle->log("// Scripts in repertoire: ");
948 if(options[REFERENCE].doesOccur) {
949 referenceBundle->log("// Scripts in repertoire: ");
950 }
951 rep.clear();
952 UnicodeSet delta;
953
954 UScriptCode script[256];
955 int32_t i = 0;
956 // now add the scripts for the locale
957 UProperty prop = UCHAR_SCRIPT;
958 int32_t scriptLength = uscript_getCode(locale, script, 256, &status);
959 if(scriptLength) {
960 for (i = 0; i < scriptLength; ++i) {
961 if(script[i] == USCRIPT_HAN) {
962 hanAppears = true;
963 continue;
964 }
965 delta.applyIntPropertyValue(prop, script[i], status);
966 debug->log("Adding ");
967 debug->log(propertyAndValueName(prop, script[i]), true);
968 tailoringBundle->log("// ");
969 tailoringBundle->log(propertyAndValueName(prop, script[i]), true);
970 if(options[REFERENCE].doesOccur) {
971 referenceBundle->log("// ");
972 referenceBundle->log(propertyAndValueName(prop, script[i]), true);
973 }
974 rep.addAll(delta);
975 }
976 } else {
977 delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_LATIN, status);
978 rep.addAll(delta);
979 }
980
981 // now see which blocks those overlap, and add
982 prop = UCHAR_BLOCK;
983 int32_t min = u_getIntPropertyMinValue(prop);
984 int32_t max = u_getIntPropertyMaxValue(prop);
985 UnicodeSet checkDelta;
986 for (i = min; i <= max; ++i) {
987 // skip certain blocks
988 const char *name = u_getPropertyValueName(prop, i, U_LONG_PROPERTY_NAME);
989 if (strcmp(name, "Superscripts_and_Subscripts") == 0
990 || strcmp(name, "Letterlike_Symbols") == 0
991 || strcmp(name, "Alphabetic_Presentation_Forms") == 0
992 || strcmp(name, "Halfwidth_and_Fullwidth_Forms") == 0) continue;
993
994 delta.applyIntPropertyValue(prop, i, status).removeAll(UNASSIGNED);
995 if (!rep.containsSome(delta)) continue;
996 if (rep.containsAll(delta)) continue; // just to see what we are adding
997 debug->log("Adding ");
998 debug->log(propertyAndValueName(prop, i), true);
999 tailoringBundle->log("// ");
1000 tailoringBundle->log(propertyAndValueName(prop, i), true);
1001 if(options[REFERENCE].doesOccur) {
1002 referenceBundle->log("// ");
1003 referenceBundle->log(propertyAndValueName(prop, i), true);
1004 }
1005 rep.addAll(delta);
1006 }
1007
1008 // add ASCII and general accents
1009 rep.addAll(GENERAL_ACCENTS).addAll(ASCII_BASE);
1010 rep.removeAll(CONTROL);
1011 //delta.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_HAN, status);
1012 //rep.removeAll(delta);
1013
1014 // now add the exemplar characters
1015 // can't get at them from Java right now
1016 tailoringBundle->log("<br>\n");
1017 if(options[REFERENCE].doesOccur) {
1018 referenceBundle->log("<br>\n");
1019 }
1020 }
1021
/**
 * Breaks every element of a set down into the characters of its canonical
 * and compatibility decompositions.
 *
 * Each element (single code point or string) is normalized to NFD and to
 * NFKD, and all characters of both results are added to the returned set.
 *
 * @param source set to decompose
 * @param status ICU error code for Normalizer::normalize()
 * @return the set of all characters found in the decompositions
 */
UnicodeSet flatten(const UnicodeSet &source, UErrorCode &status) {
    UnicodeSet flat;
    UnicodeString canonical, compatibility, single;

    for (UnicodeSetIterator it(source); it.next(); ) {
        const UnicodeString *element;
        if (it.isString()) {
            element = &it.getString();
        } else {
            // Wrap the lone code point in a string for the normalizer.
            single.setTo(it.getCodepoint());
            element = &single;
        }
        Normalizer::normalize(*element, UNORM_NFD, 0, canonical, status);
        Normalizer::normalize(*element, UNORM_NFKD, 0, compatibility, status);

        flat.addAll(canonical);
        flat.addAll(compatibility);
    }
    return flat;
}
1041
1042
testWin(StrengthProbe & probe,UErrorCode & status)1043 void testWin(StrengthProbe &probe, UErrorCode &status)
1044 {
1045 UnicodeSet trailings(UnicodeString("[\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651]"), status);
1046 char intChar[] = "\\uFE7D\\uFE7C\\u30FD\\uFF70\\u30FC\\u309D\\u3032\\u3031\\u3005\\u0651";
1047 UChar interesting[256];
1048 int32_t intLen = u_unescape(intChar, interesting, 256);
1049 UChar i = 0;
1050 UChar j = 0, k = 0;
1051 int32_t count;
1052 Line myCh, combo, trial, inter, kLine;
1053 for(i = 0; i < intLen; i++) {
1054 inter.setTo(interesting[i]);
1055 logger->log(inter.toString(true), true);
1056 logger->log("----------------------\n");
1057 for(j = 0; j < 0xFFFF; j++) {
1058 myCh.setTo(j);
1059 if(probe.distanceFromEmptyString(myCh) == UCOL_IDENTICAL) {
1060 continue;
1061 }
1062 logger->log(myCh.toString(true));
1063 combo.setTo(j);
1064 combo.append(interesting[i]);
1065 count = 0;
1066 for(k = 0; k < 0xFFFF; k++) {
1067 kLine.setTo(k);
1068 trial.setTo(j);
1069 trial.append(k);
1070 if(probe.compare(kLine, inter) < 0) {
1071 if(probe.compare(trial, combo) >= 0) {
1072 count++;
1073 }
1074 }
1075 }
1076 logger->log("%i %i\n", count, count);
1077 }
1078 }
1079 }
1080
1081