1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File colprobe.cpp
12 *
13 * Modification History:
14 *
15 * Date Name Description
16 * 03/18/2003 weiv Creation.
17 *******************************************************************************
18 */
19
20 #include "uoptions.h"
21 #include "unicode/ucol.h"
22 #include "unicode/ucoleitr.h"
23 #include "unicode/ures.h"
24 #include "unicode/uniset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uscript.h"
29 #include "uprops.h"
30 #include "hash.h"
31 #include "ucol_imp.h"
32
33 #include "unicode/ustdio.h"
34 #include "unicode/utrans.h"
35
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <io.h>
40 #include <fcntl.h>
41
42 #include "colprobe.h"
43
44
45 #ifdef WIN32
46 #include <windows.h>
47 #else
48 //
49 // Stubs for Windows API functions when building on UNIXes.
50 //
51 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)52 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
53 #include <sys/time.h>
timeGetTime()54 unsigned long timeGetTime() {
55 struct timeval t;
56 gettimeofday(&t, 0);
57 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares.
58 val += t.tv_usec / 1000;
59 return val;
60 };
LCMapStringW(DWORD,DWORD,UChar *,int,UChar *,int)61 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
62 const int LCMAP_SORTKEY = 0;
63 #define MAKELCID(a,b) 0
64 const int SORT_DEFAULT = 0;
65 #endif
66
67 #include "line.h"
68
69 static UBool gVerbose = false;
70 static UBool gDebug = false;
71 static UBool gQuiet = false;
72 static UBool gExemplar = false;
73
74 DWORD gWinLCID;
75 int gCount;
76 Line **gICULines;
77 UCollator *gCol;
78 UCollator *gUCA;
79 Line source;
80 Line target;
81 Line *gSource = &source;
82 Line *gTarget = ⌖
83 Hashtable gElements(false);
84 Hashtable gExpansions(false);
85 CompareFn gComparer;
86
87 const UChar separatorChar = 0x0030;
88
89 UFILE *out = NULL;
90 UFILE *err = NULL;
91 UFILE *log = NULL;
92
93 const char *progName = "colprobe";
94
95 const char *gLocale = NULL;
96 //char platform[256];
97 int32_t platformIndex = -1;
98 int32_t gPlatformNo = 0;
99 int32_t gPlatformIndexes[10];
100 int32_t gLocaleNo = 0;
101 const char* gLocales[100];
102 UBool gRulesStdin = false;
103
104 enum {
105 HELP1,
106 HELP2,
107 VERBOSE,
108 QUIET,
109 VERSION,
110 ICUDATADIR,
111 COPYRIGHT,
112 LOCALE,
113 PLATFORM,
114 DEBUG,
115 EXEMPLAR,
116 RULESSTDIN
117 };
118
119 UOption options[]={
120 /*0*/ UOPTION_HELP_H,
121 /*1*/ UOPTION_HELP_QUESTION_MARK,
122 /*2*/ UOPTION_VERBOSE,
123 /*3*/ UOPTION_QUIET,
124 /*4*/ UOPTION_VERSION,
125 /*5*/ UOPTION_ICUDATADIR,
126 /*6*/ UOPTION_COPYRIGHT,
127 /*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG),
128 /*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG),
129 /*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG),
130 /*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG),
131 /*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG)
132 };
133
Winstrcmp(const void * a,const void * b)134 int Winstrcmp(const void *a, const void *b) {
135 gCount++;
136 int t;
137 t = CompareStringW(gWinLCID, 0,
138 (*(Line **)a)->name, (*(Line **)a)->len,
139 (*(Line **)b)->name, (*(Line **)b)->len);
140 return t-2;
141 }
142
ICUstrcmp(const void * a,const void * b)143 int ICUstrcmp(const void *a, const void *b) {
144 gCount++;
145 UCollationResult t;
146 t = ucol_strcoll(gCol,
147 (*(Line **)a)->name, (*(Line **)a)->len,
148 (*(Line **)b)->name, (*(Line **)b)->len);
149 if (t == UCOL_LESS) return -1;
150 if (t == UCOL_GREATER) return +1;
151 return 0;
152 }
153
154 struct {
155 const char* name;
156 CompareFn comparer;
157 } platforms[] = {
158 { "icu", ICUstrcmp },
159 { "win", Winstrcmp}
160 };
161
162
deleteLineElement(void * line)163 void deleteLineElement(void *line) {
164 delete((Line *)line);
165 }
166
stringToLower(char * string)167 void stringToLower(char *string) {
168 uint32_t i = 0;
169 for(i = 0; i < strlen(string); i++) {
170 string[i] = tolower(string[i]);
171 }
172 }
173
usage(const char * name)174 void usage(const char *name) {
175 u_fprintf(out, "Usage: %s --locale loc_name --platform platform\n", name);
176 }
177
listKnownPlatforms()178 void listKnownPlatforms() {
179 int32_t i = 0;
180 u_fprintf(err, "Known platforms:\n");
181 for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
182 u_fprintf(err, "\t%s\n", platforms[i]);
183 }
184 }
185
addPlatform(const char * platform)186 void addPlatform(const char *platform) {
187 int32_t i;
188 //stringToLower(platform);
189 int32_t oldPlatformNo = gPlatformNo;
190
191 for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
192 if(strcmp(platform, platforms[i].name) == 0) {
193 gPlatformIndexes[gPlatformNo++] = i;
194 }
195 }
196 if(gPlatformNo == oldPlatformNo) {
197 u_fprintf(err, "Unknown platform %s\n", platform);
198 listKnownPlatforms();
199 }
200 }
201
processArgs(int argc,char * argv[],UErrorCode & status)202 void processArgs(int argc, char* argv[], UErrorCode &status)
203 {
204 int32_t i = 0;
205 U_MAIN_INIT_ARGS(argc, argv);
206
207 argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
208
209 if(argc < 0) {
210 u_fprintf(err, "Unknown option: %s\n", argv[-argc]);
211 usage(progName);
212 return;
213 }
214
215 if(options[0].doesOccur || options[1].doesOccur) {
216 usage(progName);
217 return;
218 }
219 if(options[VERBOSE].doesOccur) {
220 gVerbose = true;
221 }
222 if(options[DEBUG].doesOccur) {
223 gDebug = true;
224 gVerbose = true;
225 }
226 if(options[EXEMPLAR].doesOccur) {
227 gExemplar = true;
228 }
229 if(options[QUIET].doesOccur) {
230 gQuiet = true;
231 }
232 /*
233 for(i = 8; i < 9; i++) {
234 if(!options[i].doesOccur) {
235 u_fprintf(err, "Option %s is required!\n", options[i].longName);
236 usage(progName);
237 status = U_ILLEGAL_ARGUMENT_ERROR;
238 }
239 if(options[i].value == NULL) {
240 u_fprintf(err, "Option %s needs an argument!\n", options[i].longName);
241 usage(progName);
242 status = U_ILLEGAL_ARGUMENT_ERROR;
243 }
244 }
245 */
246 // ASCII based options specified on the command line
247 // this is for testing purposes, will allow to load
248 // up ICU rules and then poke through them.
249 // In that case, we test only ICU and don't need
250 // a locale.
251 if(options[RULESSTDIN].doesOccur) {
252 gRulesStdin = true;
253 addPlatform("icu");
254 return;
255 }
256
257 if(options[LOCALE].doesOccur) {
258 gLocale = options[LOCALE].value;
259 } else {
260 for(i = 1; i < argc; i++) {
261 gLocales[gLocaleNo++] = argv[i];
262 }
263 }
264 if(options[PLATFORM].doesOccur) {
265 //strcpy(platform, options[PLATFORM].value);
266 //addPlatform("icu");
267 addPlatform(options[PLATFORM].value);
268 } else { // there is a list of platforms
269 u_fprintf(err, "Option %s is required!\n", options[i].longName);
270 usage(progName);
271 status = U_ILLEGAL_ARGUMENT_ERROR;
272 }
273
274 //
275 // Set up a Windows LCID
276 //
277 gWinLCID = uloc_getLCID(gLocale);
278 /*
279 if (gLocale != 0) {
280 gWinLCID = MAKELCID(gLocale, SORT_DEFAULT);
281 }
282 else {
283 gWinLCID = uloc_getLCID(gLocale);
284 }
285 */
286
287 }
288
printRules(const UChar * name,int32_t len,UFILE * file)289 void printRules(const UChar *name, int32_t len, UFILE *file) {
290 // very rudimentary pretty rules print
291 int32_t i = 0;
292 UChar toPrint[16384];
293 int32_t toPrintIndex = 0;
294 for(i = 0; i < len; i++) {
295 if(name[i] == 0x0026) {
296 if(toPrintIndex) {
297 toPrint[toPrintIndex] = 0;
298 u_fprintf(file, "%U\n", toPrint);
299 toPrintIndex = 0;
300 toPrint[toPrintIndex++] = name[i];
301 } else {
302 toPrint[toPrintIndex++] = name[i];
303 }
304 } else {
305 toPrint[toPrintIndex++] = name[i];
306 }
307 }
308 if(toPrintIndex) {
309 toPrint[toPrintIndex] = 0;
310 u_fprintf(file, "%U\n", toPrint);
311 toPrintIndex = 0;
312 }
313
314
315 }
316
escapeString(const UChar * name,int32_t len,UFILE * file)317 void escapeString(const UChar *name, int32_t len, UFILE *file) {
318 u_fprintf(file, "%U", name);
319 /*
320 int32_t j = 0;
321 for(j = 0; j < len; j++) {
322 if(name[j] >= 0x20 && name[j] < 0x80) {
323 u_fprintf(file, "%c", name[j]);
324 } else {
325 u_fprintf(file, "\\u%04X", name[j]);
326 }
327 }
328 */
329 }
escapeALine(Line * line,UFILE * file)330 void escapeALine(Line *line, UFILE *file) {
331 escapeString(line->name, line->len, file);
332 }
333
escapeExpansion(Line * line,UFILE * file)334 void escapeExpansion(Line *line, UFILE *file) {
335 escapeString(line->expansionString, line->expLen, file);
336 }
337
showNames(Line * line,UFILE * file)338 void showNames(Line *line, UFILE *file) {
339 UErrorCode status = U_ZERO_ERROR;
340 int32_t j = 0;
341 char charName[256];
342 for(j = 0; j < line->len; j++) {
343 u_charName(line->name[j], U_EXTENDED_CHAR_NAME, charName, 256, &status);
344 u_fprintf(file, "%s ", charName);
345 }
346 }
347
setArray(Line ** array,Line * contents,int32_t size)348 void setArray(Line **array, Line *contents, int32_t size) {
349 int32_t i = 0;
350 for(i = 0; i < size; i++) {
351 array[i] = contents+i;
352 }
353 }
354
355 // set an array from a Hashtable
356 int32_t
setArray(Line ** array,Hashtable * table=& gElements)357 setArray(Line **array, Hashtable *table = &gElements) {
358 int32_t size = table->count();
359 int32_t hashIndex = -1;
360 const UHashElement *hashElement = NULL;
361 int32_t count = 0;
362 while((hashElement = table->nextElement(hashIndex)) != NULL) {
363 array[count++] = (Line *)hashElement->value.pointer;
364 }
365 return size;
366 }
367
trySwamped(Line ** smaller,Line ** greater,UChar chars[2],CompareFn comparer)368 UBool trySwamped(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
369 u_strcpy(gSource->name, (*smaller)->name);
370 gSource->name[(*smaller)->len] = separatorChar;
371 gSource->name[(*smaller)->len+1] = chars[0];
372 gSource->name[(*smaller)->len+2] = 0;
373 gSource->len = (*smaller)->len+2;
374
375 u_strcpy(gTarget->name, (*greater)->name);
376 gTarget->name[(*greater)->len] = separatorChar;
377 gTarget->name[(*greater)->len+1] = chars[1];
378 gTarget->name[(*greater)->len+2] = 0;
379 gTarget->len = (*greater)->len+2;
380
381 if(comparer(&gSource, &gTarget) > 0) {
382 return true;
383 } else {
384 return false;
385 }
386 }
387
trySwamps(Line ** smaller,Line ** greater,UChar chars[2],CompareFn comparer)388 UBool trySwamps(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
389 gSource->name[0] = chars[0];
390 gSource->name[1] = separatorChar;
391 u_strcpy(gSource->name+2, (*smaller)->name);
392 gSource->len = (*smaller)->len+2;
393
394 gTarget->name[0] = chars[1];
395 gTarget->name[1] = separatorChar;
396 u_strcpy(gTarget->name+2, (*greater)->name);
397 gTarget->len = (*greater)->len+2;
398
399 if(comparer(&gSource, &gTarget) < 0) {
400 return true;
401 } else {
402 return false;
403 }
404 }
405
406 UColAttributeValue
probeStrength(Line ** prevLine,Line ** currLine,CompareFn comparer)407 probeStrength(Line** prevLine, Line **currLine, CompareFn comparer) {
408 // Primary swamps secondary
409 // have pairs where [0] 2> [1]
410 UChar primSwamps[][2] = {
411 { 0x00E0, 0x0061 },
412 { 0x0450, 0x0435 },
413 { 0x31a3, 0x310d }
414 };
415 // Secondary swamps tertiary
416 // have pairs where [0] 3> [1]
417 UChar secSwamps[][2] = {
418 { 0x0053, 0x0073 },
419 { 0x0415, 0x0435 },
420 { 0x31b6, 0x310e }
421 };
422 // Secondary is swamped by primary
423 // have pairs where [0] 1> [1]
424 UChar secSwamped[][2] = {
425 { 0x0062, 0x0061 },
426 { 0x0436, 0x0454 },
427 { 0x310e, 0x310d }
428 };
429 // Tertiary is swamped by secondary
430 // have pairs where [0] 2> [1]
431 UChar terSwamped[][2] = {
432 { 0x00E0, 0x0061 },
433 { 0x0450, 0x0435 },
434 { 0x31a3, 0x310d }
435 };
436 int32_t i = 0;
437 // Tertiary swamps equal?
438 int result = 0;
439 // Choose the pair
440 i = 0;
441 /*
442 if((*prevLine)->name[0] > 0xFF && (*currLine)->name[0] > 0xFF) {
443 i = 0;
444 } else if((*prevLine)->name[0] < 0x0400 && (*currLine)->name[0] < 0x0400) {
445 i = 1;
446 } else {
447 i = 2;
448 }
449 */
450 // are they equal?
451 if((result = comparer(prevLine, currLine)) == 0) {
452 return UCOL_IDENTICAL;
453 } else if(result > 0) {
454 //fprintf(stderr, "lines should be ordered!");
455 return UCOL_OFF;
456 } else if(trySwamps(prevLine, currLine, primSwamps[i], comparer)) {
457 return UCOL_PRIMARY;
458 } else if(trySwamps(prevLine, currLine, secSwamps[i], comparer)) {
459 return UCOL_SECONDARY;
460 } else if(trySwamped(prevLine, currLine, terSwamped[i], comparer)) {
461 // is there a tertiary difference
462 return UCOL_TERTIARY;
463 } else {
464 //fprintf(stderr, "Unknown strength!\n");
465 return UCOL_ON;
466 }
467 }
468
469 // This function tries to probe the set of lines
470 // (already sorted by qsort) and deduct the strengths
471 void
analyzeStrength(Line ** lines,int32_t size,CompareFn comparer)472 analyzeStrength(Line **lines, int32_t size, CompareFn comparer) {
473 int32_t i = 0;
474
475 for(i = 1; i < size; i++) {
476 Line **prevLine = lines+i-1;
477 Line **currLine = lines+i;
478 (*currLine)->strength = probeStrength(prevLine, currLine, comparer);
479 (*currLine)->sortedIndex = i;
480 (*currLine)->previous = *prevLine;
481 (*prevLine)->next = *currLine;
482
483 }
484
485 }
486
printStrength(UColAttributeValue strength,UFILE * file)487 void printStrength(UColAttributeValue strength, UFILE *file) {
488 u_fprintf(file, " ");
489 switch(strength) {
490 case UCOL_IDENTICAL:
491 u_fprintf(file, "=");
492 break;
493 case UCOL_TERTIARY:
494 //u_fprintf(file, "<3");
495 u_fprintf(file, "<<<");
496 break;
497 case UCOL_SECONDARY:
498 //u_fprintf(file, "<2");
499 u_fprintf(file, "<<");
500 break;
501 case UCOL_PRIMARY:
502 //u_fprintf(file, "<1");
503 u_fprintf(file, "<");
504 break;
505 case UCOL_OFF:
506 u_fprintf(file, ">?");
507 default:
508 u_fprintf(file, "?!");
509 break;
510 }
511 u_fprintf(file, " ");
512 }
513
printStrength(Line * line,UFILE * file)514 void printStrength(Line *line, UFILE *file) {
515 printStrength(line->strength, file);
516 }
517
printLine(Line * line,UFILE * file)518 void printLine(Line *line, UFILE *file) {
519 escapeALine(line, file);
520 if(line->isExpansion) {
521 u_fprintf(file, "/");
522 escapeExpansion(line, file);
523 }
524 }
525
printOrdering(Line ** lines,int32_t size,UFILE * file,UBool useLinks=false)526 void printOrdering(Line **lines, int32_t size, UFILE *file, UBool useLinks = false) {
527 int32_t i = 0;
528
529 //printLine(*lines);
530 //escapeALine(*lines); // Print first line
531
532 Line *line = NULL;
533 Line *previous = *lines;
534 if(previous->isReset) {
535 u_fprintf(file, "\n& ");
536 escapeALine(previous, file);
537 } else if(!previous->isRemoved) {
538 printLine(previous, file);
539 }
540 i = 1;
541 while(i < size && previous->next) {
542 if(useLinks) {
543 line = previous->next;
544 } else {
545 line = *(lines+i);
546 }
547 if(line->isReset) {
548 u_fprintf(file, "\n& ");
549 escapeALine(line, file);
550 } else if(!line->isRemoved) {
551 if(file == out) {
552 u_fprintf(file, "\n");
553 }
554 if(i > 0) {
555 printStrength(line, file);
556 }
557 printLine(line, file);
558 //escapeALine(line, file);
559 }
560 previous = line;
561 i++;
562 }
563 u_fprintf(file, "\n");
564 }
565
566
setIndexes(Line ** lines,int32_t size)567 void setIndexes(Line **lines, int32_t size) {
568 int32_t i = 0;
569 (*lines)->sortedIndex = 0;
570 for(i = 1; i < size; i++) {
571 Line *line = *(lines+i);
572 Line *prev = *(lines+i-1);
573 line->previous = prev;
574 prev->next = line;
575 line->sortedIndex = i;
576 }
577 }
578
579
580 // this seems to be a dead end
581 void
noteExpansion(Line ** gLines,Line * line,int32_t size,CompareFn comparer)582 noteExpansion(Line **gLines, Line *line, int32_t size, CompareFn comparer) {
583 UErrorCode status = U_ZERO_ERROR;
584
585 UnicodeString key(line->name, line->len);
586 //Line *toInsert = (Line *)gElements.get(key);
587 Line *toInsert = (Line *)gExpansions.get(key);
588 if(toInsert != NULL) {
589 toInsert->isExpansion = true;
590 u_strcpy(toInsert->expansionString, line->expansionString);
591 toInsert->expLen = line->expLen;
592 toInsert->previous->next = toInsert->next;
593 toInsert->next->previous = toInsert->previous;
594 gElements.remove(key);
595 } else {
596 toInsert = new Line(*line);
597 toInsert->isExpansion = true;
598 gElements.put(UnicodeString(toInsert->name, toInsert->len), toInsert, status);
599 }
600
601 int32_t i = 0;
602 Line testLine;
603 Line *l = &testLine;
604 for(i = 0; i < size; i++) {
605 u_strcpy(testLine.name, (*(gLines+i))->name);
606 u_strcat(testLine.name, line->expansionString);
607 testLine.len = (*(gLines+i))->len + line->expLen;
608 if(comparer(&l, &line) > 0) {
609 toInsert->previous = *(gLines+i-1);
610 toInsert->next = *(gLines+i);
611 toInsert->previous->next = toInsert;
612 toInsert->next->previous = toInsert;
613 break;
614 }
615 }
616 if(gVerbose) {
617 u_fprintf(log, "Adding expansion\n");
618 escapeALine(line, log);
619 u_fprintf(log, "/");
620 escapeExpansion(line, log);
621 u_fprintf(log, " ");
622 }
623 }
624
625 void
positionExpansions(Line ** gLines,int32_t size,CompareFn comparer)626 positionExpansions(Line **gLines, int32_t size, CompareFn comparer) {
627 int result = 0;
628 Line *line = NULL;
629 Line *toMove = NULL;
630 int32_t i = 0, j = 0;
631 Line **sortedExpansions = new Line*[gExpansions.count()];
632 int32_t sortedExpansionsSize = setArray(sortedExpansions, &gExpansions);
633 qsort(sortedExpansions, sortedExpansionsSize, sizeof(Line *), comparer);
634 // Make a list of things in the vincinity of expansion candidate
635 for(j = 0; j < sortedExpansionsSize; j++) {
636 line = *(sortedExpansions+j);
637 UnicodeString key(line->name, line->len);
638 toMove = (Line *)gElements.get(key);
639 int32_t i = 0;
640 Line testLine, prevTestLine;
641 Line *l = &testLine;
642 Line *prevL = &prevTestLine;
643 // This can be further optimized, since we now know that we have a
644 // sorted list of expansions, so current can start from toMove, since all
645 // the elements before it are already smaller. In the beginning it needs to
646 // be on gLines, though.
647 Line *current = *gLines;
648 while(current) {
649 if(current == toMove) {
650 // we are wading through a sorted list
651 // if we found ourselves, it means that we
652 // are already in a right place, so no moving
653 // is needed, but we need to make sure we have
654 // the right strength.
655 toMove->strength = probeStrength(&prevL, &toMove, comparer);
656 if(0) {
657 u_fprintf(log, "Positioned expansion without moving ");
658 printLine(toMove, log);
659 u_fprintf(log, " new ordering: \n");
660 printOrdering(gLines, size, log, true);
661 }
662 break;
663 } else {
664 u_strcpy(testLine.name, current->name);
665 if(!current->isExpansion) {
666 u_strcat(testLine.name, line->expansionString);
667 testLine.len = current->len + line->expLen;
668 } else {
669 testLine.len = current->len;
670 }
671 if(comparer(&l, &line) > 0) {
672 // remove from chain
673 if(toMove->next) {
674 toMove->next->strength = probeStrength(&(toMove->previous), &(toMove->next), comparer);
675 toMove->next->previous = toMove->previous;
676 }
677 if(toMove->previous) {
678 toMove->previous->next = toMove->next;
679 }
680
681 // insert
682 toMove->previous = current->previous;
683 toMove->next = current;
684
685 if(current->previous) {
686 current->previous->next = toMove;
687 }
688 current->previous = toMove;
689
690 toMove->strength = probeStrength(&prevL, &toMove, comparer);
691 toMove->next->strength = probeStrength(&toMove, &l, comparer);
692 if(0) {
693 u_fprintf(log, "Positioned expansion ");
694 printLine(toMove, log);
695 u_fprintf(log, " new ordering: \n");
696 printOrdering(gLines, size, log, true);
697 }
698 if(toMove->strength == UCOL_IDENTICAL) {
699 // check for craziness such as s = ss/s
700 // such line would consist of previous (or next) concatenated with the expansion value
701 // make a test
702 UChar fullString[256];
703 u_strcpy(fullString, toMove->previous->name);
704 u_strcat(fullString, toMove->expansionString);
705 if(u_strcmp(fullString, toMove->name) == 0) {
706 toMove->previous->next = toMove->next;
707 toMove->next->previous = toMove->previous;
708 toMove->isRemoved = true;
709 u_fprintf(log, "Removed: ");
710 printLine(toMove, log);
711 u_fprintf(log, "\n");
712 }
713 } else if(toMove->next->strength == UCOL_IDENTICAL) {
714 UChar fullString[256];
715 u_strcpy(fullString, toMove->next->name);
716 u_strcat(fullString, toMove->expansionString);
717 if(u_strcmp(fullString, toMove->name) == 0) {
718 toMove->next->strength = toMove->strength;
719 toMove->previous->next = toMove->next;
720 toMove->next->previous = toMove->previous;
721 toMove->isRemoved = true;
722 u_fprintf(log, "Removed because of back: ");
723 printLine(toMove, log);
724 u_fprintf(log, "\n");
725 }
726 }
727 break;
728 }
729 prevTestLine = testLine;
730 }
731 current = current->next;
732 }
733 }
734 delete[] sortedExpansions;
735 }
736
737
738 void
noteExpansion(Line * line)739 noteExpansion(Line *line) {
740 UErrorCode status = U_ZERO_ERROR;
741 UnicodeString key(line->name, line->len);
742 Line *el = (Line *)gElements.get(key);
743 if(el != NULL) {
744 el->isExpansion = true;
745 u_strcpy(el->expansionString, line->expansionString);
746 el->expLen = line->expLen;
747 } else {
748 Line *toInsert = new Line(*line);
749 toInsert->isExpansion = true;
750 gElements.put(UnicodeString(line->name, line->len), toInsert, status);
751 }
752
753 Line *el2 = (Line *)gExpansions.get(key);
754 el2->isExpansion = true;
755 u_strcpy(el2->expansionString, line->expansionString);
756 el2->expLen = line->expLen;
757
758 if(gDebug) {
759 u_fprintf(log, "Adding expansion\n");
760 printLine(line, log);
761 u_fprintf(log, "\n");
762 }
763 }
764
765 void
noteContraction(Line * line)766 noteContraction(Line *line) {
767 UErrorCode status = U_ZERO_ERROR;
768 Line *toInsert = new Line(*line);
769 toInsert->isContraction = true;
770 gElements.put(UnicodeString(line->name, line->len), toInsert, status);
771 if(gVerbose) {
772 u_fprintf(log, "Adding contraction\n");
773 escapeALine(line, log);
774 u_fprintf(log, " ");
775 }
776 }
777
778 void
noteElement(Line * line)779 noteElement(Line *line) {
780 UErrorCode status = U_ZERO_ERROR;
781 Line *toInsert = new Line(*line);
782 gElements.put(UnicodeString(line->name, line->len), toInsert, status);
783 if(0) { //if(gDebug)
784 escapeALine(line, log);
785 u_fprintf(log, " ");
786 }
787 }
788
789
790
791 // This function checks if a combination of characters has changed place with the
792 // adjacent elements. If so, these are most probably contractions.
793 // However, it still needs to be checked if these contractions are fake - the
794 // test is simple - if xy is suspected contraction, if we get that x/y is expansion, then
795 // xy is a fake contraction.
796 int32_t
analyzeContractions(Line ** lines,int32_t size,CompareFn comparer)797 analyzeContractions(Line** lines, int32_t size, CompareFn comparer) {
798 int32_t i = 0, j = 0;
799 int32_t outOfOrder = 0;
800 UColAttributeValue strength = UCOL_OFF;
801 UColAttributeValue currStrength = UCOL_OFF;
802 Line **prevLine = lines;
803 Line **currLine = NULL;
804 Line **backupLine = NULL;
805 UBool prevIsContraction = false, currIsContraction = false;
806 // Problem here is detecting a contraction that is at the very end of the sorted list
807 for(i = 1; i < size; i++) {
808 currLine = lines+i;
809 strength = probeStrength(prevLine, currLine, comparer);
810 if(strength == UCOL_OFF || strength != (*currLine)->strength) {
811 prevIsContraction = false;
812 currIsContraction = false;
813 if(!outOfOrder) {
814 if(gVerbose) {
815 u_fprintf(log, "Possible contractions: ");
816 }
817 }
818 // now we have two elements that are different. The question is,
819 // which one of them is the contraction - which one has moved.
820 // Could be the previous, but could also be the current.
821
822 outOfOrder++;
823
824 // First, lets check whether the previous has jumped back
825 j = i+1;
826 // skip all the nexts that have smaller strength, they don't have an effect
827 while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
828 j++;
829 }
830 // check if there are other elements of same or greater strength
831 while(j < size &&
832 (strength = probeStrength(prevLine, (backupLine = lines+j), comparer)) == UCOL_OFF) {
833 j++;
834 // if we skipped more than one, it might be in fact a contraction
835 prevIsContraction = true;
836 }
837 if(prevIsContraction) {
838 noteContraction(*prevLine);
839 j = i-2;
840 // add all the previous elements with smaller strength, since they also
841 // will jump over and are contractions
842 while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
843 strength = probeStrength(lines+j, currLine, comparer);
844 if(strength == UCOL_OFF) {
845 noteContraction(*(lines+j));
846 }
847 j--;
848 }
849 }
850
851 // now we check if the current element is jumping forward,
852 // the dance steps are analogous to above.
853 j = i - 2;
854 while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
855 j--;
856 }
857 while(j >= 0 &&
858 (strength = probeStrength((backupLine = lines+j), currLine, comparer)) == UCOL_OFF) {
859 j--;
860 currIsContraction = true;
861 }
862 if(currIsContraction) {
863 if(gVerbose) {
864 escapeALine(*currLine, log);
865 u_fprintf(log, " ");
866 }
867 j = i+1;
868 while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
869 strength = probeStrength(prevLine, lines+j, comparer);
870 if(strength == UCOL_OFF) {
871 noteContraction(*(lines+j));
872 }
873 j++;
874 }
875 }
876
877 // Not sure about either. List both and then check
878 if(!(prevIsContraction || currIsContraction)) {
879 noteContraction(*prevLine);
880 noteContraction(*currLine);
881 }
882 }
883 prevLine = currLine;
884 }
885 if(outOfOrder) {
886 if(gVerbose) {
887 u_fprintf(log, "\n");
888 }
889 }
890 return outOfOrder;
891 }
892
893 int32_t
detectContractions(Line ** gLines,Line * lines,int32_t size,CompareFn comparer)894 detectContractions(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
895 int32_t i = 0, j = 0;
896 int32_t noContractions = 0;
897 // Create and compare doubles:
898 Line *backupLines = new Line[size];
899 Line::copyArray(backupLines, lines, size);
900 // detect contractions
901
902 Line **gLinesBackup = NULL; //new Line*[size];
903
904 for(i = 0; i < size; i++) {
905 // preserve index and previous
906 Line::copyArray(lines, backupLines, size);
907 for(j = 0; j < size; j++) {
908 u_strcpy(lines[j].name, backupLines[i].name);
909 u_strcat(lines[j].name, backupLines[j].name);
910 lines[j].len = backupLines[i].len+backupLines[j].len;
911 }
912
913 if((noContractions += analyzeContractions(gLines, size, comparer)) && gDebug) {
914 if(gLinesBackup == NULL) {
915 gLinesBackup = new Line*[size];
916 }
917 // Show the sorted doubles, for debugging
918 setArray(gLinesBackup, lines, size);
919 qsort(gLinesBackup, size, sizeof(Line *), comparer);
920 //setIndexes(gLinesBackup, size);
921 analyzeStrength(gLinesBackup, size, comparer);
922 printOrdering(gLinesBackup, size, log);
923 }
924 if(!gQuiet) {
925 u_fprintf(log, ".");
926 }
927 }
928 if(!gQuiet) {
929 u_fprintf(log, "\n");
930 }
931 delete[] backupLines;
932 if(gLinesBackup) {
933 delete[] gLinesBackup;
934 }
935 return noContractions;
936 }
937
938 // gLines in this function is an array of sorted pointers.
939 // Contractions are already included.
940 int32_t
detectExpansions(Line ** gLines,int32_t size,CompareFn comparer)941 detectExpansions(Line **gLines, int32_t size, CompareFn comparer) {
942 UErrorCode status = U_ZERO_ERROR;
943 // detect expansions
944
945 UColAttributeValue startStrength = UCOL_OFF, endStrength = UCOL_OFF,
946 strength = UCOL_OFF, previousStrength = UCOL_OFF;
947 Line start, end, src;
948 Line *startP = &start, *endP = &end, *srcP = &src;
949 Line *current = NULL;
950 memset(startP, 0, sizeof(Line));
951 memset(endP, 0, sizeof(Line));
952 memset(srcP, 0, sizeof(Line));
953 int32_t srcLen;
954 int32_t i = 0, j = 0, k = 0;
955 for(i = 0; i < size; i++) {
956 u_strcpy(start.name, (*(gLines+i))->name);
957 u_strcpy(end.name, (*(gLines+i))->name);
958 srcLen = (*(gLines+i))->len;
959 u_strcpy(start.name+srcLen, (*(gLines))->name);
960 start.len = srcLen + (*(gLines))->len;
961 u_strcpy(end.name+srcLen, (*(gLines+size-1))->name);
962 end.len = srcLen + (*(gLines+size-1))->len;
963
964 for(k = 0; k < size; k++) { // k is index of a thing that is not doubled
965 current = *(gLines+k);
966 // see if we have moved to front
967 // has it moved to the very beginning
968 if((startStrength = probeStrength((gLines+k), &startP, comparer)) != UCOL_OFF) {
969 continue; // this one is in the front
970 }
971 // has it moved to the very end?
972 if((endStrength = probeStrength(&endP, (gLines+k), comparer)) != UCOL_OFF) {
973 continue; // this one is in the back
974 }
975 // Potential Expansion
976 if(gDebug) { //gVerbose
977 u_fprintf(log, "Possible expansion: ");
978 escapeALine(*(gLines+k), log);
979 u_fprintf(log, " ");
980 }
981 // Now we have to make sure that this is really an expansion
982 // First, we have to find it
983 u_strcpy(src.name, (*(gLines+i))->name);
984 for(j = 0; j < size; j++) {
985 u_strcpy(src.name+srcLen, (*(gLines+j))->name);
986 src.len = srcLen + (*(gLines+j))->len;
987 if((strength = probeStrength(&srcP, (gLines+k), comparer)) == UCOL_OFF) {
988 strength = probeStrength((gLines+k), &srcP, comparer);
989 // we found it *(gLines+j-1) is the element that is interesting
990 // since gLines+j-1 < gLines+k < gLines+j
991 if(gDebug) { //gVerbose
992 u_fprintf(log, "i = %i, k = %i, j = %i ", i, k, j);
993 escapeALine(*(gLines+i), log);
994 escapeALine(*(gLines+j-1), log);
995 printStrength(previousStrength, log);
996 escapeALine(current, log);
997 printStrength(strength, log);
998 escapeALine(*(gLines+i), log);
999 escapeALine(*(gLines+j), log);
1000 u_fprintf(log, "\n");
1001 }
1002 // check whether it is a contraction that is the same as an expansion
1003 // or a multi character that doesn't do anything
1004 current->addExpansionHit(i, j);
1005 current->isExpansion = true;
1006 current->expIndex = k;
1007 // cache expansion
1008 gExpansions.put(UnicodeString(current->name, current->len), current, status); //new Line(*current)
1009 break;
1010 }
1011 previousStrength = strength;
1012 }
1013 }
1014 if(!gQuiet) {
1015 u_fprintf(log, ".");
1016 }
1017 }
1018 if(!gQuiet) {
1019 u_fprintf(log, "\n");
1020 }
1021 // now we have identified possible expansions. We need to find out how do they expand.
1022 // Let's iterate over expansions cache - it's easier.
1023 const UHashElement *el = NULL;
1024 int32_t hashIndex = -1;
1025 Line *doubles = new Line[size*10];
1026 Line **sorter = new Line*[size*10];
1027 int32_t currSize = 0;
1028 int32_t newSize = 0;
1029 Line *prev = NULL;
1030 Line *next = NULL;
1031 Line *origin = NULL;
1032 int result = 0;
1033 // Make a list of things in the vincinity of expansion candidate
1034 // in expansionPrefixes and expansionAfter we have stored the
1035 // prefixes of stuff that caused the detection of an expansion
1036 // and a position where the expansion was.
1037 // For example (icu, de__PHONEBOOK), we had:
1038 // aE <<< \u00E4 < af
1039 // AD < \u00E4 <<< Ae
1040 // From that we will construct the following sequence:
1041 // AD < aE <<< \u00E4/ <<< Ae < af
1042 // then we will take the vincinity of \u00E4:
1043 // aE <<< \u00E4/ <<< Ae
1044 // then we will choose the smallest expansion to be the expansion
1045 // part: 'e'.
1046 // if there is equality, we choose the equal part:
1047 // (win32, de__PHONEBOOK):
1048 // AD < \u00E4/ = ae <<< aE <<< Ae
1049 // we choose 'e'.
1050
1051 while((el = gExpansions.nextElement(hashIndex)) != NULL) {
1052 newSize = 0;
1053 current = (Line *)el->value.pointer;
1054 currSize = size*current->expansionPrefixesSize;
1055 if(gDebug) {
1056 escapeALine(current, log);
1057 u_fprintf(log, " Number: %i\n", current->expansionPrefixesSize);
1058 }
1059 // construct the doubles
1060 for(i = 0; i < current->expansionPrefixesSize; i++) {
1061 doubles[newSize].suffix = current->expansionAfter[i]-1;
1062 doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]-1));
1063 doubles[newSize].suffix = current->expansionAfter[i];
1064 doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]));
1065 }
1066 // add the expansion we're observing
1067 doubles[newSize++] = *current;
1068 setArray(sorter, doubles, newSize);
1069 qsort(sorter, newSize, sizeof(Line*), comparer);
1070 analyzeStrength(sorter, newSize, comparer);
1071 if(gDebug) {
1072 printOrdering(sorter, newSize, log);
1073 }
1074 i = 0;
1075 while(**(sorter+i) != *current) {
1076 i++;
1077 }
1078 // find the two additions
1079 if((*(sorter+i))->strength == UCOL_IDENTICAL) {
1080 // if we ae id
1081 origin = *(gLines+((*(sorter+i-1))->suffix));
1082 u_strcpy(current->expansionString, origin->name);
1083 current->expLen = origin->len;
1084 } else if(i < newSize-1 && (*(sorter+i+1))->strength == UCOL_IDENTICAL) {
1085 origin = *(gLines+((*(sorter+i+1))->suffix));
1086 u_strcpy(current->expansionString, origin->name);
1087 current->expLen = origin->len;
1088 } else {
1089 if(i > 0) {
1090 prev = *(gLines+(*(sorter+i-1))->suffix);
1091 if(i < newSize-1) {
1092 next = *(gLines+(*(sorter+i+1))->suffix);
1093 result = comparer(&prev, &next);
1094 if(result <= 0) {
1095 u_strcpy(current->expansionString, prev->name);
1096 current->expLen = prev->len;
1097 } else {
1098 u_strcpy(current->expansionString, next->name);
1099 current->expLen = next->len;
1100 }
1101 }
1102 }
1103 if(0) { //if(gDebug)
1104 u_fprintf(log, "Expansion is: ");
1105 escapeALine(current, log);
1106 u_fprintf(log, "/");
1107 escapeExpansion(current, log);
1108 u_fprintf(log, "\n");
1109 }
1110 }
1111 noteExpansion(current);
1112 //noteExpansion(gLines, current, size, comparer);
1113 if(!gQuiet) {
1114 u_fprintf(log, ".");
1115 }
1116 }
1117 if(!gQuiet) {
1118 u_fprintf(log, "\n");
1119 }
1120 delete[] doubles;
1121 delete[] sorter;
1122 return gExpansions.count();
1123 }
1124
1125 UBool
isTailored(Line * line,UErrorCode & status)1126 isTailored(Line *line, UErrorCode &status) {
1127 UBool result = false;
1128 UCollationElements *tailoring = ucol_openElements(gCol, line->name, line->len, &status);
1129 UCollationElements *uca = ucol_openElements(gUCA, line->name, line->len, &status);
1130
1131 int32_t tailElement = UCOL_NULLORDER;
1132 int32_t ucaElement = UCOL_NULLORDER;
1133
1134 do {
1135 do {
1136 tailElement = ucol_next(tailoring, &status);
1137 } while(tailElement == 0);
1138 do {
1139 ucaElement = ucol_next(uca, &status);
1140 } while(ucaElement == 0);
1141 if(tailElement != ucaElement) {
1142 result = true;
1143 break;
1144 }
1145 } while (tailElement != UCOL_NULLORDER && ucaElement != UCOL_NULLORDER);
1146
1147 ucol_closeElements(tailoring);
1148 ucol_closeElements(uca);
1149 return result;
1150 }
1151
1152 void
reduceUntailored(Line ** gLines,int32_t size)1153 reduceUntailored(Line **gLines, int32_t size){
1154 UErrorCode status = U_ZERO_ERROR;
1155 Line *current = *(gLines);
1156 Line *previous = NULL;
1157 while(current) {
1158 // if the current line is not tailored according to the UCA
1159 if(!isTailored(current, status)) {
1160 // we remove it
1161 current->isRemoved = true;
1162 } else {
1163 // if it's tailored
1164 if(current->previous && current->previous->isRemoved == true) {
1165 previous = current->previous;
1166 while(previous && (previous->strength > current->strength || previous->isExpansion || previous->isContraction) && previous->isRemoved) {
1167 if(previous->previous && previous->previous->isRemoved) {
1168 previous = previous->previous;
1169 } else {
1170 break;
1171 }
1172 }
1173 if(previous) {
1174 previous->isReset = true;
1175 } else {
1176 (*(gLines))->isReset = true;
1177 }
1178 }
1179 }
1180 current = current->next;
1181 }
1182 }
1183
1184 void
constructAndAnalyze(Line ** gLines,Line * lines,int32_t size,CompareFn comparer)1185 constructAndAnalyze(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
1186 int32_t i = 0, j = 0, k = 0;
1187 // setup our compare arrays to point to single set.
1188
1189 // For contractions we need a block of data
1190 setArray(gLines, lines, size);
1191 //size = setArray(gLines);
1192
1193 qsort(gLines, size, sizeof(Line *), comparer);
1194
1195 // Establish who is previous according to the sort order
1196 //setIndexes(gLines, size);
1197
1198 analyzeStrength(gLines, size, comparer);
1199 if(gVerbose) {
1200 u_fprintf(log, "Ordering:\n");
1201 printOrdering(gLines, size, log);
1202 }
1203
1204 //showDifferences(exemplarSetSize);
1205 //dumpData(exemplarSetSize);
1206
1207 if(!gQuiet) {
1208 u_fprintf(log, "Detecting contractions?\n");
1209 }
1210 int32_t noContractions = 0;
1211 noContractions = detectContractions(gLines, lines, size, comparer);
1212 if(!gQuiet) {
1213 u_fprintf(log, "Detected %i contractions\n", noContractions);
1214 }
1215
1216 // now we have suspected contractions in the table
1217 // we have to re-sort the things
1218 size = setArray(gLines);
1219 qsort(gLines, size, sizeof(Line *), comparer);
1220 analyzeStrength(gLines, size, comparer);
1221
1222 if(!gQuiet) {
1223 u_fprintf(log, "Detecting expansions\n");
1224 }
1225 int32_t noExpansions = detectExpansions(gLines, size, comparer);
1226 if(!gQuiet) {
1227 u_fprintf(log, "Detected %i expansions\n", noExpansions);
1228 }
1229
1230 positionExpansions(gLines, size, comparer);
1231
1232 if(gVerbose) {
1233 u_fprintf(log, "After positioning expansions:\n");
1234 printOrdering(gLines, size, log, true);
1235 }
1236 //reduceUntailored(gLines, size);
1237 if(!gQuiet) {
1238 u_fprintf(out, "Final result\n");
1239 }
1240 printOrdering(gLines, size, out, true);
1241 printOrdering(gLines, size, log, true);
1242 }
1243
1244 // Check whether upper case comes before lower case or vice-versa
1245 int32_t
checkCaseOrdering(void)1246 checkCaseOrdering(void) {
1247 UChar stuff[][3] = {
1248 { 0x0061, separatorChar, 0x0061}, //"aa",
1249 { 0x0061, separatorChar, 0x0041 }, //"a\\u00E0",
1250 { 0x0041, separatorChar, 0x0061 }, //"\\u00E0a",
1251 { 0x0041, separatorChar, 0x0041 }, //"\\u00E0a",
1252 //{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0"
1253 };
1254 const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
1255
1256 Line **sortedLines = new Line*[size];
1257 Line lines[size];
1258
1259 int32_t i = 0;
1260 int32_t ordered = 0, reversed = 0;
1261
1262 for(i = 0; i < size; i++) {
1263 lines[i].setName(stuff[i], 3);
1264 }
1265 setArray(sortedLines, lines, size);
1266 qsort(sortedLines, size, sizeof(Line*), gComparer);
1267
1268 for(i = 0; i < size; i++) {
1269 if(*(sortedLines+i) == &lines[i]) {
1270 ordered++;
1271 }
1272 if(*(sortedLines+i) == &lines[size-i-1]) {
1273 reversed++;
1274 }
1275 }
1276
1277 delete[] sortedLines;
1278 if(ordered == size) {
1279 return 0; // in normal order
1280 } else if(reversed == size) {
1281 return 1; // in reversed order
1282 } else {
1283 return -1; // unknown order
1284 }
1285 }
1286
1287
1288 // Check whether the secondaries are in the straight or reversed order
1289 int32_t
checkSecondaryOrdering(void)1290 checkSecondaryOrdering(void) {
1291 UChar stuff[][5] = {
1292 { 0x0061, separatorChar, 0x0061, separatorChar, 0x00E0 }, //"aa",
1293 { 0x0061, separatorChar, 0x00E0, separatorChar, 0x0061 }, //"a\\u00E0",
1294 { 0x00E0, separatorChar, 0x0061, separatorChar, 0x0061 }, //"\\u00E0a",
1295 //{ 0x00E0, separatorChar, 0x00E0 } //"\\u00E0\\u00E0"
1296 };
1297 const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
1298
1299 Line **sortedLines = new Line*[size];
1300 Line lines[size];
1301
1302 int32_t i = 0;
1303 int32_t ordered = 0, reversed = 0;
1304
1305 for(i = 0; i < size; i++) {
1306 lines[i].setName(stuff[i], 5);
1307 }
1308 setArray(sortedLines, lines, size);
1309 qsort(sortedLines, size, sizeof(Line*), gComparer);
1310
1311 for(i = 0; i < size; i++) {
1312 if(*(sortedLines+i) == &lines[i]) {
1313 ordered++;
1314 }
1315 if(*(sortedLines+i) == &lines[size-i-1]) {
1316 reversed++;
1317 }
1318 }
1319
1320 delete[] sortedLines;
1321 if(ordered == size) {
1322 return 0; // in normal order
1323 } else if(reversed == size) {
1324 return 1; // in reversed order
1325 } else {
1326 return -1; // unknown order
1327 }
1328 }
1329
1330 // We have to remove ignorable characters from the exemplar set,
1331 // otherwise, we get messed up results
removeIgnorableChars(UnicodeSet & exemplarUSet,CompareFn comparer,UErrorCode & status)1332 void removeIgnorableChars(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
1333 UnicodeSet ignorables, primaryIgnorables;
1334 UnicodeSetIterator exemplarUSetIter(exemplarUSet);
1335 exemplarUSetIter.reset();
1336 Line empty;
1337 Line *emptyP = ∅
1338 Line current;
1339 Line *currLine = ¤t;
1340 UColAttributeValue strength = UCOL_OFF;
1341
1342
1343 while(exemplarUSetIter.next()) {
1344 if(exemplarUSetIter.isString()) { // process a string
1345 u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
1346 currLine->len = exemplarUSetIter.getString().length();
1347 strength = probeStrength(&emptyP, &currLine, comparer);
1348 if(strength == UCOL_IDENTICAL) {
1349 ignorables.add(exemplarUSetIter.getString());
1350 } else if(strength > UCOL_PRIMARY) {
1351 primaryIgnorables.add(exemplarUSetIter.getString());
1352 }
1353 } else { // process code point
1354 UBool isError = false;
1355 UChar32 codePoint = exemplarUSetIter.getCodepoint();
1356 currLine->len = 0;
1357 U16_APPEND(currLine->name, currLine->len, 25, codePoint, isError);
1358 strength = probeStrength(&emptyP, &currLine, comparer);
1359 if(strength == UCOL_IDENTICAL) {
1360 ignorables.add(codePoint);
1361 } else if(strength > UCOL_PRIMARY) {
1362 primaryIgnorables.add(codePoint);
1363 }
1364 }
1365 }
1366
1367
1368
1369 exemplarUSet.removeAll(ignorables);
1370 exemplarUSet.removeAll(primaryIgnorables);
1371
1372 UnicodeString removedPattern;
1373 if(ignorables.size()) {
1374 u_fprintf(log, "Ignorables:\n");
1375 ignorables.toPattern(removedPattern, true);
1376 removedPattern.setCharAt(removedPattern.length(), 0);
1377 escapeString(removedPattern.getBuffer(), removedPattern.length(), log);
1378 u_fprintf(log, "\n");
1379 }
1380 if(primaryIgnorables.size()) {
1381 u_fprintf(log, "Primary ignorables:\n");
1382 primaryIgnorables.toPattern(removedPattern, true);
1383 removedPattern.setCharAt(removedPattern.length(), 0);
1384 escapeString(removedPattern.getBuffer(), removedPattern.length(), log);
1385 u_fprintf(log, "\n");
1386 }
1387
1388 }
1389
1390 // TODO: develop logic for choosing boundary characters - right now it is hardcoded
1391 // It should be a function of used scripts. Also, check whether we need to save
1392 // used script names
addUtilityChars(UnicodeSet & exemplarUSet,UErrorCode & status)1393 void addUtilityChars(UnicodeSet &exemplarUSet, UErrorCode &status) {
1394
1395 // in order to get nice rules, we need to add some characters to the
1396 // starting set. These are mostly parts of compatibility composed characters,
1397 // such as L-middle dot (middle dot is 0x00B7). If we don't add these, we would
1398 // get a reset at a funky character, such as L-middle dot. This list will probably
1399 // grow.
1400 exemplarUSet.add(0x00B7);
1401
1402 // these things represent a script before the target script and
1403 // a script after. More logic should be added so that these characters are
1404 // chosen automatically
1405
1406 exemplarUSet.add(0x0038);
1407 exemplarUSet.add(0x0039);
1408
1409 //exemplarUSet.add(0x0433);
1410 //exemplarUSet.add(0x0436);
1411 exemplarUSet.add(0xfa29);
1412 exemplarUSet.add(0xfa28);
1413 }
1414
1415 void
getExemplars(const char * locale,UnicodeSet & exemplars,UErrorCode & status)1416 getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) {
1417 // first we fill out structures with exemplar characters.
1418 UResourceBundle *res = ures_open(NULL, locale, &status);
1419 int32_t exemplarLength = 0;
1420 UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status);
1421 exemplars.clear();
1422 exemplars.applyPattern(exemplarString, status);
1423 ures_close(res);
1424 }
1425
1426 void
prepareStartingSet(UnicodeSet & exemplarUSet,CompareFn comparer,UErrorCode & status)1427 prepareStartingSet(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
1428 int32_t i = 0;
1429 UnicodeString exemplarString;
1430 exemplarUSet.toPattern(exemplarString);
1431 // Produce case closure of exemplar characters
1432 // Then we want to figure out what is the script of the exemplar characters
1433 // just pick several and see their script
1434 const char* usedScriptNames[USCRIPT_CODE_LIMIT];
1435 int32_t numberOfUsedScripts = 0;
1436 char scriptSetPattern[256];
1437 UnicodeString pattern; // for debugging
1438 UChar32 exChar = -1;
1439 while(exemplarUSet.size() != 0 && (exChar = exemplarUSet.charAt(0)) != -1) {
1440 int32_t scriptNo = u_getIntPropertyValue(exChar, UCHAR_SCRIPT);
1441 usedScriptNames[numberOfUsedScripts] = u_getPropertyValueName(UCHAR_SCRIPT, scriptNo, U_SHORT_PROPERTY_NAME);
1442 sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[numberOfUsedScripts]);
1443 numberOfUsedScripts++;
1444 UnicodeSet scriptSet(UnicodeString(scriptSetPattern, ""), status);
1445 exemplarUSet.removeAll(scriptSet);
1446 exemplarUSet.toPattern(pattern, true);
1447 }
1448 exemplarUSet.clear();
1449
1450 // always add ASCII
1451 //exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0020-\\u007f]", ""), status));
1452 exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0041-\\u005b]", ""), status));
1453 if(gExemplar) {
1454 exemplarUSet.applyPattern(exemplarString, status);
1455 exemplarUSet.closeOver(USET_CASE);
1456 if(!gQuiet) {
1457 u_fprintf(out, "ICU exemplar characters:\n");
1458 escapeString(exemplarString.getBuffer(), exemplarString.length(), out);
1459 u_fprintf(out, "\n");
1460 }
1461 } else {
1462 if(!gQuiet) {
1463 u_fprintf(out, "Using scripts:\n");
1464 }
1465 // add interesting scripts
1466 for(i = 0; i < numberOfUsedScripts; i++) {
1467 sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[i]);
1468 exemplarUSet.addAll(UnicodeSet(UnicodeString(scriptSetPattern, ""), status));
1469 if(!gQuiet) {
1470 u_fprintf(out, "%s\n", scriptSetPattern);
1471 }
1472 }
1473 }
1474
1475
1476 removeIgnorableChars(exemplarUSet, comparer, status);
1477
1478 addUtilityChars(exemplarUSet, status);
1479
1480 /*
1481 // try to check whether tailored set and exemplar characters match.
1482 USet *tailored = ucol_getTailoredSet(gCol, &status);
1483 UBool tailoredContained = exemplarUSet.containsAll(*((UnicodeSet *)tailored));
1484 if(!tailoredContained) {
1485 ((UnicodeSet *)tailored)->removeAll(exemplarUSet);
1486 UnicodeString pattern;
1487 ((UnicodeSet *)tailored)->toPattern(pattern, true);
1488 }
1489 uset_close(tailored);
1490 */
1491
1492 //return exemplarUSet;
1493 }
1494
1495 void
setOutputFile(const char * name,UErrorCode & status)1496 setOutputFile(const char *name, UErrorCode &status) {
1497 int32_t i = 0;
1498 char filename[256];
1499 strcpy(filename, name);
1500 for(i = 0; i < gPlatformNo; i++) {
1501 strcat(filename, "_");
1502 strcat(filename, platforms[gPlatformIndexes[i]].name);
1503 }
1504 if(gExemplar) {
1505 strcat(filename, "_exemplar");
1506 } else {
1507 strcat(filename, "_script");
1508 }
1509 strcat(filename, ".utf16.txt");
1510 out = u_fopen(filename, "wb", "en", "utf-16");
1511 }
1512
1513 void
processCollator(UCollator * col,UErrorCode & status)1514 processCollator(UCollator *col, UErrorCode &status) {
1515 int32_t i = 0;
1516 gCol = col;
1517 UChar ruleString[16384];
1518 int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384);
1519 if(!gQuiet) {
1520 u_fprintf(out, "ICU rules:\n");
1521 printRules(ruleString, ruleStringLength, out);
1522 printRules(ruleString, ruleStringLength, log);
1523 //escapeString(ruleString, ruleStringLength, out);
1524 u_fprintf(out, "\n");
1525 }
1526 const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status);
1527 UnicodeSet exemplarUSet;
1528 if(locale) {
1529 getExemplars(locale, exemplarUSet, status);
1530 } else {
1531 exemplarUSet = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status));
1532 }
1533
1534
1535 for(i = 0; i < gPlatformNo; i++) {
1536 u_fprintf(out, "\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[i]].name);
1537 gComparer = platforms[gPlatformIndexes[i]].comparer;
1538
1539 prepareStartingSet(exemplarUSet, gComparer, status);
1540 int32_t itemLen = 0;
1541 // get the number of all the items from the set (both codepoints and strings)
1542 int32_t exemplarSetSize = exemplarUSet.size();
1543 UnicodeSetIterator exemplarUSetIter(exemplarUSet);
1544
1545 // allocate ICU lines
1546 gICULines = new Line*[exemplarSetSize*5];
1547 int32_t j = 0;
1548 int32_t linesCount = 0;
1549 Line *lines = new Line[exemplarSetSize];
1550
1551 int32_t reversedSecondary = checkSecondaryOrdering();
1552 if(reversedSecondary == 0) {
1553 u_fprintf(out, "Secondaries do not seem to be reversed\n");
1554 } else if(reversedSecondary == 1) {
1555 u_fprintf(out, "Secondaries are reversed\n");
1556 if(gComparer == ICUstrcmp) {
1557 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1558 }
1559 } else {
1560 u_fprintf(out, "Cannot conclude if secondaries are reversed\n");
1561 }
1562
1563 int32_t reversedCase = checkCaseOrdering();
1564 if(reversedCase == 0) {
1565 u_fprintf(out, "Case does not seem to be reversed\n");
1566 } else if(reversedCase == 1) {
1567 u_fprintf(out, "Case is reversed\n");
1568 if(gComparer == ICUstrcmp) {
1569 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_OFF, &status);
1570 }
1571 } else {
1572 u_fprintf(out, "Cannot conclude if case is reversed\n");
1573 }
1574
1575 exemplarUSetIter.reset();
1576 gElements.removeAll();
1577 gExpansions.removeAll();
1578 linesCount = 0;
1579
1580 while(exemplarUSetIter.next()) {
1581 Line *currLine = lines+linesCount;
1582 if(exemplarUSetIter.isString()) { // process a string
1583 u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
1584 currLine->len = exemplarUSetIter.getString().length();
1585 } else { // process code point
1586 UBool isError = false;
1587 currLine->len = 0;
1588 U16_APPEND(currLine->name, currLine->len, 25, exemplarUSetIter.getCodepoint(), isError);
1589 }
1590 currLine->name[currLine->len] = 0; // zero terminate, for our evil ways
1591 currLine->index = linesCount;
1592 linesCount++;
1593 noteElement(currLine);
1594 }
1595 constructAndAnalyze(gICULines, lines, exemplarSetSize, gComparer);
1596
1597 delete[] lines;
1598 }
1599
1600
1601 // cleanup globals
1602 delete[] gICULines;
1603 u_fflush(out);
1604 u_fclose(out);
1605 ucol_close(gCol);
1606 }
1607
1608 void
processLocale(const char * locale,UErrorCode & status)1609 processLocale(const char *locale, UErrorCode &status) {
1610 gWinLCID = uloc_getLCID(locale);
1611
1612 UCollator *col = ucol_open(locale, &status);
1613
1614 setOutputFile(locale, status);
1615
1616 u_fprintf(out, "Locale %s (LCID:%06X)\n", locale, gWinLCID);
1617
1618 processCollator(col, status);
1619 }
1620
1621 UBool
hasCollationElements(const char * locName)1622 hasCollationElements(const char *locName) {
1623
1624 UErrorCode status = U_ZERO_ERROR;
1625 UResourceBundle *ColEl = NULL;
1626
1627 UResourceBundle *loc = ures_open(NULL, locName, &status);;
1628
1629 if(U_SUCCESS(status)) {
1630 status = U_ZERO_ERROR;
1631 ColEl = ures_getByKey(loc, "CollationElements", ColEl, &status);
1632 if(status == U_ZERO_ERROR) { /* do the test - there are real elements */
1633 ures_close(ColEl);
1634 ures_close(loc);
1635 return true;
1636 }
1637 ures_close(ColEl);
1638 ures_close(loc);
1639 }
1640 return false;
1641 }
1642
1643 int
main(int argc,char * argv[])1644 main(int argc,
1645 char* argv[])
1646 {
1647 UErrorCode status = U_ZERO_ERROR;
1648 err = u_finit(stderr, "en", "latin-1");
1649 log = u_finit(stdout, "en", "latin-1");
1650
1651 /*
1652 USet *wsp = uprv_openRuleWhiteSpaceSet(&status);
1653 uset_add(wsp, 0x0041);
1654 uset_remove(wsp, 0x0041);
1655 UnicodeString pat;
1656 ((UnicodeSet *)wsp)->toPattern(pat, true);
1657 pat.setCharAt(pat.length(), 0);
1658 escapeString(pat.getBuffer(), pat.length(), log);
1659 u_fflush(log);
1660 */
1661
1662 UTransliterator *anyHex = utrans_open("[^\\u000a\\u0020-\\u007f] Any-Hex/Java", UTRANS_FORWARD, NULL, 0, NULL, &status);
1663 u_fsettransliterator(log, U_WRITE, anyHex, &status);
1664
1665 processArgs(argc, argv, status);
1666 int32_t i = 0;
1667
1668
1669 gElements.setValueDeleter(deleteLineElement);
1670
1671
1672 if(U_FAILURE(status) || gPlatformNo == 0) {
1673 return -1;
1674 }
1675
1676 gUCA = ucol_open("root", &status);
1677
1678 if(gRulesStdin) {
1679 char buffer[1024];
1680 UChar ruleBuffer[16384];
1681 UChar *rules = ruleBuffer;
1682 int32_t maxRuleLen = 16384;
1683 int32_t rLen = 0;
1684 while(gets(buffer)) {
1685 if(buffer[0] != '/' && buffer[1] != '/') {
1686 rLen = u_unescape(buffer, rules, maxRuleLen);
1687 rules += rLen;
1688 maxRuleLen -= rLen;
1689 }
1690 }
1691 UParseError parseError;
1692 //escapeString(ruleBuffer, rules-ruleBuffer, log);//
1693 u_fprintf(log, "%U\n", ruleBuffer);
1694
1695 UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status);
1696 if(U_SUCCESS(status)) {
1697 setOutputFile("stdinRules", status);
1698 processCollator(col, status);
1699 } else {
1700 u_fprintf(err, "Error %s\n", u_errorName(status));
1701 }
1702 } else {
1703
1704 if(gLocale) {
1705 processLocale(gLocale, status);
1706 } else if(gLocaleNo) {
1707 for(i = 0; i < gLocaleNo; i++) {
1708 processLocale(gLocales[i], status);
1709 }
1710 } else { // do the loop through all the locales
1711 int32_t noOfLoc = uloc_countAvailable();
1712 const char *locName = NULL;
1713 for(i = 0; i<noOfLoc; i++) {
1714 status = U_ZERO_ERROR;
1715 locName = uloc_getAvailable(i);
1716 if(hasCollationElements(locName)) {
1717 processLocale(locName, status);
1718 }
1719 }
1720 }
1721 }
1722
1723
1724 ucol_close(gUCA);
1725
1726 u_fflush(log);
1727 u_fclose(log);
1728 u_fflush(err);
1729 u_fclose(err);
1730
1731 return 0;
1732 }