• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2003, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File colprobe.cpp
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   03/18/2003  weiv        Creation.
17 *******************************************************************************
18 */
19 
20 #include "uoptions.h"
21 #include "unicode/ucol.h"
22 #include "unicode/ucoleitr.h"
23 #include "unicode/ures.h"
24 #include "unicode/uniset.h"
25 #include "unicode/usetiter.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uscript.h"
29 #include "uprops.h"
30 #include "hash.h"
31 #include "ucol_imp.h"
32 
33 #include "unicode/ustdio.h"
34 #include "unicode/utrans.h"
35 
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <io.h>
40 #include <fcntl.h>
41 
42 #include "colprobe.h"
43 
44 
45 #ifdef WIN32
46 #include <windows.h>
47 #else
48 //
49 //  Stubs for Windows API functions when building on UNIXes.
50 //
51 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)52 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
53 #include <sys/time.h>
timeGetTime()54 unsigned long timeGetTime() {
55     struct timeval t;
56     gettimeofday(&t, 0);
57     unsigned long val = t.tv_sec * 1000;  // Let it overflow.  Who cares.
58     val += t.tv_usec / 1000;
59     return val;
60 };
LCMapStringW(DWORD,DWORD,UChar *,int,UChar *,int)61 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;};
62 const int LCMAP_SORTKEY = 0;
63 #define MAKELCID(a,b) 0
64 const int SORT_DEFAULT = 0;
65 #endif
66 
67 #include "line.h"
68 
69 static UBool gVerbose = false;
70 static UBool gDebug = false;
71 static UBool gQuiet = false;
72 static UBool gExemplar = false;
73 
74 DWORD          gWinLCID;
75 int            gCount;
76 Line          **gICULines;
77 UCollator     *gCol;
78 UCollator     *gUCA;
79 Line          source;
80 Line          target;
81 Line          *gSource = &source;
82 Line          *gTarget = &target;
83 Hashtable     gElements(false);
84 Hashtable     gExpansions(false);
85 CompareFn gComparer;
86 
87 const UChar separatorChar = 0x0030;
88 
89 UFILE *out = NULL;
90 UFILE *err = NULL;
91 UFILE *log = NULL;
92 
93 const char *progName = "colprobe";
94 
95 const char *gLocale = NULL;
96 //char platform[256];
97 int32_t platformIndex = -1;
98 int32_t gPlatformNo = 0;
99 int32_t gPlatformIndexes[10];
100 int32_t gLocaleNo = 0;
101 const char* gLocales[100];
102 UBool gRulesStdin = false;
103 
104 enum {
105   HELP1,
106     HELP2,
107     VERBOSE,
108     QUIET,
109     VERSION,
110     ICUDATADIR,
111     COPYRIGHT,
112     LOCALE,
113     PLATFORM,
114     DEBUG,
115     EXEMPLAR,
116     RULESSTDIN
117 };
118 
119 UOption options[]={
120   /*0*/ UOPTION_HELP_H,
121   /*1*/ UOPTION_HELP_QUESTION_MARK,
122   /*2*/ UOPTION_VERBOSE,
123   /*3*/ UOPTION_QUIET,
124   /*4*/ UOPTION_VERSION,
125   /*5*/ UOPTION_ICUDATADIR,
126   /*6*/ UOPTION_COPYRIGHT,
127   /*7*/ UOPTION_DEF("locale", 'l', UOPT_REQUIRES_ARG),
128   /*8*/ UOPTION_DEF("platform", 'p', UOPT_REQUIRES_ARG),
129   /*9*/ UOPTION_DEF("debug", 'D', UOPT_NO_ARG),
130   /*10*/ UOPTION_DEF("exemplar", 'E', UOPT_NO_ARG),
131   /*11*/ UOPTION_DEF("rulesstdin", 'R', UOPT_NO_ARG)
132 };
133 
Winstrcmp(const void * a,const void * b)134 int Winstrcmp(const void *a, const void *b) {
135     gCount++;
136     int t;
137     t = CompareStringW(gWinLCID, 0,
138       (*(Line **)a)->name, (*(Line **)a)->len,
139       (*(Line **)b)->name, (*(Line **)b)->len);
140     return t-2;
141 }
142 
ICUstrcmp(const void * a,const void * b)143 int ICUstrcmp(const void *a, const void *b) {
144     gCount++;
145     UCollationResult t;
146     t = ucol_strcoll(gCol,
147       (*(Line **)a)->name, (*(Line **)a)->len,
148       (*(Line **)b)->name, (*(Line **)b)->len);
149     if (t == UCOL_LESS) return -1;
150     if (t == UCOL_GREATER) return +1;
151     return 0;
152 }
153 
154 struct {
155   const char* name;
156   CompareFn comparer;
157 } platforms[] = {
158   { "icu", ICUstrcmp },
159   { "win", Winstrcmp}
160 };
161 
162 
deleteLineElement(void * line)163 void deleteLineElement(void *line) {
164   delete((Line *)line);
165 }
166 
stringToLower(char * string)167 void stringToLower(char *string) {
168   uint32_t i = 0;
169   for(i = 0; i < strlen(string); i++) {
170     string[i] = tolower(string[i]);
171   }
172 }
173 
usage(const char * name)174 void usage(const char *name) {
175   u_fprintf(out, "Usage: %s --locale loc_name --platform platform\n", name);
176 }
177 
listKnownPlatforms()178 void listKnownPlatforms() {
179   int32_t i = 0;
180   u_fprintf(err, "Known platforms:\n");
181   for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
182     u_fprintf(err, "\t%s\n", platforms[i]);
183   }
184 }
185 
addPlatform(const char * platform)186 void addPlatform(const char *platform) {
187   int32_t i;
188   //stringToLower(platform);
189   int32_t oldPlatformNo = gPlatformNo;
190 
191   for(i = 0; i < sizeof(platforms)/sizeof(platforms[0]); i++) {
192     if(strcmp(platform, platforms[i].name) == 0) {
193       gPlatformIndexes[gPlatformNo++] = i;
194     }
195   }
196   if(gPlatformNo == oldPlatformNo) {
197     u_fprintf(err, "Unknown platform %s\n", platform);
198     listKnownPlatforms();
199   }
200 }
201 
processArgs(int argc,char * argv[],UErrorCode & status)202 void processArgs(int argc, char* argv[], UErrorCode &status)
203 {
204   int32_t i = 0;
205   U_MAIN_INIT_ARGS(argc, argv);
206 
207   argc = u_parseArgs(argc, argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
208 
209   if(argc < 0) {
210     u_fprintf(err, "Unknown option: %s\n", argv[-argc]);
211     usage(progName);
212     return;
213   }
214 
215   if(options[0].doesOccur || options[1].doesOccur) {
216     usage(progName);
217     return;
218   }
219   if(options[VERBOSE].doesOccur) {
220     gVerbose = true;
221   }
222   if(options[DEBUG].doesOccur) {
223     gDebug = true;
224     gVerbose = true;
225   }
226   if(options[EXEMPLAR].doesOccur) {
227     gExemplar = true;
228   }
229   if(options[QUIET].doesOccur) {
230     gQuiet = true;
231   }
232 /*
233   for(i = 8; i < 9; i++) {
234     if(!options[i].doesOccur) {
235       u_fprintf(err, "Option %s is required!\n", options[i].longName);
236       usage(progName);
237       status = U_ILLEGAL_ARGUMENT_ERROR;
238     }
239     if(options[i].value == NULL) {
240       u_fprintf(err, "Option %s needs an argument!\n", options[i].longName);
241       usage(progName);
242       status = U_ILLEGAL_ARGUMENT_ERROR;
243     }
244   }
245 */
246   // ASCII based options specified on the command line
247   // this is for testing purposes, will allow to load
248   // up ICU rules and then poke through them.
249   // In that case, we test only ICU and don't need
250   // a locale.
251   if(options[RULESSTDIN].doesOccur) {
252     gRulesStdin = true;
253     addPlatform("icu");
254     return;
255   }
256 
257   if(options[LOCALE].doesOccur) {
258     gLocale = options[LOCALE].value;
259   } else {
260     for(i = 1; i < argc; i++) {
261       gLocales[gLocaleNo++] = argv[i];
262     }
263   }
264   if(options[PLATFORM].doesOccur) {
265     //strcpy(platform, options[PLATFORM].value);
266     //addPlatform("icu");
267     addPlatform(options[PLATFORM].value);
268   } else { // there is a list of platforms
269     u_fprintf(err, "Option %s is required!\n", options[i].longName);
270     usage(progName);
271     status = U_ILLEGAL_ARGUMENT_ERROR;
272   }
273 
274   //
275   //  Set up a Windows LCID
276   //
277   gWinLCID = uloc_getLCID(gLocale);
278   /*
279   if (gLocale != 0) {
280       gWinLCID = MAKELCID(gLocale, SORT_DEFAULT);
281   }
282   else {
283       gWinLCID = uloc_getLCID(gLocale);
284   }
285   */
286 
287 }
288 
printRules(const UChar * name,int32_t len,UFILE * file)289 void printRules(const UChar *name, int32_t len, UFILE *file) {
290   // very rudimentary pretty rules print
291   int32_t i = 0;
292   UChar toPrint[16384];
293   int32_t toPrintIndex = 0;
294   for(i = 0; i < len; i++) {
295     if(name[i] == 0x0026) {
296       if(toPrintIndex) {
297         toPrint[toPrintIndex] = 0;
298         u_fprintf(file, "%U\n", toPrint);
299         toPrintIndex = 0;
300         toPrint[toPrintIndex++] = name[i];
301       } else {
302         toPrint[toPrintIndex++] = name[i];
303       }
304     } else {
305       toPrint[toPrintIndex++] = name[i];
306     }
307   }
308   if(toPrintIndex) {
309     toPrint[toPrintIndex] = 0;
310     u_fprintf(file, "%U\n", toPrint);
311     toPrintIndex = 0;
312   }
313 
314 
315 }
316 
escapeString(const UChar * name,int32_t len,UFILE * file)317 void escapeString(const UChar *name, int32_t len, UFILE *file) {
318   u_fprintf(file, "%U", name);
319 /*
320   int32_t j = 0;
321   for(j = 0; j < len; j++) {
322     if(name[j] >= 0x20 && name[j] < 0x80) {
323       u_fprintf(file, "%c", name[j]);
324     } else {
325       u_fprintf(file, "\\u%04X", name[j]);
326     }
327   }
328 */
329 }
escapeALine(Line * line,UFILE * file)330 void escapeALine(Line *line, UFILE *file) {
331   escapeString(line->name, line->len, file);
332 }
333 
escapeExpansion(Line * line,UFILE * file)334 void escapeExpansion(Line *line, UFILE *file) {
335   escapeString(line->expansionString, line->expLen, file);
336 }
337 
showNames(Line * line,UFILE * file)338 void showNames(Line *line, UFILE *file) {
339   UErrorCode status = U_ZERO_ERROR;
340   int32_t j = 0;
341   char charName[256];
342   for(j = 0; j < line->len; j++) {
343     u_charName(line->name[j], U_EXTENDED_CHAR_NAME, charName, 256, &status);
344     u_fprintf(file, "%s ", charName);
345   }
346 }
347 
setArray(Line ** array,Line * contents,int32_t size)348 void setArray(Line **array, Line *contents, int32_t size) {
349   int32_t i = 0;
350   for(i = 0; i < size; i++) {
351     array[i] = contents+i;
352   }
353 }
354 
355 // set an array from a Hashtable
356 int32_t
setArray(Line ** array,Hashtable * table=& gElements)357 setArray(Line **array, Hashtable *table = &gElements) {
358   int32_t size = table->count();
359   int32_t hashIndex = -1;
360   const UHashElement *hashElement = NULL;
361   int32_t count = 0;
362   while((hashElement = table->nextElement(hashIndex)) != NULL) {
363     array[count++] = (Line *)hashElement->value.pointer;
364   }
365   return size;
366 }
367 
trySwamped(Line ** smaller,Line ** greater,UChar chars[2],CompareFn comparer)368 UBool trySwamped(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
369   u_strcpy(gSource->name, (*smaller)->name);
370   gSource->name[(*smaller)->len] = separatorChar;
371   gSource->name[(*smaller)->len+1] = chars[0];
372   gSource->name[(*smaller)->len+2] = 0;
373   gSource->len = (*smaller)->len+2;
374 
375   u_strcpy(gTarget->name, (*greater)->name);
376   gTarget->name[(*greater)->len] = separatorChar;
377   gTarget->name[(*greater)->len+1] = chars[1];
378   gTarget->name[(*greater)->len+2] = 0;
379   gTarget->len = (*greater)->len+2;
380 
381   if(comparer(&gSource, &gTarget) > 0) {
382     return true;
383   } else {
384     return false;
385   }
386 }
387 
trySwamps(Line ** smaller,Line ** greater,UChar chars[2],CompareFn comparer)388 UBool trySwamps(Line **smaller, Line **greater, UChar chars[2], CompareFn comparer) {
389   gSource->name[0] = chars[0];
390   gSource->name[1] = separatorChar;
391   u_strcpy(gSource->name+2, (*smaller)->name);
392   gSource->len = (*smaller)->len+2;
393 
394   gTarget->name[0] = chars[1];
395   gTarget->name[1] = separatorChar;
396   u_strcpy(gTarget->name+2, (*greater)->name);
397   gTarget->len = (*greater)->len+2;
398 
399   if(comparer(&gSource, &gTarget) < 0) {
400     return true;
401   } else {
402     return false;
403   }
404 }
405 
406 UColAttributeValue
probeStrength(Line ** prevLine,Line ** currLine,CompareFn comparer)407 probeStrength(Line** prevLine, Line **currLine, CompareFn comparer) {
408   // Primary swamps secondary
409   // have pairs where [0] 2> [1]
410   UChar primSwamps[][2] = {
411     { 0x00E0, 0x0061 },
412     { 0x0450, 0x0435 },
413     { 0x31a3, 0x310d }
414   };
415   // Secondary swamps tertiary
416   // have pairs where [0] 3> [1]
417   UChar secSwamps[][2] = {
418     { 0x0053, 0x0073 },
419     { 0x0415, 0x0435 },
420     { 0x31b6, 0x310e }
421   };
422   // Secondary is swamped by primary
423   // have pairs where [0] 1> [1]
424   UChar secSwamped[][2] = {
425     { 0x0062, 0x0061 },
426     { 0x0436, 0x0454 },
427     { 0x310e, 0x310d }
428   };
429   // Tertiary is swamped by secondary
430   // have pairs where [0] 2> [1]
431   UChar terSwamped[][2] = {
432     { 0x00E0, 0x0061 },
433     { 0x0450, 0x0435 },
434     { 0x31a3, 0x310d }
435   };
436   int32_t i = 0;
437   // Tertiary swamps equal?
438   int result = 0;
439   // Choose the pair
440   i = 0;
441   /*
442   if((*prevLine)->name[0] > 0xFF && (*currLine)->name[0] > 0xFF) {
443     i = 0;
444   } else if((*prevLine)->name[0] < 0x0400 && (*currLine)->name[0] < 0x0400) {
445     i = 1;
446   } else {
447     i = 2;
448   }
449   */
450   // are they equal?
451   if((result = comparer(prevLine, currLine)) == 0) {
452     return UCOL_IDENTICAL;
453   } else if(result > 0) {
454     //fprintf(stderr, "lines should be ordered!");
455     return UCOL_OFF;
456   } else if(trySwamps(prevLine, currLine, primSwamps[i], comparer)) {
457     return UCOL_PRIMARY;
458   } else if(trySwamps(prevLine, currLine, secSwamps[i], comparer)) {
459     return UCOL_SECONDARY;
460   } else if(trySwamped(prevLine, currLine, terSwamped[i], comparer)) {
461     // is there a tertiary difference
462     return UCOL_TERTIARY;
463   } else {
464     //fprintf(stderr, "Unknown strength!\n");
465     return UCOL_ON;
466   }
467 }
468 
469 // This function tries to probe the set of lines
470 // (already sorted by qsort) and deduct the strengths
471 void
analyzeStrength(Line ** lines,int32_t size,CompareFn comparer)472 analyzeStrength(Line **lines, int32_t size, CompareFn comparer) {
473   int32_t i = 0;
474 
475   for(i = 1; i < size; i++) {
476     Line **prevLine = lines+i-1;
477     Line **currLine = lines+i;
478     (*currLine)->strength = probeStrength(prevLine, currLine, comparer);
479     (*currLine)->sortedIndex = i;
480     (*currLine)->previous = *prevLine;
481     (*prevLine)->next = *currLine;
482 
483   }
484 
485 }
486 
printStrength(UColAttributeValue strength,UFILE * file)487 void printStrength(UColAttributeValue strength, UFILE *file) {
488   u_fprintf(file, " ");
489   switch(strength) {
490   case UCOL_IDENTICAL:
491     u_fprintf(file, "=");
492     break;
493   case UCOL_TERTIARY:
494     //u_fprintf(file, "<3");
495     u_fprintf(file, "<<<");
496     break;
497   case UCOL_SECONDARY:
498     //u_fprintf(file, "<2");
499     u_fprintf(file, "<<");
500     break;
501   case UCOL_PRIMARY:
502     //u_fprintf(file, "<1");
503     u_fprintf(file, "<");
504     break;
505   case UCOL_OFF:
506     u_fprintf(file, ">?");
507   default:
508     u_fprintf(file, "?!");
509     break;
510   }
511   u_fprintf(file, " ");
512 }
513 
printStrength(Line * line,UFILE * file)514 void printStrength(Line *line, UFILE *file) {
515   printStrength(line->strength, file);
516 }
517 
printLine(Line * line,UFILE * file)518 void printLine(Line *line, UFILE *file) {
519   escapeALine(line, file);
520   if(line->isExpansion) {
521     u_fprintf(file, "/");
522     escapeExpansion(line, file);
523   }
524 }
525 
printOrdering(Line ** lines,int32_t size,UFILE * file,UBool useLinks=false)526 void printOrdering(Line **lines, int32_t size, UFILE *file, UBool useLinks = false) {
527   int32_t i = 0;
528 
529   //printLine(*lines);
530   //escapeALine(*lines); // Print first line
531 
532   Line *line = NULL;
533   Line *previous = *lines;
534   if(previous->isReset) {
535     u_fprintf(file, "\n& ");
536     escapeALine(previous, file);
537   } else if(!previous->isRemoved) {
538     printLine(previous, file);
539   }
540   i = 1;
541   while(i < size && previous->next) {
542     if(useLinks) {
543       line = previous->next;
544     } else {
545       line = *(lines+i);
546     }
547     if(line->isReset) {
548       u_fprintf(file, "\n& ");
549       escapeALine(line, file);
550     } else if(!line->isRemoved) {
551       if(file == out) {
552         u_fprintf(file, "\n");
553       }
554       if(i > 0) {
555         printStrength(line, file);
556       }
557       printLine(line, file);
558       //escapeALine(line, file);
559     }
560     previous = line;
561     i++;
562   }
563   u_fprintf(file, "\n");
564 }
565 
566 
setIndexes(Line ** lines,int32_t size)567 void setIndexes(Line **lines, int32_t size) {
568   int32_t i = 0;
569   (*lines)->sortedIndex = 0;
570   for(i = 1; i < size; i++) {
571     Line *line = *(lines+i);
572     Line *prev = *(lines+i-1);
573     line->previous = prev;
574     prev->next = line;
575     line->sortedIndex = i;
576   }
577 }
578 
579 
580 // this seems to be a dead end
581 void
noteExpansion(Line ** gLines,Line * line,int32_t size,CompareFn comparer)582 noteExpansion(Line **gLines, Line *line, int32_t size, CompareFn comparer) {
583   UErrorCode status = U_ZERO_ERROR;
584 
585   UnicodeString key(line->name, line->len);
586   //Line *toInsert = (Line *)gElements.get(key);
587   Line *toInsert = (Line *)gExpansions.get(key);
588   if(toInsert != NULL) {
589     toInsert->isExpansion = true;
590     u_strcpy(toInsert->expansionString, line->expansionString);
591     toInsert->expLen = line->expLen;
592     toInsert->previous->next = toInsert->next;
593     toInsert->next->previous = toInsert->previous;
594     gElements.remove(key);
595   } else {
596     toInsert = new Line(*line);
597     toInsert->isExpansion = true;
598     gElements.put(UnicodeString(toInsert->name, toInsert->len), toInsert, status);
599   }
600 
601   int32_t i = 0;
602   Line testLine;
603   Line *l = &testLine;
604   for(i = 0; i < size; i++) {
605     u_strcpy(testLine.name, (*(gLines+i))->name);
606     u_strcat(testLine.name, line->expansionString);
607     testLine.len = (*(gLines+i))->len + line->expLen;
608     if(comparer(&l, &line) > 0) {
609       toInsert->previous = *(gLines+i-1);
610       toInsert->next = *(gLines+i);
611       toInsert->previous->next = toInsert;
612       toInsert->next->previous = toInsert;
613       break;
614     }
615   }
616   if(gVerbose) {
617     u_fprintf(log, "Adding expansion\n");
618     escapeALine(line, log);
619     u_fprintf(log, "/");
620     escapeExpansion(line, log);
621     u_fprintf(log, " ");
622   }
623 }
624 
625 void
positionExpansions(Line ** gLines,int32_t size,CompareFn comparer)626 positionExpansions(Line **gLines, int32_t size, CompareFn comparer) {
627   int result = 0;
628   Line *line = NULL;
629   Line *toMove = NULL;
630   int32_t i = 0, j = 0;
631   Line **sortedExpansions = new Line*[gExpansions.count()];
632   int32_t sortedExpansionsSize = setArray(sortedExpansions, &gExpansions);
633   qsort(sortedExpansions, sortedExpansionsSize, sizeof(Line *), comparer);
634   // Make a list of things in the vincinity of expansion candidate
635   for(j = 0; j < sortedExpansionsSize; j++) {
636     line = *(sortedExpansions+j);
637     UnicodeString key(line->name, line->len);
638     toMove = (Line *)gElements.get(key);
639     int32_t i = 0;
640     Line testLine, prevTestLine;
641     Line *l = &testLine;
642     Line *prevL = &prevTestLine;
643     // This can be further optimized, since we now know that we have a
644     // sorted list of expansions, so current can start from toMove, since all
645     // the elements before it are already smaller. In the beginning it needs to
646     // be on gLines, though.
647     Line *current = *gLines;
648     while(current) {
649       if(current == toMove) {
650         // we are wading through a sorted list
651         // if we found ourselves, it means that we
652         // are already in a right place, so no moving
653         // is needed, but we need to make sure we have
654         // the right strength.
655         toMove->strength = probeStrength(&prevL, &toMove, comparer);
656         if(0) {
657           u_fprintf(log, "Positioned expansion without moving ");
658           printLine(toMove, log);
659           u_fprintf(log, " new ordering: \n");
660           printOrdering(gLines, size, log, true);
661         }
662         break;
663       } else {
664         u_strcpy(testLine.name, current->name);
665         if(!current->isExpansion) {
666           u_strcat(testLine.name, line->expansionString);
667           testLine.len = current->len + line->expLen;
668         } else {
669           testLine.len = current->len;
670         }
671         if(comparer(&l, &line) > 0) {
672           // remove from chain
673           if(toMove->next) {
674             toMove->next->strength = probeStrength(&(toMove->previous), &(toMove->next), comparer);
675             toMove->next->previous = toMove->previous;
676           }
677           if(toMove->previous) {
678             toMove->previous->next = toMove->next;
679           }
680 
681           // insert
682           toMove->previous = current->previous;
683           toMove->next = current;
684 
685           if(current->previous) {
686             current->previous->next = toMove;
687           }
688           current->previous = toMove;
689 
690           toMove->strength = probeStrength(&prevL, &toMove, comparer);
691           toMove->next->strength = probeStrength(&toMove, &l, comparer);
692           if(0) {
693             u_fprintf(log, "Positioned expansion ");
694             printLine(toMove, log);
695             u_fprintf(log, " new ordering: \n");
696             printOrdering(gLines, size, log, true);
697           }
698           if(toMove->strength == UCOL_IDENTICAL) {
699             // check for craziness such as s = ss/s
700             // such line would consist of previous (or next) concatenated with the expansion value
701             // make a test
702             UChar fullString[256];
703             u_strcpy(fullString, toMove->previous->name);
704             u_strcat(fullString, toMove->expansionString);
705             if(u_strcmp(fullString, toMove->name) == 0) {
706               toMove->previous->next = toMove->next;
707               toMove->next->previous = toMove->previous;
708               toMove->isRemoved = true;
709               u_fprintf(log, "Removed: ");
710               printLine(toMove, log);
711               u_fprintf(log, "\n");
712             }
713           } else if(toMove->next->strength == UCOL_IDENTICAL) {
714             UChar fullString[256];
715             u_strcpy(fullString, toMove->next->name);
716             u_strcat(fullString, toMove->expansionString);
717             if(u_strcmp(fullString, toMove->name) == 0) {
718               toMove->next->strength = toMove->strength;
719               toMove->previous->next = toMove->next;
720               toMove->next->previous = toMove->previous;
721               toMove->isRemoved = true;
722               u_fprintf(log, "Removed because of back: ");
723               printLine(toMove, log);
724               u_fprintf(log, "\n");
725             }
726           }
727           break;
728         }
729         prevTestLine = testLine;
730       }
731       current = current->next;
732     }
733   }
734   delete[] sortedExpansions;
735 }
736 
737 
738 void
noteExpansion(Line * line)739 noteExpansion(Line *line) {
740   UErrorCode status = U_ZERO_ERROR;
741   UnicodeString key(line->name, line->len);
742   Line *el = (Line *)gElements.get(key);
743   if(el != NULL) {
744     el->isExpansion = true;
745     u_strcpy(el->expansionString, line->expansionString);
746     el->expLen = line->expLen;
747   } else {
748     Line *toInsert = new Line(*line);
749     toInsert->isExpansion = true;
750     gElements.put(UnicodeString(line->name, line->len), toInsert, status);
751   }
752 
753   Line *el2 = (Line *)gExpansions.get(key);
754   el2->isExpansion = true;
755   u_strcpy(el2->expansionString, line->expansionString);
756   el2->expLen = line->expLen;
757 
758   if(gDebug) {
759     u_fprintf(log, "Adding expansion\n");
760     printLine(line, log);
761     u_fprintf(log, "\n");
762   }
763 }
764 
765 void
noteContraction(Line * line)766 noteContraction(Line *line) {
767   UErrorCode status = U_ZERO_ERROR;
768   Line *toInsert = new Line(*line);
769   toInsert->isContraction = true;
770   gElements.put(UnicodeString(line->name, line->len), toInsert, status);
771   if(gVerbose) {
772     u_fprintf(log, "Adding contraction\n");
773     escapeALine(line, log);
774     u_fprintf(log, " ");
775   }
776 }
777 
778 void
noteElement(Line * line)779 noteElement(Line *line) {
780   UErrorCode status = U_ZERO_ERROR;
781   Line *toInsert = new Line(*line);
782   gElements.put(UnicodeString(line->name, line->len), toInsert, status);
783   if(0) { //if(gDebug)
784     escapeALine(line, log);
785     u_fprintf(log, " ");
786   }
787 }
788 
789 
790 
791 // This function checks if a combination of characters has changed place with the
792 // adjacent elements. If so, these are most probably contractions.
793 // However, it still needs to be checked if these contractions are fake - the
794 // test is simple - if xy is suspected contraction, if we get that x/y is expansion, then
795 // xy is a fake contraction.
796 int32_t
analyzeContractions(Line ** lines,int32_t size,CompareFn comparer)797 analyzeContractions(Line** lines, int32_t size, CompareFn comparer) {
798   int32_t i = 0, j = 0;
799   int32_t outOfOrder = 0;
800   UColAttributeValue strength = UCOL_OFF;
801   UColAttributeValue currStrength = UCOL_OFF;
802   Line **prevLine = lines;
803   Line **currLine = NULL;
804   Line **backupLine = NULL;
805   UBool prevIsContraction = false, currIsContraction = false;
806   // Problem here is detecting a contraction that is at the very end of the sorted list
807   for(i = 1; i < size; i++) {
808     currLine = lines+i;
809     strength = probeStrength(prevLine, currLine, comparer);
810     if(strength == UCOL_OFF || strength != (*currLine)->strength) {
811       prevIsContraction = false;
812       currIsContraction = false;
813       if(!outOfOrder) {
814         if(gVerbose) {
815           u_fprintf(log, "Possible contractions: ");
816         }
817       }
818       // now we have two elements that are different. The question is,
819       // which one of them is the contraction - which one has moved.
820       // Could be the previous, but could also be the current.
821 
822       outOfOrder++;
823 
824       // First, lets check whether the previous has jumped back
825       j = i+1;
826       // skip all the nexts that have smaller strength, they don't have an effect
827       while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
828         j++;
829       }
830       // check if there are other elements of same or greater strength
831       while(j < size &&
832         (strength = probeStrength(prevLine, (backupLine = lines+j), comparer)) == UCOL_OFF) {
833         j++;
834         // if we skipped more than one, it might be in fact a contraction
835         prevIsContraction = true;
836       }
837       if(prevIsContraction) {
838         noteContraction(*prevLine);
839         j = i-2;
840         // add all the previous elements with smaller strength, since they also
841         // will jump over and are contractions
842         while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
843           strength = probeStrength(lines+j, currLine, comparer);
844           if(strength == UCOL_OFF) {
845             noteContraction(*(lines+j));
846           }
847           j--;
848         }
849       }
850 
851       // now we check if the current element is jumping forward,
852       // the dance steps are analogous to above.
853       j = i - 2;
854       while(j >= 0 && (*(lines+j+1))->strength > (*currLine)->strength) {
855         j--;
856       }
857       while(j >= 0 &&
858         (strength = probeStrength((backupLine = lines+j), currLine, comparer)) == UCOL_OFF) {
859         j--;
860         currIsContraction = true;
861       }
862       if(currIsContraction) {
863         if(gVerbose) {
864           escapeALine(*currLine, log);
865           u_fprintf(log, " ");
866         }
867         j = i+1;
868         while(j < size && (*(lines+j))->strength > (*currLine)->strength) {
869           strength = probeStrength(prevLine, lines+j, comparer);
870           if(strength == UCOL_OFF) {
871             noteContraction(*(lines+j));
872           }
873           j++;
874         }
875       }
876 
877       // Not sure about either. List both and then check
878       if(!(prevIsContraction || currIsContraction)) {
879         noteContraction(*prevLine);
880         noteContraction(*currLine);
881       }
882     }
883     prevLine = currLine;
884   }
885   if(outOfOrder) {
886     if(gVerbose) {
887       u_fprintf(log, "\n");
888     }
889   }
890   return outOfOrder;
891 }
892 
893 int32_t
detectContractions(Line ** gLines,Line * lines,int32_t size,CompareFn comparer)894 detectContractions(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
895   int32_t i = 0, j = 0;
896   int32_t noContractions = 0;
897   // Create and compare doubles:
898   Line *backupLines = new Line[size];
899   Line::copyArray(backupLines, lines, size);
900   // detect contractions
901 
902   Line **gLinesBackup = NULL; //new Line*[size];
903 
904   for(i = 0; i < size; i++) {
905     // preserve index and previous
906     Line::copyArray(lines, backupLines, size);
907     for(j = 0; j < size; j++) {
908       u_strcpy(lines[j].name, backupLines[i].name);
909       u_strcat(lines[j].name, backupLines[j].name);
910       lines[j].len = backupLines[i].len+backupLines[j].len;
911     }
912 
913     if((noContractions += analyzeContractions(gLines, size, comparer)) && gDebug) {
914       if(gLinesBackup == NULL) {
915         gLinesBackup = new Line*[size];
916       }
917       // Show the sorted doubles, for debugging
918       setArray(gLinesBackup, lines, size);
919       qsort(gLinesBackup, size, sizeof(Line *), comparer);
920       //setIndexes(gLinesBackup, size);
921       analyzeStrength(gLinesBackup, size, comparer);
922       printOrdering(gLinesBackup, size, log);
923     }
924     if(!gQuiet) {
925       u_fprintf(log, ".");
926     }
927   }
928   if(!gQuiet) {
929     u_fprintf(log, "\n");
930   }
931   delete[] backupLines;
932   if(gLinesBackup) {
933     delete[] gLinesBackup;
934   }
935   return noContractions;
936 }
937 
938 // gLines in this function is an array of sorted pointers.
939 // Contractions are already included.
940 int32_t
detectExpansions(Line ** gLines,int32_t size,CompareFn comparer)941 detectExpansions(Line **gLines, int32_t size, CompareFn comparer) {
942   UErrorCode status = U_ZERO_ERROR;
943   // detect expansions
944 
945   UColAttributeValue startStrength = UCOL_OFF, endStrength = UCOL_OFF,
946     strength = UCOL_OFF, previousStrength = UCOL_OFF;
947   Line start, end, src;
948   Line *startP = &start, *endP = &end, *srcP = &src;
949   Line *current = NULL;
950   memset(startP, 0, sizeof(Line));
951   memset(endP, 0, sizeof(Line));
952   memset(srcP, 0, sizeof(Line));
953   int32_t srcLen;
954   int32_t i = 0, j = 0, k = 0;
955   for(i = 0; i < size; i++) {
956     u_strcpy(start.name, (*(gLines+i))->name);
957     u_strcpy(end.name, (*(gLines+i))->name);
958     srcLen = (*(gLines+i))->len;
959     u_strcpy(start.name+srcLen, (*(gLines))->name);
960     start.len = srcLen + (*(gLines))->len;
961     u_strcpy(end.name+srcLen, (*(gLines+size-1))->name);
962     end.len = srcLen + (*(gLines+size-1))->len;
963 
964     for(k = 0; k < size; k++) { // k is index of a thing that is not doubled
965       current = *(gLines+k);
966       // see if we have moved to front
967       // has it moved to the very beginning
968       if((startStrength = probeStrength((gLines+k), &startP, comparer)) != UCOL_OFF) {
969         continue; // this one is in the front
970       }
971       // has it moved to the very end?
972       if((endStrength = probeStrength(&endP, (gLines+k), comparer)) != UCOL_OFF) {
973         continue; // this one is in the back
974       }
975       // Potential Expansion
976       if(gDebug) { //gVerbose
977         u_fprintf(log, "Possible expansion: ");
978         escapeALine(*(gLines+k), log);
979         u_fprintf(log, " ");
980       }
981       // Now we have to make sure that this is really an expansion
982       // First, we have to find it
983       u_strcpy(src.name, (*(gLines+i))->name);
984       for(j = 0; j < size; j++) {
985         u_strcpy(src.name+srcLen, (*(gLines+j))->name);
986         src.len = srcLen + (*(gLines+j))->len;
987         if((strength = probeStrength(&srcP, (gLines+k), comparer)) == UCOL_OFF) {
988           strength = probeStrength((gLines+k), &srcP, comparer);
989           // we found it *(gLines+j-1) is the element that is interesting
990           // since gLines+j-1 < gLines+k < gLines+j
991           if(gDebug) { //gVerbose
992             u_fprintf(log, "i = %i, k = %i, j = %i ", i, k, j);
993             escapeALine(*(gLines+i), log);
994             escapeALine(*(gLines+j-1), log);
995             printStrength(previousStrength, log);
996             escapeALine(current, log);
997             printStrength(strength, log);
998             escapeALine(*(gLines+i), log);
999             escapeALine(*(gLines+j), log);
1000             u_fprintf(log, "\n");
1001           }
1002           // check whether it is a contraction that is the same as an expansion
1003           // or a multi character that doesn't do anything
1004           current->addExpansionHit(i, j);
1005           current->isExpansion = true;
1006           current->expIndex = k;
1007           // cache expansion
1008           gExpansions.put(UnicodeString(current->name, current->len), current, status); //new Line(*current)
1009           break;
1010         }
1011         previousStrength = strength;
1012       }
1013     }
1014     if(!gQuiet) {
1015       u_fprintf(log, ".");
1016     }
1017   }
1018   if(!gQuiet) {
1019     u_fprintf(log, "\n");
1020   }
1021   // now we have identified possible expansions. We need to find out how do they expand.
1022   // Let's iterate over expansions cache - it's easier.
1023   const UHashElement *el = NULL;
1024   int32_t hashIndex = -1;
1025   Line *doubles = new Line[size*10];
1026   Line **sorter = new Line*[size*10];
1027   int32_t currSize = 0;
1028   int32_t newSize = 0;
1029   Line *prev = NULL;
1030   Line *next = NULL;
1031   Line *origin = NULL;
1032   int result = 0;
1033   // Make a list of things in the vincinity of expansion candidate
1034   // in expansionPrefixes and expansionAfter we have stored the
1035   // prefixes of stuff that caused the detection of an expansion
1036   // and a position where the expansion was.
1037   // For example (icu, de__PHONEBOOK), we had:
1038   // aE <<< \u00E4 < af
1039   // AD < \u00E4 <<< Ae
1040   // From that we will construct the following sequence:
1041   // AD < aE <<< \u00E4/ <<< Ae < af
1042   // then we will take the vincinity of \u00E4:
1043   // aE <<< \u00E4/ <<< Ae
1044   // then we will choose the smallest expansion to be the expansion
1045   // part: 'e'.
1046   // if there is equality, we choose the equal part:
1047   // (win32, de__PHONEBOOK):
1048   // AD < \u00E4/ = ae <<< aE <<< Ae
1049   // we choose 'e'.
1050 
1051   while((el = gExpansions.nextElement(hashIndex)) != NULL) {
1052     newSize = 0;
1053     current = (Line *)el->value.pointer;
1054     currSize = size*current->expansionPrefixesSize;
1055     if(gDebug) {
1056       escapeALine(current, log);
1057       u_fprintf(log, " Number: %i\n", current->expansionPrefixesSize);
1058     }
1059     // construct the doubles
1060     for(i = 0; i < current->expansionPrefixesSize; i++) {
1061       doubles[newSize].suffix = current->expansionAfter[i]-1;
1062       doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]-1));
1063       doubles[newSize].suffix = current->expansionAfter[i];
1064       doubles[newSize++].setToConcat(*(gLines+current->expansionPrefixes[i]), *(gLines+current->expansionAfter[i]));
1065     }
1066     // add the expansion we're observing
1067     doubles[newSize++] = *current;
1068     setArray(sorter, doubles, newSize);
1069     qsort(sorter, newSize, sizeof(Line*), comparer);
1070     analyzeStrength(sorter, newSize, comparer);
1071     if(gDebug) {
1072       printOrdering(sorter, newSize, log);
1073     }
1074     i = 0;
1075     while(**(sorter+i) != *current) {
1076       i++;
1077     }
1078     // find the two additions
1079     if((*(sorter+i))->strength == UCOL_IDENTICAL) {
1080       // if we ae id
1081       origin = *(gLines+((*(sorter+i-1))->suffix));
1082       u_strcpy(current->expansionString, origin->name);
1083       current->expLen = origin->len;
1084     } else if(i < newSize-1 && (*(sorter+i+1))->strength == UCOL_IDENTICAL) {
1085       origin = *(gLines+((*(sorter+i+1))->suffix));
1086       u_strcpy(current->expansionString, origin->name);
1087       current->expLen = origin->len;
1088     } else {
1089       if(i > 0) {
1090         prev = *(gLines+(*(sorter+i-1))->suffix);
1091         if(i < newSize-1) {
1092           next = *(gLines+(*(sorter+i+1))->suffix);
1093           result = comparer(&prev, &next);
1094           if(result <= 0) {
1095             u_strcpy(current->expansionString, prev->name);
1096             current->expLen = prev->len;
1097           } else {
1098             u_strcpy(current->expansionString, next->name);
1099             current->expLen = next->len;
1100           }
1101         }
1102       }
1103       if(0) { //if(gDebug)
1104         u_fprintf(log, "Expansion is: ");
1105         escapeALine(current, log);
1106         u_fprintf(log, "/");
1107         escapeExpansion(current, log);
1108         u_fprintf(log, "\n");
1109       }
1110     }
1111     noteExpansion(current);
1112     //noteExpansion(gLines, current, size, comparer);
1113     if(!gQuiet) {
1114       u_fprintf(log, ".");
1115     }
1116   }
1117   if(!gQuiet) {
1118     u_fprintf(log, "\n");
1119   }
1120   delete[] doubles;
1121   delete[] sorter;
1122   return gExpansions.count();
1123 }
1124 
1125 UBool
isTailored(Line * line,UErrorCode & status)1126 isTailored(Line *line, UErrorCode &status) {
1127   UBool result = false;
1128   UCollationElements *tailoring = ucol_openElements(gCol, line->name, line->len, &status);
1129   UCollationElements *uca = ucol_openElements(gUCA, line->name, line->len, &status);
1130 
1131   int32_t tailElement = UCOL_NULLORDER;
1132   int32_t ucaElement = UCOL_NULLORDER;
1133 
1134   do {
1135     do {
1136       tailElement = ucol_next(tailoring, &status);
1137     } while(tailElement == 0);
1138     do {
1139       ucaElement = ucol_next(uca, &status);
1140     } while(ucaElement == 0);
1141     if(tailElement != ucaElement) {
1142       result = true;
1143       break;
1144     }
1145   } while (tailElement != UCOL_NULLORDER && ucaElement != UCOL_NULLORDER);
1146 
1147   ucol_closeElements(tailoring);
1148   ucol_closeElements(uca);
1149   return result;
1150 }
1151 
1152 void
reduceUntailored(Line ** gLines,int32_t size)1153 reduceUntailored(Line **gLines, int32_t size){
1154   UErrorCode status = U_ZERO_ERROR;
1155   Line *current = *(gLines);
1156   Line *previous = NULL;
1157   while(current) {
1158     // if the current line is not tailored according to the UCA
1159     if(!isTailored(current, status)) {
1160       // we remove it
1161       current->isRemoved = true;
1162     } else {
1163       // if it's tailored
1164       if(current->previous && current->previous->isRemoved == true) {
1165         previous = current->previous;
1166         while(previous && (previous->strength > current->strength || previous->isExpansion || previous->isContraction) && previous->isRemoved) {
1167           if(previous->previous && previous->previous->isRemoved) {
1168             previous = previous->previous;
1169           } else {
1170             break;
1171           }
1172         }
1173         if(previous) {
1174           previous->isReset = true;
1175         } else {
1176           (*(gLines))->isReset = true;
1177         }
1178       }
1179     }
1180     current = current->next;
1181   }
1182 }
1183 
1184 void
constructAndAnalyze(Line ** gLines,Line * lines,int32_t size,CompareFn comparer)1185 constructAndAnalyze(Line **gLines, Line *lines, int32_t size, CompareFn comparer) {
1186   int32_t i = 0, j = 0, k = 0;
1187   // setup our compare arrays to point to single set.
1188 
1189   // For contractions we need a block of data
1190   setArray(gLines, lines, size);
1191   //size = setArray(gLines);
1192 
1193   qsort(gLines, size, sizeof(Line *), comparer);
1194 
1195   // Establish who is previous according to the sort order
1196   //setIndexes(gLines, size);
1197 
1198   analyzeStrength(gLines, size, comparer);
1199   if(gVerbose) {
1200     u_fprintf(log, "Ordering:\n");
1201     printOrdering(gLines, size, log);
1202   }
1203 
1204   //showDifferences(exemplarSetSize);
1205   //dumpData(exemplarSetSize);
1206 
1207   if(!gQuiet) {
1208     u_fprintf(log, "Detecting contractions?\n");
1209   }
1210   int32_t noContractions = 0;
1211   noContractions = detectContractions(gLines, lines, size, comparer);
1212   if(!gQuiet) {
1213     u_fprintf(log, "Detected %i contractions\n", noContractions);
1214   }
1215 
1216   // now we have suspected contractions in the table
1217   // we have to re-sort the things
1218   size = setArray(gLines);
1219   qsort(gLines, size, sizeof(Line *), comparer);
1220   analyzeStrength(gLines, size, comparer);
1221 
1222   if(!gQuiet) {
1223     u_fprintf(log, "Detecting expansions\n");
1224   }
1225   int32_t noExpansions = detectExpansions(gLines, size, comparer);
1226   if(!gQuiet) {
1227     u_fprintf(log, "Detected %i expansions\n", noExpansions);
1228   }
1229 
1230   positionExpansions(gLines, size, comparer);
1231 
1232   if(gVerbose) {
1233     u_fprintf(log, "After positioning expansions:\n");
1234     printOrdering(gLines, size, log, true);
1235   }
1236   //reduceUntailored(gLines, size);
1237   if(!gQuiet) {
1238     u_fprintf(out, "Final result\n");
1239   }
1240   printOrdering(gLines, size, out, true);
1241   printOrdering(gLines, size, log, true);
1242 }
1243 
1244 // Check whether upper case comes before lower case or vice-versa
1245 int32_t
checkCaseOrdering(void)1246 checkCaseOrdering(void) {
1247   UChar stuff[][3] = {
1248     { 0x0061, separatorChar, 0x0061}, //"aa",
1249     { 0x0061, separatorChar, 0x0041 }, //"a\\u00E0",
1250     { 0x0041, separatorChar, 0x0061 }, //"\\u00E0a",
1251     { 0x0041, separatorChar, 0x0041 }, //"\\u00E0a",
1252     //{ 0x00E0, separatorChar, 0x00E0 }  //"\\u00E0\\u00E0"
1253   };
1254   const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
1255 
1256   Line **sortedLines = new Line*[size];
1257   Line lines[size];
1258 
1259   int32_t i = 0;
1260   int32_t ordered = 0, reversed = 0;
1261 
1262   for(i = 0; i < size; i++) {
1263     lines[i].setName(stuff[i], 3);
1264   }
1265   setArray(sortedLines, lines, size);
1266   qsort(sortedLines, size, sizeof(Line*), gComparer);
1267 
1268   for(i = 0; i < size; i++) {
1269     if(*(sortedLines+i) == &lines[i]) {
1270       ordered++;
1271     }
1272     if(*(sortedLines+i) == &lines[size-i-1]) {
1273       reversed++;
1274     }
1275   }
1276 
1277   delete[] sortedLines;
1278   if(ordered == size) {
1279     return 0; // in normal order
1280   } else if(reversed == size) {
1281     return 1; // in reversed order
1282   } else {
1283     return -1; // unknown order
1284   }
1285 }
1286 
1287 
1288 // Check whether the secondaries are in the straight or reversed order
1289 int32_t
checkSecondaryOrdering(void)1290 checkSecondaryOrdering(void) {
1291   UChar stuff[][5] = {
1292     { 0x0061, separatorChar, 0x0061, separatorChar, 0x00E0 }, //"aa",
1293     { 0x0061, separatorChar, 0x00E0, separatorChar, 0x0061 }, //"a\\u00E0",
1294     { 0x00E0, separatorChar, 0x0061, separatorChar, 0x0061 }, //"\\u00E0a",
1295     //{ 0x00E0, separatorChar, 0x00E0 }  //"\\u00E0\\u00E0"
1296   };
1297   const int32_t size = sizeof(stuff)/sizeof(stuff[0]);
1298 
1299   Line **sortedLines = new Line*[size];
1300   Line lines[size];
1301 
1302   int32_t i = 0;
1303   int32_t ordered = 0, reversed = 0;
1304 
1305   for(i = 0; i < size; i++) {
1306     lines[i].setName(stuff[i], 5);
1307   }
1308   setArray(sortedLines, lines, size);
1309   qsort(sortedLines, size, sizeof(Line*), gComparer);
1310 
1311   for(i = 0; i < size; i++) {
1312     if(*(sortedLines+i) == &lines[i]) {
1313       ordered++;
1314     }
1315     if(*(sortedLines+i) == &lines[size-i-1]) {
1316       reversed++;
1317     }
1318   }
1319 
1320   delete[] sortedLines;
1321   if(ordered == size) {
1322     return 0; // in normal order
1323   } else if(reversed == size) {
1324     return 1; // in reversed order
1325   } else {
1326     return -1; // unknown order
1327   }
1328 }
1329 
1330 // We have to remove ignorable characters from the exemplar set,
1331 // otherwise, we get messed up results
removeIgnorableChars(UnicodeSet & exemplarUSet,CompareFn comparer,UErrorCode & status)1332 void removeIgnorableChars(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
1333   UnicodeSet ignorables, primaryIgnorables;
1334   UnicodeSetIterator exemplarUSetIter(exemplarUSet);
1335   exemplarUSetIter.reset();
1336   Line empty;
1337   Line *emptyP = &empty;
1338   Line current;
1339   Line *currLine = &current;
1340   UColAttributeValue strength = UCOL_OFF;
1341 
1342 
1343   while(exemplarUSetIter.next()) {
1344     if(exemplarUSetIter.isString()) { // process a string
1345       u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
1346       currLine->len = exemplarUSetIter.getString().length();
1347       strength = probeStrength(&emptyP, &currLine, comparer);
1348       if(strength == UCOL_IDENTICAL) {
1349         ignorables.add(exemplarUSetIter.getString());
1350       } else if(strength > UCOL_PRIMARY) {
1351         primaryIgnorables.add(exemplarUSetIter.getString());
1352       }
1353     } else { // process code point
1354       UBool isError = false;
1355       UChar32 codePoint = exemplarUSetIter.getCodepoint();
1356       currLine->len = 0;
1357       U16_APPEND(currLine->name, currLine->len, 25, codePoint, isError);
1358       strength = probeStrength(&emptyP, &currLine, comparer);
1359       if(strength == UCOL_IDENTICAL) {
1360         ignorables.add(codePoint);
1361       } else if(strength > UCOL_PRIMARY) {
1362         primaryIgnorables.add(codePoint);
1363       }
1364     }
1365   }
1366 
1367 
1368 
1369   exemplarUSet.removeAll(ignorables);
1370   exemplarUSet.removeAll(primaryIgnorables);
1371 
1372   UnicodeString removedPattern;
1373   if(ignorables.size()) {
1374     u_fprintf(log, "Ignorables:\n");
1375     ignorables.toPattern(removedPattern, true);
1376     removedPattern.setCharAt(removedPattern.length(), 0);
1377     escapeString(removedPattern.getBuffer(), removedPattern.length(), log);
1378     u_fprintf(log, "\n");
1379   }
1380   if(primaryIgnorables.size()) {
1381     u_fprintf(log, "Primary ignorables:\n");
1382     primaryIgnorables.toPattern(removedPattern, true);
1383     removedPattern.setCharAt(removedPattern.length(), 0);
1384     escapeString(removedPattern.getBuffer(), removedPattern.length(), log);
1385     u_fprintf(log, "\n");
1386   }
1387 
1388 }
1389 
1390 // TODO: develop logic for choosing boundary characters - right now it is hardcoded
1391 // It should be a function of used scripts. Also, check whether we need to save
1392 // used script names
addUtilityChars(UnicodeSet & exemplarUSet,UErrorCode & status)1393 void addUtilityChars(UnicodeSet &exemplarUSet, UErrorCode &status) {
1394 
1395   // in order to get nice rules, we need to add some characters to the
1396   // starting set. These are mostly parts of compatibility composed characters,
1397   // such as L-middle dot (middle dot is 0x00B7). If we don't add these, we would
1398   // get a reset at a funky character, such as L-middle dot. This list will probably
1399   // grow.
1400   exemplarUSet.add(0x00B7);
1401 
1402   // these things represent a script before the target script and
1403   // a script after. More logic should be added so that these characters are
1404   // chosen automatically
1405 
1406   exemplarUSet.add(0x0038);
1407   exemplarUSet.add(0x0039);
1408 
1409   //exemplarUSet.add(0x0433);
1410   //exemplarUSet.add(0x0436);
1411   exemplarUSet.add(0xfa29);
1412   exemplarUSet.add(0xfa28);
1413 }
1414 
1415 void
getExemplars(const char * locale,UnicodeSet & exemplars,UErrorCode & status)1416 getExemplars(const char *locale, UnicodeSet &exemplars, UErrorCode &status) {
1417   // first we fill out structures with exemplar characters.
1418   UResourceBundle *res = ures_open(NULL, locale, &status);
1419   int32_t exemplarLength = 0;
1420   UnicodeString exemplarString = ures_getUnicodeStringByKey(res, "ExemplarCharacters", &status);
1421   exemplars.clear();
1422   exemplars.applyPattern(exemplarString, status);
1423   ures_close(res);
1424 }
1425 
1426 void
prepareStartingSet(UnicodeSet & exemplarUSet,CompareFn comparer,UErrorCode & status)1427 prepareStartingSet(UnicodeSet &exemplarUSet, CompareFn comparer, UErrorCode &status) {
1428   int32_t i = 0;
1429   UnicodeString exemplarString;
1430   exemplarUSet.toPattern(exemplarString);
1431   // Produce case closure of exemplar characters
1432   // Then we want to figure out what is the script of the exemplar characters
1433   // just pick several and see their script
1434   const char* usedScriptNames[USCRIPT_CODE_LIMIT];
1435   int32_t numberOfUsedScripts = 0;
1436   char scriptSetPattern[256];
1437   UnicodeString pattern; // for debugging
1438   UChar32 exChar = -1;
1439   while(exemplarUSet.size() != 0 && (exChar = exemplarUSet.charAt(0)) != -1) {
1440     int32_t scriptNo = u_getIntPropertyValue(exChar, UCHAR_SCRIPT);
1441     usedScriptNames[numberOfUsedScripts] = u_getPropertyValueName(UCHAR_SCRIPT, scriptNo, U_SHORT_PROPERTY_NAME);
1442     sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[numberOfUsedScripts]);
1443     numberOfUsedScripts++;
1444     UnicodeSet scriptSet(UnicodeString(scriptSetPattern, ""), status);
1445     exemplarUSet.removeAll(scriptSet);
1446     exemplarUSet.toPattern(pattern, true);
1447   }
1448   exemplarUSet.clear();
1449 
1450   // always add ASCII
1451   //exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0020-\\u007f]", ""), status));
1452   exemplarUSet.addAll(UnicodeSet(UnicodeString("[\\u0041-\\u005b]", ""), status));
1453   if(gExemplar) {
1454     exemplarUSet.applyPattern(exemplarString, status);
1455     exemplarUSet.closeOver(USET_CASE);
1456     if(!gQuiet) {
1457       u_fprintf(out, "ICU exemplar characters:\n");
1458       escapeString(exemplarString.getBuffer(), exemplarString.length(), out);
1459       u_fprintf(out, "\n");
1460     }
1461   } else {
1462     if(!gQuiet) {
1463       u_fprintf(out, "Using scripts:\n");
1464     }
1465     // add interesting scripts
1466     for(i = 0; i < numberOfUsedScripts; i++) {
1467       sprintf(scriptSetPattern, "[:%s:]", usedScriptNames[i]);
1468       exemplarUSet.addAll(UnicodeSet(UnicodeString(scriptSetPattern, ""), status));
1469       if(!gQuiet) {
1470         u_fprintf(out, "%s\n", scriptSetPattern);
1471       }
1472     }
1473   }
1474 
1475 
1476   removeIgnorableChars(exemplarUSet, comparer, status);
1477 
1478   addUtilityChars(exemplarUSet, status);
1479 
1480 /*
1481   // try to check whether tailored set and exemplar characters match.
1482   USet *tailored = ucol_getTailoredSet(gCol, &status);
1483   UBool tailoredContained = exemplarUSet.containsAll(*((UnicodeSet *)tailored));
1484   if(!tailoredContained) {
1485     ((UnicodeSet *)tailored)->removeAll(exemplarUSet);
1486     UnicodeString pattern;
1487     ((UnicodeSet *)tailored)->toPattern(pattern, true);
1488   }
1489   uset_close(tailored);
1490 */
1491 
1492   //return exemplarUSet;
1493 }
1494 
1495 void
setOutputFile(const char * name,UErrorCode & status)1496 setOutputFile(const char *name, UErrorCode &status) {
1497   int32_t i = 0;
1498   char filename[256];
1499   strcpy(filename, name);
1500   for(i = 0; i < gPlatformNo; i++) {
1501     strcat(filename, "_");
1502     strcat(filename, platforms[gPlatformIndexes[i]].name);
1503   }
1504   if(gExemplar) {
1505     strcat(filename, "_exemplar");
1506   } else {
1507     strcat(filename, "_script");
1508   }
1509   strcat(filename, ".utf16.txt");
1510   out = u_fopen(filename, "wb", "en", "utf-16");
1511 }
1512 
1513 void
processCollator(UCollator * col,UErrorCode & status)1514 processCollator(UCollator *col, UErrorCode &status) {
1515   int32_t i = 0;
1516   gCol = col;
1517   UChar ruleString[16384];
1518   int32_t ruleStringLength = ucol_getRulesEx(gCol, UCOL_TAILORING_ONLY, ruleString, 16384);
1519   if(!gQuiet) {
1520     u_fprintf(out, "ICU rules:\n");
1521     printRules(ruleString, ruleStringLength, out);
1522     printRules(ruleString, ruleStringLength, log);
1523     //escapeString(ruleString, ruleStringLength, out);
1524     u_fprintf(out, "\n");
1525   }
1526   const char *locale = ucol_getLocale(gCol, ULOC_REQUESTED_LOCALE, &status);
1527   UnicodeSet exemplarUSet;
1528   if(locale) {
1529     getExemplars(locale, exemplarUSet, status);
1530   } else {
1531     exemplarUSet = *((UnicodeSet *)ucol_getTailoredSet(gCol, &status));
1532   }
1533 
1534 
1535   for(i = 0; i < gPlatformNo; i++) {
1536     u_fprintf(out, "\nGenerating order for platform: %s\n", platforms[gPlatformIndexes[i]].name);
1537     gComparer = platforms[gPlatformIndexes[i]].comparer;
1538 
1539     prepareStartingSet(exemplarUSet, gComparer, status);
1540     int32_t itemLen = 0;
1541     // get the number of all the items from the set (both codepoints and strings)
1542     int32_t exemplarSetSize = exemplarUSet.size();
1543     UnicodeSetIterator exemplarUSetIter(exemplarUSet);
1544 
1545     // allocate ICU lines
1546     gICULines = new Line*[exemplarSetSize*5];
1547     int32_t j = 0;
1548     int32_t linesCount = 0;
1549     Line *lines = new Line[exemplarSetSize];
1550 
1551     int32_t reversedSecondary = checkSecondaryOrdering();
1552     if(reversedSecondary == 0) {
1553       u_fprintf(out, "Secondaries do not seem to be reversed\n");
1554     } else if(reversedSecondary == 1) {
1555       u_fprintf(out, "Secondaries are reversed\n");
1556       if(gComparer == ICUstrcmp) {
1557         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1558       }
1559     } else {
1560       u_fprintf(out, "Cannot conclude if secondaries are reversed\n");
1561     }
1562 
1563     int32_t reversedCase = checkCaseOrdering();
1564     if(reversedCase == 0) {
1565       u_fprintf(out, "Case does not seem to be reversed\n");
1566     } else if(reversedCase == 1) {
1567       u_fprintf(out, "Case is reversed\n");
1568       if(gComparer == ICUstrcmp) {
1569         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_OFF, &status);
1570       }
1571     } else {
1572       u_fprintf(out, "Cannot conclude if case is reversed\n");
1573     }
1574 
1575     exemplarUSetIter.reset();
1576     gElements.removeAll();
1577     gExpansions.removeAll();
1578     linesCount = 0;
1579 
1580     while(exemplarUSetIter.next()) {
1581       Line *currLine = lines+linesCount;
1582       if(exemplarUSetIter.isString()) { // process a string
1583         u_memcpy(currLine->name, exemplarUSetIter.getString().getBuffer(), exemplarUSetIter.getString().length());
1584         currLine->len = exemplarUSetIter.getString().length();
1585       } else { // process code point
1586         UBool isError = false;
1587         currLine->len = 0;
1588         U16_APPEND(currLine->name, currLine->len, 25, exemplarUSetIter.getCodepoint(), isError);
1589       }
1590       currLine->name[currLine->len] = 0; // zero terminate, for our evil ways
1591       currLine->index = linesCount;
1592       linesCount++;
1593       noteElement(currLine);
1594     }
1595     constructAndAnalyze(gICULines, lines, exemplarSetSize, gComparer);
1596 
1597     delete[] lines;
1598   }
1599 
1600 
1601   // cleanup globals
1602   delete[] gICULines;
1603   u_fflush(out);
1604   u_fclose(out);
1605   ucol_close(gCol);
1606 }
1607 
1608 void
processLocale(const char * locale,UErrorCode & status)1609 processLocale(const char *locale, UErrorCode &status) {
1610   gWinLCID = uloc_getLCID(locale);
1611 
1612   UCollator *col = ucol_open(locale, &status);
1613 
1614   setOutputFile(locale, status);
1615 
1616   u_fprintf(out, "Locale %s (LCID:%06X)\n", locale, gWinLCID);
1617 
1618   processCollator(col, status);
1619 }
1620 
1621 UBool
hasCollationElements(const char * locName)1622 hasCollationElements(const char *locName) {
1623 
1624   UErrorCode status = U_ZERO_ERROR;
1625   UResourceBundle *ColEl = NULL;
1626 
1627   UResourceBundle *loc = ures_open(NULL, locName, &status);;
1628 
1629   if(U_SUCCESS(status)) {
1630     status = U_ZERO_ERROR;
1631     ColEl = ures_getByKey(loc, "CollationElements", ColEl, &status);
1632     if(status == U_ZERO_ERROR) { /* do the test - there are real elements */
1633       ures_close(ColEl);
1634       ures_close(loc);
1635       return true;
1636     }
1637     ures_close(ColEl);
1638     ures_close(loc);
1639   }
1640   return false;
1641 }
1642 
1643 int
main(int argc,char * argv[])1644 main(int argc,
1645      char* argv[])
1646 {
1647   UErrorCode status = U_ZERO_ERROR;
1648   err = u_finit(stderr, "en", "latin-1");
1649   log = u_finit(stdout, "en", "latin-1");
1650 
1651 /*
1652   USet *wsp = uprv_openRuleWhiteSpaceSet(&status);
1653   uset_add(wsp, 0x0041);
1654   uset_remove(wsp, 0x0041);
1655   UnicodeString pat;
1656   ((UnicodeSet *)wsp)->toPattern(pat, true);
1657   pat.setCharAt(pat.length(), 0);
1658   escapeString(pat.getBuffer(), pat.length(), log);
1659   u_fflush(log);
1660 */
1661 
1662   UTransliterator *anyHex = utrans_open("[^\\u000a\\u0020-\\u007f] Any-Hex/Java", UTRANS_FORWARD, NULL, 0, NULL, &status);
1663   u_fsettransliterator(log, U_WRITE, anyHex, &status);
1664 
1665   processArgs(argc, argv, status);
1666   int32_t i = 0;
1667 
1668 
1669   gElements.setValueDeleter(deleteLineElement);
1670 
1671 
1672   if(U_FAILURE(status) || gPlatformNo == 0) {
1673     return -1;
1674   }
1675 
1676   gUCA = ucol_open("root", &status);
1677 
1678   if(gRulesStdin) {
1679     char buffer[1024];
1680     UChar ruleBuffer[16384];
1681     UChar *rules = ruleBuffer;
1682     int32_t maxRuleLen = 16384;
1683     int32_t rLen = 0;
1684     while(gets(buffer)) {
1685       if(buffer[0] != '/' && buffer[1] != '/') {
1686         rLen = u_unescape(buffer, rules, maxRuleLen);
1687         rules += rLen;
1688         maxRuleLen -= rLen;
1689       }
1690     }
1691     UParseError parseError;
1692     //escapeString(ruleBuffer, rules-ruleBuffer, log);//
1693     u_fprintf(log, "%U\n", ruleBuffer);
1694 
1695     UCollator *col = ucol_openRules(ruleBuffer, rules-ruleBuffer, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &status);
1696     if(U_SUCCESS(status)) {
1697       setOutputFile("stdinRules", status);
1698       processCollator(col, status);
1699     } else {
1700       u_fprintf(err, "Error %s\n", u_errorName(status));
1701     }
1702   } else {
1703 
1704     if(gLocale) {
1705       processLocale(gLocale, status);
1706     } else if(gLocaleNo) {
1707       for(i = 0; i < gLocaleNo; i++) {
1708         processLocale(gLocales[i], status);
1709       }
1710     } else { // do the loop through all the locales
1711       int32_t noOfLoc = uloc_countAvailable();
1712       const char *locName = NULL;
1713       for(i = 0; i<noOfLoc; i++) {
1714         status = U_ZERO_ERROR;
1715         locName = uloc_getAvailable(i);
1716         if(hasCollationElements(locName)) {
1717           processLocale(locName, status);
1718         }
1719       }
1720     }
1721   }
1722 
1723 
1724   ucol_close(gUCA);
1725 
1726   u_fflush(log);
1727   u_fclose(log);
1728   u_fflush(err);
1729   u_fclose(err);
1730 
1731   return 0;
1732 }