• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 #include "sortedlines.h"
4 
codePointCmp(const void * a,const void * b)5 static int codePointCmp(const void *a, const void *b) {
6   return u_strcmp((*(Line **)a)->name, (*(Line **)b)->name);
7 }
8 
SortedLines(const UnicodeSet & set,const UnicodeSet & excludeBounds,const StrengthProbe & probe,UPrinter * logger,UPrinter * debug)9 SortedLines::SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe,
10                          UPrinter *logger, UPrinter *debug) :
11 toSort(NULL),
12 toSortCapacity(0),
13 lines(NULL),
14 size(0),
15 capacity(0),
16 repertoire(set),
17 excludeBounds(excludeBounds),
18 probe(probe),
19 first(NULL),
20 last(NULL),
21 logger(logger),
22 debug(debug),
23 contractionsTable(NULL),
24 duplicators(NULL),
25 maxExpansionPrefixSize(0),
26 wordSort(false),
27 frenchSecondary(false),
28 upperFirst(false),
29 sortkeys(NULL),
30 sortkeyOffset(0)
31 {
32   memset(UB, 0, sizeof(UB));
33   int32_t i = 0;
34   for(i = 0; i < UCOL_OFF; i++) {
35     UB[i] = &empty;
36   }
37   init();
38 }
39 
~SortedLines()40 SortedLines::~SortedLines()
41 {
42   delete[] lines;
43   if(sortkeys) {
44     delete[] sortkeys;
45   }
46   if(toSort) {
47     delete[] toSort;
48   }
49   if(contractionsTable) {
50     delete contractionsTable;
51   }
52   if(duplicators) {
53     delete duplicators;
54   }
55 }
56 
57 void
getBounds(UErrorCode & status)58 SortedLines::getBounds(UErrorCode &status) {
59   // first sort through the set
60   debug->log(toString(), true);
61   int32_t i = 0, j = 0;
62   UColAttributeValue strength = UCOL_OFF;
63   for(i = 0; i < size; i++) {
64     if(toSort[i]->strengthFromEmpty < strength) {
65       if(i && strength < UCOL_OFF) {
66         //u_strcpy(UB[strength], toSort[i-1]->name);
67         j = 1;
68         while(excludeBounds.contains(UnicodeString(toSort[i-j]->name, toSort[i-j]->len))) {
69           j++;
70         }
71         UB[strength] = toSort[i-j];
72       }
73       strength = toSort[i]->strengthFromEmpty;
74       if(strength == UCOL_PRIMARY) {
75         probe.SE = toSort[i]->name[0];
76       }
77     }
78   }
79   //u_strcpy(UB[strength], toSort[size-1]->name);
80   // a different solution for bounds: go from end and see if the guys on the top
81   // cause duplication for things
82   UChar dupch[] = { 0x0020, 0x0030, 0x0042, 0x0051, 0x0062, 0x0071, 0x0391, 0x0396, 0x03b1, 0x03b6 };
83   j = 1;
84   Line dup;
85   Line bound;
86   int32_t dups = 0;
87   while(j < size) {
88     dups = 0;
89     for(i = 0; i < sizeof(dupch)/sizeof(dupch[0]); i++) {
90       dup.setTo(dupch[i]);
91       dup.append(dupch[i]);
92       bound.setTo(dupch[i]);
93       bound.append(toSort[size-j]->name, toSort[size-j]->len);
94       if(probe.getStrength(dup, bound) >= UCOL_IDENTICAL) {
95         dups++;
96       }
97     }
98     if(dups == 0) {
99       break;
100     } else {
101       if(!duplicators) {
102         duplicators = new Hashtable();
103       }
104       duplicators->put(UnicodeString(toSort[size-j]->name, toSort[size-j]->len), &toSort[size-j], status);
105       debug->log(toSort[size-j]->toString());
106       debug->log(" is not good enough to be an upper bound\n");
107       j++;
108     }
109   }
110   if(j == size) {
111     debug->log("Oi! I'm hallucinating. Will use the first upper bound");
112     delete duplicators;
113     duplicators = NULL;
114     j = 1;
115   }
116 /*
117   j = 1;
118   while(excludeBounds.contains(UnicodeString(toSort[size-j]->name, toSort[size-j]->len))) {
119     j++;
120   }
121 */
122   UB[strength] = toSort[size-j];
123   for(i = 0; i < UCOL_OFF; i++) {
124     if(UB[i]) {
125       //debug->log(UB[i], true);
126       debug->log(UB[i]->toString(true), true);
127     }
128   }
129 }
130 
131 // classifies repertoire according to the strength of their difference
132 // from the empty string
133 void
classifyRepertoire()134 SortedLines::classifyRepertoire() {
135   UColAttributeValue strongestStrengthFromEmpty = UCOL_OFF;
136   int32_t lastChange = 0;
137   int32_t i = 0, j = 0;
138   while(i < size) // && probe.distanceFromEmptyString(*toSort[i]) > UCOL_PRIMARY)
139   {
140     toSort[i]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i]);
141     if(toSort[i]->strengthFromEmpty < strongestStrengthFromEmpty) {
142       strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
143       lastChange = i;
144     } else if (toSort[i]->strengthFromEmpty > strongestStrengthFromEmpty) {
145       // there is a problem in detection. Most probably a quaternary.
146       // why don't we try to interpolate
147       UColAttributeValue nextStrength = UCOL_OFF;
148       UColAttributeValue prevStrength = UCOL_OFF;
149       UColAttributeValue st = UCOL_OFF;
150 
151       logger->log("Interpolating to get the distance from empty for Line ");
152       logger->log(toSort[i]->toString(true), true);
153 
154       if(i) {
155         st = probe.getStrength(*toSort[i-1], *toSort[i]);
156         if(st == UCOL_OFF) {
157           logger->log("Cannot deduce distance from empty using previous element. Something is very wrong! Line:");
158           logger->log(toSort[i]->toString(true), true);
159         } else if(st == UCOL_IDENTICAL || st >= toSort[i-1]->strengthFromEmpty) {
160           prevStrength  = toSort[i-1]->strengthFromEmpty;
161         } else if(st < toSort[i-1]->strengthFromEmpty) {
162           prevStrength = st;
163         }
164         toSort[i]->strengthFromEmpty = prevStrength;
165       }
166       if(i < size-2) {
167         toSort[i+1]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i+1]);
168         st = probe.getStrength(*toSort[i+1], *toSort[i]);
169         if(st == UCOL_OFF) {
170           logger->log("Cannot deduce distance from empty using next element. Something is very wrong! Line:");
171           logger->log(toSort[i]->toString(true), true);
172         } else if(st == UCOL_IDENTICAL || st < toSort[i+1]->strengthFromEmpty) {
173           nextStrength = toSort[i+1]->strengthFromEmpty;
174         } else if(st >= toSort[i+1]->strengthFromEmpty) {
175           nextStrength = st;
176         }
177         if(i) {
178           if(prevStrength != nextStrength) {
179             logger->log("Inconsistent results from interpolation! Results will most likely be wrong\n");
180           }
181         }
182         toSort[i]->strengthFromEmpty = nextStrength;
183       }
184       /*
185       UColAttributeValue problemStrength = UCOL_PRIMARY;
186       for(j = lastChange; j < i ; j++) {
187         if(toSort[j]->strength > problemStrength) {
188           problemStrength = toSort[j]->strength;
189         }
190       }
191       for(j = lastChange; j < i ; j++) {
192         toSort[j]->strengthFromEmpty = problemStrength;
193       }
194       strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
195       lastChange = i;
196       debug->log("Problem detected in distances from empty. Most probably word sort is on\n");
197       */
198       wordSort = true;
199     }
200     i++;
201   }
202   debug->log("Distances from empty string\n");
203   debug->log(toStringFromEmpty(), true);
204 }
205 
206 void
analyse(UErrorCode & status)207 SortedLines::analyse(UErrorCode &status) {
208   frenchSecondary = probe.isFrenchSecondary(status);
209   if(U_FAILURE(status)) {
210     logger->log("Test for French secondary failed. Bailing out!\n");
211     return;
212   }
213   logger->log("French secondary value is %i\n", frenchSecondary, frenchSecondary);
214   upperFirst = probe.isUpperFirst(status);
215   if(U_FAILURE(status)) {
216     logger->log("Test for upper first failed. Bailing out!\n");
217     return;
218   }
219   logger->log("upper first value is %i\n", upperFirst, upperFirst);
220   sort(true, true);
221   classifyRepertoire();
222   getBounds(status);
223   //sort(true, true);
224   addContractionsToRepertoire(status);
225   //sort(true, true);
226   debug->log("\n*** Order after detecting contractions\n\n");
227   calculateSortKeys();
228   debug->log(toPrettyString(false, true), true);
229   detectExpansions();
230 }
231 
init()232 void SortedLines::init()
233 {
234   size = repertoire.size();
235   capacity = 5*size;
236   lines = new Line[capacity];
237   init(repertoire, lines);
238 }
239 
init(UnicodeSet & rep,Line * lin)240 void SortedLines::init(UnicodeSet &rep, Line *lin)
241 {
242 
243   UnicodeSetIterator exemplarUSetIter(rep);
244   int32_t size = 0;
245 
246   while(exemplarUSetIter.next()) {
247     Line *currLine = lin+size;
248     if(exemplarUSetIter.isString()) { // process a string
249       currLine->setTo(exemplarUSetIter.getString());
250     } else { // process code point
251       currLine->setTo(exemplarUSetIter.getCodepoint());
252     }
253     currLine->name[currLine->len] = 0; // zero terminate, for our evil ways
254     //currLine->index = size;
255     size++;
256   }
257 }
258 
259 void
setSortingArray(Line ** sortingArray,Line * elements,int32_t sizeToSort)260 SortedLines::setSortingArray(Line **sortingArray, Line *elements, int32_t sizeToSort) {
261   int32_t i = 0;
262   for(i = 0; i < sizeToSort; i++) {
263     sortingArray[i] = &elements[i];
264   }
265 }
266 
267 int32_t
setSortingArray(Line ** sortingArray,Hashtable * table)268 SortedLines::setSortingArray(Line **sortingArray, Hashtable *table) {
269   int32_t size = table->count();
270   int32_t hashIndex = -1;
271   const UHashElement *hashElement = NULL;
272   int32_t count = 0;
273   while((hashElement = table->nextElement(hashIndex)) != NULL) {
274     sortingArray[count++] = (Line *)hashElement->value.pointer;
275   }
276   return size;
277 }
278 
279 void
sort(Line ** sortingArray,int32_t sizeToSort,UBool setStrengths,UBool link)280 SortedLines::sort(Line **sortingArray, int32_t sizeToSort, UBool setStrengths, UBool link) {
281   int32_t i = 0;
282   int32_t equalStart = 0;
283   UColAttributeValue equalStrength = UCOL_OFF;
284 
285   qsort(sortingArray, sizeToSort, sizeof(Line *), probe.comparer);
286 
287   if(setStrengths) { // analyze strengths
288     for(i = 1; i < sizeToSort; i++) {
289       sortingArray[i]->strength = probe.getStrength(*sortingArray[i-1], *sortingArray[i]);
290     }
291     // for equal guys, do the code point ordering
292 
293     i = 1;
294     while(i < sizeToSort)
295     {
296       if(sortingArray[i]->strength == UCOL_IDENTICAL) {
297         equalStart = i - 1;
298         equalStrength = sortingArray[equalStart]->strength;
299         sortingArray[equalStart]->strength = UCOL_IDENTICAL;
300         while(i < sizeToSort && sortingArray[i]->strength == UCOL_IDENTICAL) {
301           i++;
302         }
303         qsort(sortingArray+equalStart, i-equalStart, sizeof(Line *), codePointCmp);
304         sortingArray[equalStart]->strength = equalStrength;
305       } else {
306         i++;
307       }
308     }
309 
310   }
311 
312 
313 
314   if(link) { // do the linking
315     for(i = 0; i < sizeToSort - 1; i++) {
316       Line *curr = *(sortingArray+i);
317       curr->next = *(sortingArray+i+1);
318       (*(sortingArray+i+1))->previous = curr;
319     }
320   }
321 }
322 
323 void
sort(UBool setStrengths,UBool link)324 SortedLines::sort(UBool setStrengths, UBool link) {
325   if(toSortCapacity < size || !toSort) {
326     if(toSort) {
327       delete[] toSort;
328     }
329     toSort = new Line*[size*2];
330     toSortCapacity = size*2;
331   }
332 
333   setSortingArray(toSort, lines, size);
334   sort(toSort, size, setStrengths, link);
335 
336   first = last = NULL;
337 
338   if(link) { // do the linking
339     first = *toSort;
340     last = *(toSort+size-1);
341   }
342 }
343 
344 void
updateBounds(UnicodeSet & set)345 SortedLines::updateBounds(UnicodeSet &set) {
346   Line line;
347   UnicodeString s1;
348   UnicodeSetIterator it1(set);
349   while(it1.next()) {
350     if(!debug->isOn()) {
351       logger->log(".");
352     }
353     if(it1.isString()) { // process a string
354       s1.setTo(it1.getString());
355     } else { // process code point
356       s1.setTo(it1.getCodepoint());
357     }
358     //line.setTo(s1);
359     UColAttributeValue strength = probe.distanceFromEmptyString(s1);
360     if(probe.compare(UnicodeString(UB[strength]->name), s1) < 0) {
361       // TODO: leak here - fixit!
362       UB[strength] = new Line(s1);
363       //u_strcpy(UB[strength], s1.getTerminatedBuffer());
364     }
365   }
366 
367 
368 
369 }
370 
addAll(Line * toAdd,int32_t toAddSize)371 void SortedLines::addAll(Line* toAdd, int32_t toAddSize)
372 {
373   if(size+toAddSize > capacity) {
374     int32_t doGrowingBreakpoint = 0;
375     // we need to do growing here
376   }
377   int32_t i = 0;
378 
379   for(i = 0; i < toAddSize; i++) {
380     lines[size+i] = toAdd[i];
381   }
382   size += toAddSize;
383 }
384 
setDistancesFromEmpty(Line * array,int32_t arraySize)385 void SortedLines::setDistancesFromEmpty(Line* array, int32_t arraySize)
386 {
387   int32_t i = 0;
388   for(i = 0; i < arraySize; i++) {
389     array[i].strengthFromEmpty = probe.distanceFromEmptyString(array[i]);
390   }
391 }
392 
393 
394 // adds contractions in to repertoire
addContractionsToRepertoire(UErrorCode & status)395 int32_t SortedLines::addContractionsToRepertoire(UErrorCode &status)
396 {
397   logger->log("\n*** Detecting contractions\n\n");
398   contractionsTable = new Hashtable();
399   int32_t noConts = 0;
400   int32_t allocateSize = 50*size;
401   // first check for simple contractions
402   Line* delta = new Line[allocateSize];
403   Line** deltaSorted = new Line*[allocateSize];
404   Line* lesserToAddTo = new Line[allocateSize];
405   Line* newDelta = new Line[allocateSize];
406   Line** newDeltaSorted = new Line*[allocateSize];
407   Line* deltaP = delta;
408   Line** deltaPP = deltaSorted;
409   Line* newDeltaP = newDelta;
410   int32_t deltaSize = 0, lesserToAddToSize = 0, newDeltaSize = 0;
411   logger->log("++ Contraction detection generation 0\n");
412   noConts = detectContractions(toSort, size, toSort, size,
413 			       delta, deltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
414   setSortingArray(deltaSorted, delta, deltaSize);
415   sort(deltaSorted, deltaSize, true);
416 
417   setDistancesFromEmpty(delta, deltaSize);
418   int32_t deltaPSize = deltaSize;
419   //updateBounds(delta);
420 
421   int32_t generation = 0;
422   // if we found any, we have to try multiple contractions
423   // However, we want to prevent the contractions explosion
424   // if the number of simple contractions is greater than the
425   // starting size, chances are that we either have an algorithmic
426   // contraction (like iteration marks on w2k) or something
427   // is seriosly wrong.
428   if(deltaPSize < size/2) {
429     while (deltaPSize && generation < 1) {
430       generation++;
431       logger->log("\n++ Contraction detection generation %i\n", generation, generation);
432       // find more, but avoid testing the combinations we already have
433       noConts += detectContractions(toSort, size, deltaPP, deltaPSize,
434 				    newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
435       noConts += detectContractions(deltaPP, deltaPSize, toSort, size,
436 				    newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
437       calculateSortKeys();
438 
439       addAll(deltaP, deltaPSize);
440       setSortingArray(toSort, lines, size);
441       sort(true, true);
442       setSortingArray(newDeltaSorted, newDeltaP, newDeltaSize);
443       sort(newDeltaSorted, newDeltaSize, true);
444 
445       // if no new ones, bail
446       //if (newDeltaSize == 0) break;
447 
448       deltaPSize = newDeltaSize;
449       newDeltaSize = 0;
450       if(deltaP == delta) {
451         deltaP = newDelta;
452         deltaPP = newDeltaSorted;
453         newDeltaP = delta;
454       } else {
455         deltaP = delta;
456         deltaPP = deltaSorted;
457         newDeltaP = newDelta;
458       }
459       setDistancesFromEmpty(deltaP, deltaPSize);
460     }
461   }
462   status = U_ZERO_ERROR;
463   // add stuff from the last batch
464   addAll(deltaP, deltaPSize);
465 
466   // warning: we don't add the lesser ones in recursively, since they will
467   // infinitely loop
468   setDistancesFromEmpty(lesserToAddTo, lesserToAddToSize);
469   addAll(lesserToAddTo, lesserToAddToSize);
470   setSortingArray(toSort, lines, size);
471   sort(true, true);
472 
473   delete[] deltaSorted;
474   delete[] delta;
475   delete[] lesserToAddTo;
476   delete[] newDeltaSorted;
477   delete[] newDelta;
478   return noConts;
479 }
480 
481 
detectContractions(Line ** firstRep,int32_t firstSize,Line ** secondRep,int32_t secondSize,Line * toAddTo,int32_t & toAddToSize,Line * lesserToAddTo,int32_t & lesserToAddToSize,int32_t capacity,UErrorCode & status)482 int32_t SortedLines::detectContractions(Line **firstRep, int32_t firstSize,
483                                         Line **secondRep, int32_t secondSize,
484                                         Line *toAddTo, int32_t &toAddToSize,
485                                         Line *lesserToAddTo, int32_t &lesserToAddToSize,
486                                         int32_t capacity, UErrorCode &status)
487 {
488   int32_t noConts = 0;
489   int i = 0, j = 0, k = 0;
490   Line lower, upper, trial, toAdd, helper;
491   UChar32 firstStart, firstEnd, secondStart;
492   UChar NFCTrial[256];
493   int32_t NFCTrialLen = 0;
494   UBool thai;
495   i = -1;
496   while(i < firstSize-1 && U_SUCCESS(status)) {
497     i++;
498     if(!debug->isOn()) {
499       logger->log("\rTesting %05i/%05i. Found %05i conts.", i, firstSize, noConts);
500     }
501     U16_GET(firstRep[i]->name, 0, 0, firstRep[i]->len, firstStart);
502     if(uscript_getScript(firstStart, &status) == USCRIPT_HAN || firstRep[i]->strengthFromEmpty > UCOL_PRIMARY) //UCOL_TERTIARY)
503       {
504         continue;
505       }
506     lower = *firstRep[i];
507     for(j = 0; j < secondSize; j++) {
508       if(noConts == capacity) {
509         return noConts;
510       }
511       U16_GET(secondRep[j]->name, 0, 0, secondRep[j]->len, secondStart);
512       if(firstStart == 0x41 && secondStart == 0x308) {
513       int32_t putBreakPointHere = 0;
514     }
515       if(uscript_getScript(secondStart, &status) == USCRIPT_HAN) // || secondRep[j]->strengthFromEmpty > UCOL_TERTIARY)
516 	{
517           continue;
518         }
519       	if(duplicators && duplicators->get(UnicodeString(secondRep[j]->name, secondRep[j]->len)) != NULL) {
520           debug->log("Skipping duplicator ");
521           debug->log(secondRep[j]->toString(), true);
522 		  continue;
523 		}
524 
525       if(firstRep[i]->name[0] == 0x61 && secondRep[j]->name[0] == 0x308) {
526 	    int32_t putBreakpointhere = 0;
527       }
528       upper.setToConcat(firstRep[i], UB[UCOL_PRIMARY]);
529       //upper.setToConcat(firstRep[i], UB[secondRep[j]->strengthFromEmpty]);
530       toAdd.setToConcat(firstRep[i], secondRep[j]);
531       U16_GET(firstRep[i]->name, 0, firstRep[i]->len-1, firstRep[i]->len, firstEnd);
532       if((thai = u_hasBinaryProperty(firstEnd, UCHAR_LOGICAL_ORDER_EXCEPTION))) {
533 	// this means that the lower is single reordering character
534 	// if we do the lower test without taking this into account,
535 	// we'll comparing the secondRep directly to Thai. We add UB[UCOL_PRIMARY] to
536 	// end of lower and in the middle of trial, so we will have
537 	// lower = Thai + UB, trial Thai + UB + x, resolving to
538 	// UB + Thai vs UB + Thai + x.
539 	// for upper bound, we do the similar, so we have
540 	// upper = Thai + UB + UB, trial = Thai + UB + x,
541 	// resolving to UB + Thai + UB vs UB + Thai + x
542 	if(secondRep[j]->firstCC) {
543 	  UChar32 UBChar;
544 	  U16_GET(UB[UCOL_SECONDARY]->name, 0, 0, UB[UCOL_SECONDARY]->len, UBChar);
545 	  if(secondRep[j]->firstCC > u_getCombiningClass(UBChar)) {
546 	    continue;
547 	  }
548 	}
549 	upper = *firstRep[i];
550 	upper.append(*UB[UCOL_PRIMARY]);
551 	//upper.append(*UB[secondRep[j]->strengthFromEmpty]);
552 	upper.append(*UB[UCOL_PRIMARY]);
553 	lower.append(*UB[UCOL_PRIMARY]);
554 	trial = *firstRep[i];
555 	trial.append(*UB[UCOL_PRIMARY]);
556 	trial.append(*secondRep[j]);
557       } else if((firstRep[i]->lastCC > secondRep[j]->firstCC && secondRep[j]->firstCC && !frenchSecondary)
558 		|| (firstRep[i]->firstCC < secondRep[j]->lastCC && firstRep[i]->firstCC && frenchSecondary)) {
559 	// Skip because normalization will reorder
560 	// there will be a chance to check this again, since if we
561 	// try a+b, we will also try b+a
562 	    continue;
563       } else if(frenchSecondary && (firstRep[i]->strengthFromEmpty > UCOL_PRIMARY && secondRep[j]->strengthFromEmpty > UCOL_PRIMARY)) {
564 	    continue;
565       }else if(firstRep[i]->lastCC && secondRep[j]->firstCC && frenchSecondary) {
566 	    trial.setToConcat(secondRep[j], firstRep[i]);
567       } else {
568     	trial.setToConcat(firstRep[i], secondRep[j]);
569       }
570       // Now let's check the trial. The problem is that when you combine characters,
571       // you can end up with concatenation that is unknown for the examined API.
572       NFCTrialLen = unorm_normalize(trial.name, trial.len, UNORM_NFC, 0, NFCTrial, 256, &status);
573       if((u_strcmp(trial.name, NFCTrial) == 0) || u_strFindLast(NFCTrial, NFCTrialLen, secondRep[j]->name, secondRep[j]->len)) {
574 	   if(secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) {
575 	     continue;
576 	   }
577 	 }
578       UChar32 c;
579       U16_GET(NFCTrial, 0, 0, NFCTrialLen, c);
580       helper.setTo(c);
581       if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
582 	    continue;
583       }
584       if(NFCTrialLen > 1) {
585 	    U16_GET(NFCTrial, 0, NFCTrialLen-1, NFCTrialLen, c);
586 	    helper.setTo(c);
587         if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
588 	      continue;
589 	    }
590       }
591 
592       if (probe.compare(lower, trial) >= 0) { // if lower is bigger than trial
593         // this might be ok, but I'm having doubts. Here is an additional check:
594         if(firstRep[i]->len == 1 || secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) {
595           // I'm basically saying that I'll add this kind of contraction for cases where I combine
596           // one letter with an accent OR when I'm combining more than one symbol with a letter.
597           noteContraction("L", lesserToAddTo, lesserToAddToSize, firstRep[i], secondRep[j], noConts, status);
598         }
599       }
600       else if (probe.compare(trial, upper) > 0) { // trial is bigger than upper??
601         noteContraction("U", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
602       }
603 #if 0
604       else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY)
605       {
606         Line expansionLine;
607         if(getExpansionLine(trial, *firstRep[i], *secondRep[j], expansionLine) &&
608         expansionLine.len && !(expansionLine == *secondRep[j])) {
609           noteContraction("D", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
610         }
611       }
612 #endif
613       else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && probe.getStrength(lower, trial) < secondRep[j]->strengthFromEmpty) {
614         noteContraction("D1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
615       }
616       else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && secondRep[j]->strengthFromEmpty == UCOL_PRIMARY)
617       {
618         // I have added an additional check. The checks versus upper and lower bound should be sufficient
619         // when the right side is a combining mark. There might be a reordering of combining marks, but
620         // that should be already visible in their order.
621 	// compare the sequence
622 	// Y- <? Y <? Y+
623 	// and
624 	// XY- <? XY <? XY+
625 	Line xym, xyp, xy;
626 	UBool xymIsContraction = false, toAddIsContraction = false;
627     if(j) {
628 	  if(((!secondRep[j-1]->firstCC || firstRep[i]->lastCC < secondRep[j-1]->firstCC) && !frenchSecondary)
629              ||((!firstRep[i]->firstCC || firstRep[i]->firstCC > secondRep[j-1]->lastCC) && frenchSecondary)) {
630 	    xym.setToConcat(firstRep[i], secondRep[j-1]);
631 	    toAdd.strength = probe.getStrength(xym, toAdd);
632 	    if(secondRep[j]->strength != toAdd.strength) {
633 	      // there is possibility that either xym or xy are contractions
634 	      // There are two situations:
635 	      // xym > xy or xym <n xy and ym <k y but n != k
636 	      // if they are reordered, we are going to see if each of them
637 	      // is further reordered
638 	      if(toAdd.strength == UCOL_OFF) {
639 		// check whether toAdd shifted more down
640 		k = j - 2;
641 		while(k>=0 && secondRep[k]->strength > secondRep[j]->strength) {
642 		  k--;
643 		}
644 		while(!toAddIsContraction && k>=0) {
645 		  xyp.setToConcat(firstRep[i], secondRep[k]);
646 		  if(contractionsTable->get(UnicodeString(xyp.name, xyp.len)) != NULL) {
647 		    k--;
648 		    continue;
649 		  }
650 		  if(probe.compare(xyp, xym) >= 0) {
651 		    // xyp looks like a contraction
652 		    noteContraction("!1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
653 		    toAddIsContraction = true;
654 		  } else {
655 		    break;
656 		  }
657 		}
658           // first let's see if xym has moved beyond
659           if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
660             k = j+1;
661             // ignore weaker strengths
662             while(k < secondSize && secondRep[k]->strength > secondRep[j]->strength) {
663               k++;
664             }
665             // check if we skipped the following guy
666             if(k < secondSize) {
667               xyp.setToConcat(firstRep[i], secondRep[k]);
668               if(probe.compare(xyp, xym) <= 0) {
669 	            // xyp looks like a contraction
670 	            noteContraction("!2", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
671 	            xymIsContraction = true;
672               }
673             }
674           } else {
675             xymIsContraction = true;
676           }
677           // if they have reordered, but none has moved, then we add them both
678           // and hope for the best
679           if(!xymIsContraction && !toAddIsContraction) {
680               // it is possible that there is an NFC version version of one of the
681               // strings. If we have XY > XZ, but NFC(XZ) = W and X < W, we might have
682               // have a false contraction.
683               trial.len = unorm_normalize(toAdd.name, toAdd.len, UNORM_NFC, 0, trial.name, 25, &status);
684               //UColAttributeValue strength = probe.getStrength(*firstRep[i], trial);
685               if(trial == toAdd) {
686                 noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
687                 noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
688               } else {
689                 noteContraction("!4", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
690               }
691             }
692 	      } else { // only the strength has changed
693             // check whether the previous is contraction and if not, add the current
694             if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
695               noteContraction("!5", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
696             }
697 	      }
698 	    }
699 	  }
700 	}
701       }
702       if(thai) { // restore lower
703         lower = *firstRep[i];
704       }
705     }
706   }
707   return noConts;
708 }
709 
710 void
noteContraction(const char * msg,Line * toAddTo,int32_t & toAddToSize,Line * left,Line * right,int32_t & noConts,UErrorCode & status)711 SortedLines::noteContraction(const char* msg, Line *toAddTo, int32_t &toAddToSize, Line *left, Line *right, int32_t &noConts, UErrorCode &status)
712 {
713   Line toAdd;
714   toAdd.setToConcat(left, right);
715   toAdd.left = left;
716   toAdd.right = right;
717   // if we're adding an accent to an existing contraction, we want to check
718 #if 0
719   Line test, trial1, trial2;
720   if(right->strengthFromEmpty > UCOL_PRIMARY) {
721     if(left->right && left->right->previous && left->right->next) {
722       test.setToConcat(left->left, left->right->previous);
723       trial1.setToConcat(&test, right);
724 
725       test.setToConcat(left->left, left->right->next);
726       trial2.setToConcat(&test, right);
727       if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
728         // this means that the contraction has been broken by the newly added accent
729         // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
730         debug->log("Con -");
731         debug->log(msg);
732         debug->log(toAdd.toString(false), true);
733         return;
734       }
735     } else {
736       if(right->previous && right->next) {
737         trial1.setToConcat(left, right->previous);
738         trial2.setToConcat(left, right->next);
739         if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
740           // this means that the contraction has been broken by the newly added accent
741           // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
742           debug->log("Con -");
743           debug->log(msg);
744           debug->log(toAdd.toString(false), true);
745           return;
746         }
747       }
748       if(left->previous && left->next) {
749         trial1.setToConcat(left->previous, right);
750         trial2.setToConcat(left->next, right);
751         if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
752           // this means that the contraction has been broken by the newly added accent
753           // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
754           debug->log("Con -");
755           debug->log(msg);
756           debug->log(toAdd.toString(false), true);
757           return;
758         }
759       }
760 
761     }
762   }
763   if(right->right && right->right->strengthFromEmpty > UCOL_PRIMARY && right->left->previous && right->left->next) { // maybe we already had a contraction with an accent
764     test.setToConcat(right->left->previous, right->right);
765     trial1.setToConcat(left, &test);
766     test.setToConcat(right->left->next, right->right);
767     trial2.setToConcat(left, &test);
768     if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
769       // this means that the contraction has been broken by the newly added accent
770       // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
771       debug->log("Con -");
772       debug->log(msg);
773       debug->log(toAdd.toString(false), true);
774       return;
775     }
776   }
777 #endif
778   if(contractionsTable->get(UnicodeString(toAdd.name, toAdd.len)) == NULL) {
779     if(probe.distanceFromEmptyString(toAdd) <= UCOL_TERTIARY) {
780       toAddTo[toAddToSize++] = toAdd;
781       contractionsTable->put(UnicodeString(toAdd.name, toAdd.len), &toAdd, status);
782       noConts++;
783       debug->log(msg);
784       debug->log(" Con + ");
785       debug->log(toAdd.toString(false), true);
786 
787       if(!left->sortKey) {
788         calculateSortKey(*left);
789       }
790       debug->log(left->dumpSortkey());
791       debug->log(" + ");
792 
793       if(!right->sortKey) {
794         calculateSortKey(*right);
795       }
796       debug->log(right->dumpSortkey());
797       debug->log(" = ");
798 
799       calculateSortKey(toAdd);
800       debug->log(toAdd.dumpSortkey(), true);
801       if(noConts > size/2) {
802         status = U_BUFFER_OVERFLOW_ERROR;
803       }
804     }
805   }
806 }
807 
808 
809 UBool
getExpansionLine(const Line & expansion,const Line & previous,const Line & exp,Line & expansionLine)810 SortedLines::getExpansionLine(const Line &expansion, const Line &previous, const Line &exp, Line &expansionLine)
811 {
812   int expIndexSize = 0;
813   UColAttributeValue expStrength = UCOL_OFF;
814   int32_t comparisonResult = 0;
815   int32_t i = 0, k = 0, prevK = 0;
816   Line trial;
817   UBool sequenceCompleted = false;
818   int32_t expIndexes[256];
819   int32_t expIndexesSize = 0;
820 
821   if(!sequenceCompleted) {
822     expIndexSize = 0;
823     expansionLine.clear();
824 
825     // we will start from strength between the expansion
826     // and the target (toSort[i] and toSort[j]. First we
827     // will add as many primaries as possible. Then we will
828     // try to add secondary pieces and then tertiary.
829     // found an expansion - what is the expanding sequence?
830 
831     expStrength = UCOL_PRIMARY;
832     while(!sequenceCompleted) {
833       k = 0;
834       prevK = 0;
835       while(k < size) {
836         if(expansionLine.len > 15) {
837           sequenceCompleted = true;
838           break;
839         }
840         while(k < size && toSort[k]->strength != UCOL_PRIMARY)
841         {
842           k++;
843         }
844         // nothing found
845         if(k == size) {
846           break;
847         }
848         // we need to skip over reordering things. If they were worthy, they would
849         // have been detected in the previous iteration.
850         //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
851           //k++;
852           //continue;
853         //}
854         trial = previous;
855         trial.append(expansionLine);
856         trial.append(*toSort[k]);
857         if(toSort[k]->name[0] == 0x0067) {
858           int32_t putBreakPointHere = 0;
859         }
860         comparisonResult = probe.compare(trial, expansion);
861         if(comparisonResult == 0) {
862           expansionLine = *toSort[k];
863           return true;
864         } else if (comparisonResult > 0) {
865           if(prevK) {
866             if(exp == *toSort[prevK]) {
867               expansionLine = exp;
868               return true;
869             }
870             i = prevK;
871             while(i < k-1) {
872               i++;
873               if(toSort[i]->strength > exp.strength) {
874                 continue;
875               }
876               trial = previous;
877               trial.append(expansionLine);
878               trial.append(*toSort[i]);
879               if(probe.compare(trial, expansion) > 0) {
880                 break;
881               }
882             }
883             // we got into situation where we have ch > ch+dot-below
884             // however, ch is a contraction and therefore we cannot use
885             // it properly. If we have hit on a contraction, we'll just try
886             // to continue. Probably need more logic here.
887             if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) {
888               expansionLine.append(*toSort[i-1]);
889               expIndexes[expIndexSize++] = i-1;
890               break;
891             } else {
892               int32_t putBreakPointHere = 0;
893             }
894           } else {
895             sequenceCompleted = true;
896             break;
897           }
898           //break;
899         }
900         prevK = k;
901         k++;
902       }
903       if(!prevK || k == size) {
904         break;
905       }
906     }
907   }
908   return expIndexSize > 0;
909 }
910 
911 int32_t
gooseUp(int32_t resetIndex,int32_t expansionIndex,Line & expLine,int32_t * expIndexes,int32_t & expIndexSize,UColAttributeValue strength)912 SortedLines::gooseUp(int32_t resetIndex, int32_t expansionIndex, Line &expLine, int32_t *expIndexes, int32_t &expIndexSize, UColAttributeValue strength)
913 {
914   int32_t i = expansionIndex, k = resetIndex+1, n = 0, m = 0, start = 0;
915   UBool haveChanges = false;
916   Line trial, prefix, suffix;
917   // we will first try goosing up the reset index
918   //while(toSort[k]->strength >= strength)
919   for( ; toSort[k]->strength == strength; k++)
920   {
921     //if(toSort[k]->strength > strength) {
922       //continue;
923     //}
924     trial.setToConcat(toSort[k], &expLine);
925     if(probe.compare(trial, *toSort[i]) > 0) {
926       break;
927     }
928   }
929   resetIndex = k-1;
930 
931   // goose up individual characters
932   prefix = *toSort[resetIndex];
933   for(n = 0; n < expIndexSize; n++) {
934     suffix.clear();
935     for(m = n+1; m < expIndexSize; m++) {
936       suffix.append(*toSort[expIndexes[m]]);
937     }
938     k = expIndexes[n]+1;
939     //while(toSort[k]->strength >= strength)
940     for( ; toSort[k]->strength == strength; k++)
941     {
942       //if(toSort[k]->strength > strength) {
943         //continue;
944       //}
945       trial.setToConcat(&prefix, toSort[k]);
946       trial.append(suffix);
947       if(probe.compare(trial, *toSort[i]) > 0) {
948         break;
949       }
950     }
951     if(k > expIndexes[n]+1) {
952       haveChanges = true;
953       expIndexes[n] = k-1;
954     }
955     prefix.append(*toSort[expIndexes[n]]);
956   }
957 
958   // try inserting ignorables
959   UColAttributeValue lastStr = UCOL_OFF;
960   k = 0;
961   while(toSort[k]->strengthFromEmpty > strength) {
962     k++;
963   }
964   if(toSort[k]->strengthFromEmpty == strength) {
965     start = k;
966     prefix = *toSort[resetIndex];
967     n = 0;
968     while(n <= expIndexSize) {
969       suffix.clear();
970       for(m = n; m < expIndexSize; m++) {
971         suffix.append(*toSort[expIndexes[m]]);
972       }
973       k = start;
974       while(toSort[k]->strengthFromEmpty == strength) {
975         trial.setToConcat(&prefix, toSort[k]);
976         trial.append(suffix);
977         lastStr = probe.getStrength(trial, *toSort[i]);
978         if(lastStr == UCOL_OFF) { // shot over - we won't find anything here
979           break;
980         } else if(lastStr > strength) {
981           for(m = expIndexSize; m > n; m--) {
982             expIndexes[m] = expIndexes[m-1];
983           }
984           expIndexes[n] = k;
985           expIndexSize++;
986           haveChanges = true;
987           break;
988         }
989 #if 0
990         if(probe.compare(trial, *toSort[i]) > 0) {
991           // if the first one skips, that means that
992           // this position doesn't work
993           if(k > start) {
994             // insert an ignorable on position n
995             for(m = expIndexSize; m > n; m--) {
996               expIndexes[m] = expIndexes[m-1];
997             }
998             expIndexes[n] = k-1;
999             expIndexSize++;
1000             haveChanges = true;
1001             if(n == expIndexSize-1) { // added to the end of the string
1002               UColAttributeValue str = probe.getStrength(trial, *toSort[i]);
1003               int32_t putBreakHere = 0;
1004             }
1005           }
1006           break;
1007         } else {
1008           lastStr = probe.getStrength(trial, *toSort[i]);
1009         }
1010 #endif
1011         k++;
1012       }
1013       prefix.append(*toSort[expIndexes[n]]);
1014       n++;
1015     }
1016   }
1017 
1018   if(haveChanges) {
1019     expLine.clear();
1020     for(m = 0; m < expIndexSize; m++) {
1021       expLine.append(*toSort[expIndexes[m]]);
1022     }
1023   }
1024   return resetIndex;
1025 }
1026 
1027 int32_t
detectExpansions()1028 SortedLines::detectExpansions()
1029 {
1030   logger->log("\n*** Detecting expansions\n\n");
1031   int32_t exCount = 0;
1032   int32_t i = 0, j = 0, k = 0, prevK = 0;
1033   Line *previous, trial, expansionLine;
1034   UBool foundExp = false, sequenceCompleted = false;
1035   UColAttributeValue strength = UCOL_OFF;
1036   UColAttributeValue maxStrength = UCOL_IDENTICAL;
1037   UColAttributeValue expStrength = UCOL_OFF;
1038   int32_t expIndexes[256];
1039   int32_t expIndexSize = 0;
1040   memset(expIndexes, 0, sizeof(expIndexes));
1041 
1042   // for each element, we look back to find whether there is such a q for which
1043   // q <n x < qUBn. These are possible expansions. When going backwards we skip
1044   // over already detected expansions.
1045   i = 0;
1046   // it turns out that looking at accents as possible expansions is
1047   // quite a stupid thing to do, especially on non ICU platforms.
1048   // Previously this line skipped over identicals only, but
1049   // now we are going to skip all the way to non-ignorables.
1050   while(toSort[i]->strengthFromEmpty > UCOL_PRIMARY) {
1051     i++;
1052   }
1053   i++;
1054   for( ; i < size; i++) {
1055     if(toSort[i]->name[0]==0x0063 && toSort[i]->name[1] == 0x68) // && toSort[i]->name[1] == 0x308)0043 0043 0219
1056     {
1057       int32_t putBreakpointhere = 0;
1058     }
1059     foundExp = false;
1060     sequenceCompleted = false;
1061     strength = toSort[i]->strength;
1062     if(strength == UCOL_IDENTICAL && toSort[i-1]->isExpansion == true) {
1063       u_strcpy(toSort[i]->expansionString, toSort[i-1]->expansionString);
1064       toSort[i]->expLen = toSort[i-1]->expLen;
1065       toSort[i]->isExpansion = true;
1066       toSort[i]->expIndex = toSort[i-1]->expIndex;
1067       toSort[i]->expStrength = UCOL_IDENTICAL;
1068       //toSort[i]->expStrength = toSort[i-1]->expStrength;
1069       foundExp = true;
1070       sequenceCompleted = true;
1071     }
1072     //logger->log("%i %i\n", i, j);
1073     while(!foundExp && strength <= maxStrength) {
1074       j = i-1;
1075       while(j && (toSort[j]->isExpansion == true || toSort[j]->isRemoved == true)) {
1076         //if(toSort[j]->strength < strength) {
1077           //strength = toSort[j]->strength;
1078         //}
1079         j--;
1080       }
1081 
1082       //while(j && toSort[j]->strength > strength)
1083       while(j && toSort[j]->strength > probe.getStrength(*toSort[j], *toSort[i]))
1084       {
1085         j--;
1086       }
1087       //if(toSort[j]->strength == strength) {
1088         previous = toSort[j];
1089         if(previous->strengthFromEmpty >= UCOL_IDENTICAL ||
1090           (previous->strengthFromEmpty == UCOL_SECONDARY
1091           && strength == UCOL_SECONDARY
1092           && previous->lastCC > UB[strength]->firstCC)) {
1093           break;
1094           //continue;
1095         }
1096         //trial.setToConcat(previous, UB[strength]);
1097         trial.setToConcat(previous, UB[probe.getStrength(*toSort[j], *toSort[i])]);
1098         if(probe.compare(trial, *toSort[i]) > 0) {
1099           foundExp = true;
1100         }
1101       //}
1102       if(strength == UCOL_QUATERNARY) {
1103         strength = UCOL_IDENTICAL;
1104       } else {
1105         strength = (UColAttributeValue)(strength + 1);
1106       }
1107     }
1108     // calculate the expanding sequence
1109     if(foundExp && !sequenceCompleted) {
1110       expIndexSize = 0;
1111       expansionLine.clear();
1112       exCount++;
1113       // we will start from strength between the expansion
1114       // and the target (toSort[i] and toSort[j]. First we
1115       // will add as many primaries as possible. Then we will
1116       // try to add secondary pieces and then tertiary.
1117       // found an expansion - what is the expanding sequence?
1118 
1119       expStrength = UCOL_PRIMARY;
1120       while(!sequenceCompleted) {
1121         k = 0;
1122         prevK = 0;
1123         while(k < size) {
1124           if(expansionLine.len > 15) {
1125             sequenceCompleted = true;
1126             break;
1127           }
1128           while(k < size && toSort[k]->strength != UCOL_PRIMARY) {
1129             k++;
1130           }
1131           // nothing found
1132           if(k == size) {
1133             break;
1134           }
1135           // we need to skip over reordering things. If they were worthy, they would
1136           // have been detected in the previous iteration.
1137           //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
1138             //k++;
1139             //continue;
1140           //}
1141           trial = *previous;
1142           trial.append(expansionLine);
1143           trial.append(*toSort[k]);
1144           if(toSort[k]->name[0] == 0x0067) {
1145             int32_t putBreakPointHere = 0;
1146           }
1147           if(probe.compare(trial, *toSort[i]) > 0) {
1148             if(prevK) {
1149               // we got into situation where we have ch > ch+dot-below
1150               // however, ch is a contraction and therefore we cannot use
1151               // it properly. If we have hit on a contraction, we'll just try
1152               // to continue. Probably need more logic here.
1153               if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) {
1154                 expansionLine.append(*toSort[prevK]);
1155                 expIndexes[expIndexSize++] = prevK;
1156                 break;
1157               } else {
1158                 int32_t putBreakPointHere = 0;
1159               }
1160             } else {
1161               sequenceCompleted = true;
1162               break;
1163             }
1164             //break;
1165           }
1166           prevK = k;
1167           k++;
1168         }
1169         if(!prevK || k == size) {
1170           break;
1171         }
1172       }
1173       // after this we have primaries lined up.
1174       // we are going to goose up with secondaries and
1175       // tertiaries
1176       trial.setToConcat(toSort[j], &expansionLine);
1177       expStrength = probe.getStrength(trial, *toSort[i]);
1178       if(expStrength > UCOL_PRIMARY) {
1179         if(expStrength == UCOL_SECONDARY || expStrength == UCOL_OFF) {
1180           j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_SECONDARY);
1181           trial.setToConcat(toSort[j], &expansionLine);
1182           expStrength = probe.getStrength(trial, *toSort[i]);
1183           if(expStrength == UCOL_TERTIARY) {
1184             j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
1185           }
1186         } else if(expStrength == UCOL_TERTIARY) {
1187           j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
1188         }
1189       }
1190       trial.setToConcat(toSort[j], &expansionLine);
1191       expStrength = probe.getStrength(trial, *toSort[i]);
1192       if(expansionLine.len) {
1193         if(expansionLine.name[0] == 0x73 && expansionLine.name[1] == 0x7a) {
1194           int32_t putBreakpointhere = 0;
1195         }
1196         UBool isExpansionLineAContraction = (contractionsTable->get(UnicodeString(expansionLine.name, expansionLine.len)) != NULL);
1197         // we have an expansion line and an expansion. There could be some expansions where
1198         // the difference between expansion line and the end of expansion sequence is less or
1199         // equal than the expansion strength. These should probably be removed.
1200         int32_t diffLen = toSort[i]->len - expansionLine.len;
1201         if(diffLen > 0) {
1202           trial.setTo(UnicodeString(toSort[i]->name + diffLen, toSort[i]->len - diffLen));
1203         } else {
1204           trial = *toSort[i];
1205         }
1206         UColAttributeValue s1 = probe.getStrength(trial, expansionLine);
1207         if(s1 == UCOL_OFF) {
1208           s1 = probe.getStrength(expansionLine, trial);
1209         }
1210         if((!isExpansionLineAContraction && s1 >= expStrength) || (diffLen <= 0 && s1 == UCOL_IDENTICAL)) {
1211           contractionsTable->remove(UnicodeString(toSort[i]->name, toSort[i]->len));
1212           toSort[i]->isRemoved = true;
1213           if(toSort[i]->next && toSort[i]->previous) {
1214             toSort[i]->previous->next = toSort[i]->next;
1215           }
1216           if(toSort[i]->previous && toSort[i]->next) {
1217             toSort[i]->next->previous = toSort[i]->previous;
1218           }
1219 	      debug->log("Exp -N: ");
1220 	      debug->log(toSort[i]->toString(false));
1221 	      debug->log(" / ");
1222 	      debug->log(expansionLine.toString(false), true);
1223         }
1224         else
1225         {
1226           u_strncat(toSort[i]->expansionString, expansionLine.name, expansionLine.len);
1227           toSort[i]->isExpansion = true;
1228           toSort[i]->expStrength = expStrength;
1229           toSort[i]->expLen = expansionLine.len;
1230           toSort[i]->expansionString[toSort[i]->expLen] = 0;
1231           toSort[i]->expIndex = j;
1232         }
1233       }
1234     }
1235     if(toSort[i]->isExpansion == true) {
1236       if(debug->isOn()) {
1237         debug->log("Exp + : &");
1238         debug->log(toSort[j]->toString(false));
1239         debug->log(toSort[i]->strengthToString(toSort[i]->expStrength, true));
1240         debug->log(toSort[i]->toString(false));
1241         debug->log(" ");
1242         if(!toSort[j]->sortKey) {
1243           calculateSortKey(*toSort[j]);
1244         }
1245         debug->log(toSort[j]->dumpSortkey());
1246         debug->log(" ... ");
1247         if(!toSort[i]->sortKey) {
1248           calculateSortKey(*toSort[i]);
1249         }
1250         debug->log(toSort[i]->dumpSortkey());
1251         calculateSortKey(expansionLine);
1252         debug->log("/");
1253         debug->log(expansionLine.dumpSortkey(), true);
1254       }
1255 
1256     }
1257   }
1258   // after detecting expansions, we want to position them.
1259   // it is better to position expansions after all have been detected,
1260   // since otherwise we will change the ordering.
1261   for(i = size-1; i >= 0; i--) {
1262     if(toSort[i]->isExpansion) {
1263       if(toSort[i]->name[0] == 0x2A3) {
1264         int32_t putBreakPointHere = 0;
1265       }
1266       if(i) {
1267         if(toSort[i]->previous) {
1268           toSort[i]->previous->next = toSort[i]->next;
1269         }
1270       }
1271       if(i < size-1) {
1272         if(toSort[i]->next) {
1273           toSort[i]->next->previous = toSort[i]->previous;
1274         }
1275       }
1276       j = toSort[i]->expIndex;
1277       toSort[i]->next = toSort[j]->next;
1278       toSort[i]->previous = toSort[j];
1279       toSort[j]->next = toSort[i];
1280       if(toSort[i]->next) {
1281         toSort[i]->next->previous = toSort[i];
1282       }
1283       toSort[i]->strength = toSort[i]->expStrength;
1284     }
1285   }
1286   return exCount;
1287 }
1288 
1289 
1290 Line *
getFirst()1291 SortedLines::getFirst() {
1292   current = first;
1293   return current;
1294 }
1295 
1296 Line *
getLast()1297 SortedLines::getLast() {
1298   current = last;
1299   return current;
1300 }
1301 
1302 void
add(Line * line,UBool linkIn)1303 SortedLines::add(Line *line, UBool linkIn) {
1304   if(size++ == capacity) {
1305     // grow
1306   }
1307   lines[size] = *line;
1308   Line *toAdd = &lines[size];
1309   if(linkIn && first) {
1310     Line *current = first;
1311     while(current != NULL && probe.comparer(&current, &toAdd) < 0) {
1312       current = current->next;
1313     }
1314     if(current == NULL) {
1315       toAdd->previous = last;
1316       toAdd->next = NULL;
1317       if(last != NULL) {
1318         last->next = toAdd;
1319       }
1320       last = toAdd;
1321       if(first == NULL) {
1322         first = toAdd;
1323       }
1324     } else { // current != NULL
1325       toAdd->next = current;
1326       toAdd->previous = current->previous;
1327       if(current->previous) {
1328         current->previous->next = toAdd;
1329       } else {
1330         first = toAdd;
1331       }
1332       current->previous = toAdd;
1333     }
1334   }
1335 }
1336 
1337 
1338 Line *
getNext()1339 SortedLines::getNext()
1340 {
1341   if(current != NULL) {
1342     current=current->next;
1343   }
1344   return current;
1345 }
1346 
1347 Line *
getPrevious()1348 SortedLines::getPrevious()
1349 {
1350   if(current != NULL) {
1351     current=current->previous;
1352   }
1353   return current;
1354 }
1355 
1356 Line *
operator [](int32_t index)1357 SortedLines::operator[](int32_t index)
1358 {
1359   int32_t i = 0;
1360   Line *c = first;
1361   for(i = 0; i < index; i++) {
1362     if(c != NULL) {
1363       c = c->next;
1364     }
1365   }
1366   return c;
1367 }
1368 
1369 UnicodeString
arrayToString(Line ** sortedLines,int32_t linesSize,UBool pretty,UBool useLinks,UBool printSortKeys)1370 SortedLines::arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys) {
1371   UnicodeString result;
1372   int32_t i = 0;
1373 
1374   Line *line = NULL;
1375   Line *previous = sortedLines[0];
1376   if(printSortKeys && !sortkeys) {
1377     printSortKeys = false;
1378   }
1379   if(previous->isReset) {
1380     result.append(" & ");
1381     result.append(previous->name, previous->len);
1382     if(pretty) {
1383       result.append("        # ");
1384       result.append(previous->stringToName(previous->name, previous->len));
1385       result.append("\n");
1386     }
1387   } else if(!previous->isRemoved) {
1388     result.append(previous->toString(pretty));
1389     if(pretty) {
1390       result.append("\n");
1391     }
1392   }
1393   i = 1;
1394   while((i < linesSize && !useLinks) || (previous->next && useLinks)) {
1395     if(useLinks) {
1396       line = previous->next;
1397     } else {
1398       line = sortedLines[i];
1399     }
1400     if(line->isReset) {
1401       result.append(" &");
1402       result.append(line->name, line->len);
1403       if(pretty) {
1404         result.append("        # ");
1405         result.append(line->stringToName(line->name, line->len));
1406         result.append("\n");
1407       }
1408     } else if(!line->isRemoved) {
1409       if(i > 0) {
1410         result.append(line->strengthToString(line->strength, pretty));
1411       }
1412       result.append(line->toString(pretty));
1413       if(printSortKeys) {
1414         result.append(line->dumpSortkey());
1415       }
1416       if(pretty) {
1417         result.append("\n");
1418       }
1419     }
1420     previous = line;
1421     i++;
1422   }
1423   return result;
1424 }
1425 
SortedLines(FILE * file,UPrinter * logger,UPrinter * debug,UErrorCode & status)1426 SortedLines::SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status) :
1427 toSort(NULL),
1428 toSortCapacity(0),
1429 lines(NULL),
1430 size(0),
1431 capacity(0),
1432 first(NULL),
1433 last(NULL),
1434 logger(logger),
1435 debug(debug),
1436 contractionsTable(NULL),
1437 duplicators(NULL),
1438 maxExpansionPrefixSize(0),
1439 wordSort(false),
1440 frenchSecondary(false),
1441 upperFirst(false),
1442 sortkeys(NULL),
1443 sortkeyOffset(0)
1444 {
1445   debug->log("*** loading a dump\n");
1446   memset(UB, 0, sizeof(UB));
1447   int32_t i = 0;
1448   for(i = 0; i < UCOL_OFF; i++) {
1449     UB[i] = &empty;
1450   }
1451 
1452   int32_t newFrench, newUpperFirst;
1453   fscanf(file, "%i,%i,%i\n", &size, &newFrench, &newUpperFirst);
1454   debug->log("Read size %i, frenchSecondary %i and upperFirst %i\n", size, newFrench, newUpperFirst);
1455   frenchSecondary = (UBool)newFrench;
1456   upperFirst = (UBool)newUpperFirst;
1457   capacity = size;
1458   lines = new Line[capacity];
1459   i = 0;
1460 
1461   char buff[256];
1462 
1463   while(fgets(buff, 256, file)) {
1464     if(i % 20 == 0) {
1465       logger->log("\rLine: %04i", i, buff);
1466     }
1467     lines[i].initFromString(buff, 256, status);
1468     if(i) {
1469       lines[i].previous = &lines[i-1];
1470       lines[i-1].next = &lines[i];
1471     }
1472     i++;
1473   }
1474   size = i;
1475   toSort = new Line*[size];
1476   setSortingArray(toSort, lines, size);
1477   first = &lines[0];
1478   last = &lines[size-1];
1479 }
1480 
1481 void
toFile(FILE * file,UBool useLinks,UErrorCode & status)1482 SortedLines::toFile(FILE *file, UBool useLinks, UErrorCode &status)
1483 {
1484   fprintf(file, "%i,%i,%i\n", size, frenchSecondary, upperFirst);
1485   int32_t i = 1;
1486   Line *previous = toSort[0];
1487   Line *line = NULL;
1488   char buff[256];
1489   previous->write(buff, 256, status);
1490   fprintf(file, "%s\n", buff);
1491   fflush(file);
1492   while(previous->next) {
1493     if(useLinks) {
1494       line = previous->next;
1495     } else {
1496       line = toSort[i];
1497     }
1498     line->write(buff, 256, status);
1499     fprintf(file, "%s\n", buff);
1500     i++;
1501     previous = line;
1502   }
1503 }
1504 
1505 
1506 
1507 UnicodeString
toStringFromEmpty()1508 SortedLines::toStringFromEmpty() {
1509   UBool useLinks = false;
1510   UBool pretty = false;
1511   UnicodeString result;
1512   int32_t i = 0;
1513 
1514   Line *line = NULL;
1515   Line *previous = toSort[0];
1516   if(previous->isReset) {
1517     result.append(" & ");
1518     if(pretty) {
1519       result.append("\n");
1520     }
1521     result.append(previous->name, previous->len);
1522   } else if(!previous->isRemoved) {
1523     result.append(previous->toString(pretty));
1524     if(pretty) {
1525       result.append("\n");
1526     }
1527   }
1528   i = 1;
1529   while(i < size || previous->next) {
1530     if(useLinks) {
1531       line = previous->next;
1532     } else {
1533       line = toSort[i];
1534     }
1535     if(line->isReset) {
1536       result.append(" &");
1537       result.append(line->name, line->len);
1538       if(pretty) {
1539         result.append("        # ");
1540         result.append(line->stringToName(line->name, line->len));
1541         result.append("\n");
1542       }
1543     } else if(!line->isRemoved) {
1544       if(i > 0) {
1545         result.append(line->strengthToString(line->strengthFromEmpty, pretty));
1546       }
1547       result.append(line->toString(pretty));
1548       if(pretty) {
1549         result.append("\n");
1550       }
1551     }
1552     previous = line;
1553     i++;
1554   }
1555   return result;
1556 }
1557 
1558 UnicodeString
toString(UBool useLinks)1559 SortedLines::toString(UBool useLinks)
1560 {
1561   return arrayToString(toSort, size, false, useLinks, false);
1562 }
1563 
1564 
1565 UnicodeString
toPrettyString(UBool useLinks,UBool printSortKeys)1566 SortedLines::toPrettyString(UBool useLinks, UBool printSortKeys)
1567 {
1568   return arrayToString(toSort, size, true, useLinks, printSortKeys);
1569 }
1570 
1571 UnicodeString
toOutput(const char * format,const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1572 SortedLines::toOutput(const char *format,
1573                        const char *locale, const char *platform, const char *reference,
1574                        UBool useLinks, UBool initialize, UBool moreToCome) {
1575   if(strcmp(format, "HTML") == 0) {
1576     return toHTML(locale, platform, reference, useLinks, initialize, moreToCome);
1577   } else if(strcmp(format, "XML") == 0) {
1578     return toXML(locale, platform, reference, useLinks, initialize, moreToCome);
1579   } else {
1580     return toBundle(locale, platform, reference, useLinks, initialize, moreToCome);
1581   }
1582 }
1583 
1584 
1585 UnicodeString
toHTML(const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1586 SortedLines::toHTML(const char *locale,
1587                        const char *platform, const char *reference,
1588                        UBool useLinks, UBool initialize, UBool moreToCome)
1589 {
1590   UnicodeString result;
1591   int32_t i = 0;
1592   if(initialize) {
1593     result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n");
1594     result.append("# Collation data resource bundle generated for locale: ");
1595     result.append(locale);
1596     result.append("<br>\n# For platform ");
1597     result.append(platform);
1598     result.append(" reference platform ");
1599     result.append(reference);
1600     result.append("<br><br>\n\n\n");
1601 
1602     result.append(locale);
1603     if(platform) {
1604       result.append("_");
1605       result.append(platform);
1606     }
1607     if(reference) {
1608       result.append("_vs_");
1609       result.append(reference);
1610     }
1611     result.append("&nbsp;{<br>\n");
1612 
1613     result.append("&nbsp;&nbsp;collations&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;standard&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sequence&nbsp;{<br>\n");
1614   }
1615 
1616   if(frenchSecondary) {
1617     result.append("[backwards 2]<br>\n");
1618   }
1619   if(upperFirst) {
1620     result.append("[casefirst upper]<br>\n");
1621   }
1622 
1623   Line *line = toSort[0];
1624 
1625   i = 0;
1626   while((i < size && !useLinks) || (line->next && useLinks)) {
1627     if(line->isReset || !line->isRemoved) {
1628       result.append(line->toHTMLString());
1629     }
1630     i++;
1631     if(useLinks) {
1632       line = line->next;
1633     } else {
1634       line = toSort[i];
1635     }
1636   }
1637   if(!moreToCome) {
1638     result.append("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;}<br>\n}<br>\n");
1639 
1640     result.append("</html>\n");
1641   }
1642 
1643   return result;
1644 }
1645 
1646 UnicodeString
toXML(const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1647 SortedLines::toXML(const char *locale,
1648                        const char *platform, const char *reference,
1649                        UBool useLinks, UBool initialize, UBool moreToCome)
1650 {
1651   UnicodeString result;
1652   int32_t i = 0;
1653   if(initialize) {
1654     result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n");
1655     result.append("# Collation data resource bundle generated for locale: ");
1656     result.append(locale);
1657     result.append("<br>\n# For platform ");
1658     result.append(platform);
1659     result.append(" reference platform ");
1660     result.append(reference);
1661     result.append("<br><br>\n\n\n");
1662 
1663     result.append(locale);
1664     if(platform) {
1665       result.append("_");
1666       result.append(platform);
1667     }
1668     if(reference) {
1669       result.append("_vs_");
1670       result.append(reference);
1671     }
1672     result.append("&nbsp;{<br>\n");
1673 
1674     result.append("&nbsp;&nbsp;collations&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;standard&nbsp;{<br>\n&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sequence&nbsp;{<br>\n");
1675   }
1676 
1677   if(frenchSecondary) {
1678     result.append("[backwards 2]<br>\n");
1679   }
1680   if(upperFirst) {
1681     result.append("[casefirst upper]<br>\n");
1682   }
1683 
1684   Line *line = toSort[0];
1685 
1686   i = 0;
1687   while((i < size && !useLinks) || (line->next && useLinks)) {
1688     if(line->isReset || !line->isRemoved) {
1689       result.append(line->toHTMLString());
1690     }
1691     i++;
1692     if(useLinks) {
1693       line = line->next;
1694     } else {
1695       line = toSort[i];
1696     }
1697   }
1698   if(!moreToCome) {
1699     result.append("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;&nbsp;&nbsp;}<br>\n&nbsp;&nbsp;}<br>\n}<br>\n");
1700 
1701     result.append("</html>\n");
1702   }
1703 
1704   return result;
1705 }
1706 
1707 UnicodeString
toBundle(const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1708 SortedLines::toBundle(const char *locale,
1709                        const char *platform, const char *reference,
1710                        UBool useLinks, UBool initialize, UBool moreToCome)
1711 {
1712   UnicodeString result;
1713   int32_t i = 0;
1714 
1715   if(initialize) {
1716     result.append("// Collation data resource bundle generated for locale: ");
1717     result.append(locale);
1718     result.append("\n// For platform ");
1719     result.append(platform);
1720     result.append(" reference platform ");
1721     result.append(reference);
1722     result.append("\n\n\n");
1723 
1724     result.append(locale);
1725     /*
1726     if(platform) {
1727       result.append("_");
1728       result.append(platform);
1729     }
1730     if(reference) {
1731       result.append("_vs_");
1732       result.append(reference);
1733     }
1734     */
1735     result.append(" {\n");
1736 
1737     result.append("  collations {\n   standard {\n    Sequence {\n");
1738   }
1739 
1740   if(frenchSecondary) {
1741     result.append("[backwards 2]\n");
1742   }
1743   if(upperFirst) {
1744     result.append("[casefirst upper]\n");
1745   }
1746 
1747   Line *line = toSort[0];
1748 
1749   i = 0;
1750   while((i < size && !useLinks) || (line->next && useLinks)) {
1751     if(line->isReset || !line->isRemoved) {
1752       result.append(line->toBundleString());
1753     }
1754     i++;
1755     if(useLinks) {
1756       line = line->next;
1757     } else {
1758       line = toSort[i];
1759     }
1760   }
1761 
1762   if(!moreToCome) {
1763     result.append("    }\n   }\n  }\n}\n");
1764   }
1765 
1766   return result;
1767 }
1768 
1769 
1770 int32_t
getSize() const1771 SortedLines::getSize() const {
1772   return repertoire.size();
1773 }
1774 
1775 void
reduceDifference(SortedLines & reference)1776 SortedLines::reduceDifference(SortedLines& reference) {
1777   UErrorCode status = U_ZERO_ERROR;
1778   if(upperFirst) {
1779     swapCase();
1780   }
1781   // both sorted lines structures need to have established links and strengths
1782   // We walk down both structures and note differences. These
1783   // differences will modify this by removng elements, setting resets
1784   // etc...
1785   // we will prefer insertions from tailoring to reference, then deletions
1786   // there are two tables that keep seen elements.
1787   Hashtable *seenThis = new Hashtable();
1788   Hashtable *seenReference = new Hashtable();
1789 
1790 
1791   UBool found = false;
1792   UBool finished = false;
1793   const int32_t lookForward = 20;
1794   int32_t tailoringMove = 0;
1795   //int32_t referenceSize = reference.getSize();
1796   Line *refLine = reference.getFirst();
1797   Line *refLatestEqual = refLine;
1798   refLine = refLine->next;
1799   Line *myLine = getFirst();
1800   Line *myLatestEqual = myLine;
1801   myLatestEqual->isRemoved = true;
1802   myLine = myLine->next;
1803   while(myLine && refLine) {
1804     found = false;
1805     while(myLine && refLine && myLine->equals(*refLine)) {
1806       myLatestEqual = myLine;
1807       myLatestEqual->isRemoved = true;
1808       myLine = myLine->next;
1809       refLatestEqual = refLine;
1810       refLine = refLine->next;
1811       if(refLine == NULL && myLine == NULL) {
1812         finished = true;
1813       }
1814     }
1815     if(myLine) {
1816       myLine->cumulativeStrength = myLine->strength;
1817     }
1818     if(refLine) {
1819       refLine->cumulativeStrength = refLine->strength;
1820     }
1821 
1822     // here is the difference
1823     while(!found && !finished) {
1824       tailoringMove = 0;
1825       if(myLine && refLine) {
1826         if(myLine->cumulativeStrength > refLine->cumulativeStrength) {
1827           // tailoring z <<< x, UCA z < y
1828           while(myLine->cumulativeStrength > refLine->cumulativeStrength) {
1829             myLine = myLine->next;
1830             if(myLine) {
1831               transferCumulativeStrength(myLine->previous, myLine);
1832             } else {
1833               break;
1834             }
1835           }
1836         } else if(myLine->cumulativeStrength < refLine->cumulativeStrength) {
1837           // tailoring z < x, UCA z <<< y
1838           while(myLine->cumulativeStrength < refLine->cumulativeStrength) {
1839             seenReference->put(UnicodeString(refLine->name, refLine->len), refLine, status);
1840             refLine = refLine->next;
1841             if(refLine) {
1842               transferCumulativeStrength(refLine->previous, refLine);
1843             } else {
1844               break;
1845             }
1846           }
1847         }
1848         // this is the interesting point. Now we search for character match
1849         while(myLine && refLine && (!myLine->equals(*refLine) || myLine->strength == UCOL_IDENTICAL)
1850           && tailoringMove < lookForward) {
1851           if(seenThis->get(UnicodeString(refLine->name, refLine->len))) {
1852             // we are not interested in stuff from the reference that is already accounted
1853             // for in the tailoring.
1854             refLine = refLine->next;
1855             if(refLine) {
1856               transferCumulativeStrength(refLine->previous, refLine);
1857             }
1858           } else {
1859             myLine = myLine->next;
1860             if(myLine) {
1861               transferCumulativeStrength(myLine->previous, myLine);
1862               if(!seenReference->get(UnicodeString(myLine->name, myLine->len))) {
1863                 tailoringMove++;
1864               }
1865             }
1866           }
1867         }
1868       }
1869       if(refLine == NULL) { // ran out of reference
1870         // this is the tail of tailoring - the last insertion
1871         myLine  = NULL;
1872         found = true;
1873       } else if(tailoringMove == lookForward || myLine == NULL) { // run over treshold or out of tailoring
1874         tailoringMove = 0;
1875         // we didn't find insertion after all
1876         // we will try substitution next
1877         // reset the tailoring pointer
1878         myLine = myLatestEqual->next;
1879         // move the reference
1880         refLine = refLine->next;
1881         if(refLine) {
1882           transferCumulativeStrength(refLine->previous, refLine);
1883         }
1884       } else { // we found an insertion
1885         tailoringMove = 0;
1886         if(myLine->strength != refLine->strength) {
1887           while(myLine && refLine && *myLine == *refLine
1888             && (myLine->strength != refLine->strength
1889             || myLine->strength == UCOL_IDENTICAL)) {
1890             myLine = myLine->next;
1891             refLine = refLine->next;
1892           }
1893           if(*myLine != *refLine) {
1894             continue;
1895           }
1896         }
1897         if(myLine && refLine && myLine->previous->strength < myLine->strength) {
1898           myLine = myLine->next;
1899           refLine = refLine->next;
1900           if(*myLine != *refLine) {
1901             continue;
1902           }
1903         }
1904         found = true;
1905       }
1906       if(found) {
1907         if(myLatestEqual->next != myLine || refLine == NULL) {
1908           Line *myStart = NULL;
1909           // this is a reset and a sequence
1910           // myLatestEqual points at the last point that was the same
1911           // This point will be a reset
1912           if(myLine && refLine) { // if there is anything more to do - it might be worth saving it
1913             myStart = myLatestEqual;
1914             while(myStart != myLine) {
1915               seenThis->put(UnicodeString(myStart->name, myStart->len), myStart, status);
1916               myStart = myStart->next;
1917             }
1918           }
1919           // Try to weed out stuff that is not affected, like:
1920           // Tailoring:
1921           // <<<S<<\u017F<\u0161<<<\u0160<t
1922           // UCA:
1923           // <<<S<<\u0161<<<\u0160<<\u017F<t
1924           // Result:
1925           // &S<<\u017F<\u0161<<<\u0160
1926           // we have a sequence that spans from myLatestEqual to myLine (that one could be NULL,
1927           // so we have to go down from myLatestEqual.
1928           // Basically, for every element, we want to see the strongest cumulative difference
1929           // from the reset point. If the cumulative difference is the same in both the reference and
1930           // tailoring, that element could be removed.
1931           calculateCumulativeStrengths(myLatestEqual, myLine);
1932           calculateCumulativeStrengths(refLatestEqual, refLine);
1933           myStart = myLatestEqual;
1934           int32_t removed = 0;
1935           int32_t traversed = 0;
1936           while(myStart && myStart != myLine) {
1937             Line *refStart = refLatestEqual;
1938             while(refStart && refStart != refLine) {
1939               if(*myStart == *refStart) {
1940                 if(myStart->cumulativeStrength == refStart->cumulativeStrength) {
1941                   myStart->isRemoved = true;
1942                   removed++;
1943                 }
1944               }
1945               refStart = refStart->next;
1946             }
1947             myStart = myStart->next;
1948             traversed++;
1949           }
1950           if(removed < traversed) {
1951             myLatestEqual->isReset = true;
1952             myLatestEqual->isRemoved = false;
1953           }
1954 
1955           myLatestEqual = myLine;
1956         }
1957       }
1958     }
1959   }
1960 
1961   if(upperFirst) {
1962     //swapCase();
1963   }
1964 
1965   delete seenThis;
1966   delete seenReference;
1967 
1968 }
1969 
1970 void
transferCumulativeStrength(Line * previous,Line * that)1971 SortedLines::transferCumulativeStrength(Line *previous, Line *that) {
1972   if(that->strength > previous->cumulativeStrength) {
1973     that->cumulativeStrength = previous->cumulativeStrength;
1974   } else {
1975     that->cumulativeStrength = that->strength;
1976   }
1977 }
1978 
1979 void
calculateCumulativeStrengths(Line * start,Line * end)1980 SortedLines::calculateCumulativeStrengths(Line *start, Line *end) {
1981   // start is a reset - end may be NULL
1982   start = start->next;
1983   UColAttributeValue cumulativeStrength = UCOL_OFF;
1984   while(start && start != end) {
1985     if(start->strength < cumulativeStrength) {
1986       cumulativeStrength = start->strength;
1987     }
1988     start->cumulativeStrength = cumulativeStrength;
1989     start = start->next;
1990   }
1991 }
1992 
1993 
1994 void
getRepertoire(UnicodeSet & fillIn)1995 SortedLines::getRepertoire(UnicodeSet &fillIn) {
1996   fillIn.clear();
1997   fillIn.addAll(repertoire);
1998 }
1999 
2000 
2001 void
removeDecompositionsFromRepertoire()2002 SortedLines::removeDecompositionsFromRepertoire() {
2003   UnicodeSetIterator repertoireIter(repertoire);
2004   UErrorCode status = U_ZERO_ERROR;
2005   UChar string[256];
2006   UChar composed[256];
2007   int32_t len = 0, compLen = 0;
2008   UnicodeString compString;
2009   UnicodeSet toRemove;
2010 
2011   while(repertoireIter.next()) {
2012     len = 0;
2013     if(repertoireIter.isString()) { // process a string
2014       len = repertoireIter.getString().length();
2015       u_memcpy(string, repertoireIter.getString().getBuffer(), len);
2016     } else { // process code point
2017       UBool isError = false;
2018       U16_APPEND(string, len, 25, repertoireIter.getCodepoint(), isError);
2019     }
2020     string[len] = 0; // zero terminate, for our evil ways
2021     compLen = unorm_normalize(string, len, UNORM_NFC, 0, composed, 256, &status);
2022     if(compLen != len || u_strcmp(string, composed) != 0) {
2023       compString.setTo(composed, compLen);
2024       if(repertoire.contains(compString)) {
2025         toRemove.add(UnicodeString(string, len));
2026       }
2027     }
2028   }
2029   debug->log("\nRemoving\n");
2030   debug->log(toRemove.toPattern(compString, true), true);
2031   repertoire.removeAll(toRemove);
2032 }
2033 
2034 
2035 void
swapCase()2036 SortedLines::swapCase()
2037 {
2038   int32_t i = 0;
2039   for(i = 0; i < size; i++) {
2040     toSort[i]->swapCase();
2041   }
2042 }
2043 
2044 void
calculateSortKey(Line & line)2045 SortedLines::calculateSortKey(Line &line)
2046 {
2047   if(!sortkeys) {
2048     sortkeys = new uint8_t[size*1024];
2049     memset(sortkeys, 0, size*1024);
2050   }
2051   line.sortKey = sortkeys+sortkeyOffset;
2052   sortkeyOffset += probe.getSortKey(line, sortkeys+sortkeyOffset, size*256-sortkeyOffset);
2053 }
2054 
2055 
2056 void
calculateSortKeys()2057 SortedLines::calculateSortKeys()
2058 {
2059   if(sortkeys) {
2060     delete[] sortkeys;
2061   }
2062   sortkeyOffset = 0;
2063   sortkeys = new uint8_t[size*256];
2064   memset(sortkeys, 0, size*256);
2065   int32_t i = 0;
2066   for(i = 0; i < size; i++) {
2067     calculateSortKey(*toSort[i]);
2068   }
2069 }
2070