1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 #include "sortedlines.h"
4
codePointCmp(const void * a,const void * b)5 static int codePointCmp(const void *a, const void *b) {
6 return u_strcmp((*(Line **)a)->name, (*(Line **)b)->name);
7 }
8
SortedLines(const UnicodeSet & set,const UnicodeSet & excludeBounds,const StrengthProbe & probe,UPrinter * logger,UPrinter * debug)9 SortedLines::SortedLines(const UnicodeSet &set, const UnicodeSet &excludeBounds, const StrengthProbe &probe,
10 UPrinter *logger, UPrinter *debug) :
11 toSort(NULL),
12 toSortCapacity(0),
13 lines(NULL),
14 size(0),
15 capacity(0),
16 repertoire(set),
17 excludeBounds(excludeBounds),
18 probe(probe),
19 first(NULL),
20 last(NULL),
21 logger(logger),
22 debug(debug),
23 contractionsTable(NULL),
24 duplicators(NULL),
25 maxExpansionPrefixSize(0),
26 wordSort(false),
27 frenchSecondary(false),
28 upperFirst(false),
29 sortkeys(NULL),
30 sortkeyOffset(0)
31 {
32 memset(UB, 0, sizeof(UB));
33 int32_t i = 0;
34 for(i = 0; i < UCOL_OFF; i++) {
35 UB[i] = ∅
36 }
37 init();
38 }
39
~SortedLines()40 SortedLines::~SortedLines()
41 {
42 delete[] lines;
43 if(sortkeys) {
44 delete[] sortkeys;
45 }
46 if(toSort) {
47 delete[] toSort;
48 }
49 if(contractionsTable) {
50 delete contractionsTable;
51 }
52 if(duplicators) {
53 delete duplicators;
54 }
55 }
56
57 void
getBounds(UErrorCode & status)58 SortedLines::getBounds(UErrorCode &status) {
59 // first sort through the set
60 debug->log(toString(), true);
61 int32_t i = 0, j = 0;
62 UColAttributeValue strength = UCOL_OFF;
63 for(i = 0; i < size; i++) {
64 if(toSort[i]->strengthFromEmpty < strength) {
65 if(i && strength < UCOL_OFF) {
66 //u_strcpy(UB[strength], toSort[i-1]->name);
67 j = 1;
68 while(excludeBounds.contains(UnicodeString(toSort[i-j]->name, toSort[i-j]->len))) {
69 j++;
70 }
71 UB[strength] = toSort[i-j];
72 }
73 strength = toSort[i]->strengthFromEmpty;
74 if(strength == UCOL_PRIMARY) {
75 probe.SE = toSort[i]->name[0];
76 }
77 }
78 }
79 //u_strcpy(UB[strength], toSort[size-1]->name);
80 // a different solution for bounds: go from end and see if the guys on the top
81 // cause duplication for things
82 UChar dupch[] = { 0x0020, 0x0030, 0x0042, 0x0051, 0x0062, 0x0071, 0x0391, 0x0396, 0x03b1, 0x03b6 };
83 j = 1;
84 Line dup;
85 Line bound;
86 int32_t dups = 0;
87 while(j < size) {
88 dups = 0;
89 for(i = 0; i < sizeof(dupch)/sizeof(dupch[0]); i++) {
90 dup.setTo(dupch[i]);
91 dup.append(dupch[i]);
92 bound.setTo(dupch[i]);
93 bound.append(toSort[size-j]->name, toSort[size-j]->len);
94 if(probe.getStrength(dup, bound) >= UCOL_IDENTICAL) {
95 dups++;
96 }
97 }
98 if(dups == 0) {
99 break;
100 } else {
101 if(!duplicators) {
102 duplicators = new Hashtable();
103 }
104 duplicators->put(UnicodeString(toSort[size-j]->name, toSort[size-j]->len), &toSort[size-j], status);
105 debug->log(toSort[size-j]->toString());
106 debug->log(" is not good enough to be an upper bound\n");
107 j++;
108 }
109 }
110 if(j == size) {
111 debug->log("Oi! I'm hallucinating. Will use the first upper bound");
112 delete duplicators;
113 duplicators = NULL;
114 j = 1;
115 }
116 /*
117 j = 1;
118 while(excludeBounds.contains(UnicodeString(toSort[size-j]->name, toSort[size-j]->len))) {
119 j++;
120 }
121 */
122 UB[strength] = toSort[size-j];
123 for(i = 0; i < UCOL_OFF; i++) {
124 if(UB[i]) {
125 //debug->log(UB[i], true);
126 debug->log(UB[i]->toString(true), true);
127 }
128 }
129 }
130
131 // classifies repertoire according to the strength of their difference
132 // from the empty string
133 void
classifyRepertoire()134 SortedLines::classifyRepertoire() {
135 UColAttributeValue strongestStrengthFromEmpty = UCOL_OFF;
136 int32_t lastChange = 0;
137 int32_t i = 0, j = 0;
138 while(i < size) // && probe.distanceFromEmptyString(*toSort[i]) > UCOL_PRIMARY)
139 {
140 toSort[i]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i]);
141 if(toSort[i]->strengthFromEmpty < strongestStrengthFromEmpty) {
142 strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
143 lastChange = i;
144 } else if (toSort[i]->strengthFromEmpty > strongestStrengthFromEmpty) {
145 // there is a problem in detection. Most probably a quaternary.
146 // why don't we try to interpolate
147 UColAttributeValue nextStrength = UCOL_OFF;
148 UColAttributeValue prevStrength = UCOL_OFF;
149 UColAttributeValue st = UCOL_OFF;
150
151 logger->log("Interpolating to get the distance from empty for Line ");
152 logger->log(toSort[i]->toString(true), true);
153
154 if(i) {
155 st = probe.getStrength(*toSort[i-1], *toSort[i]);
156 if(st == UCOL_OFF) {
157 logger->log("Cannot deduce distance from empty using previous element. Something is very wrong! Line:");
158 logger->log(toSort[i]->toString(true), true);
159 } else if(st == UCOL_IDENTICAL || st >= toSort[i-1]->strengthFromEmpty) {
160 prevStrength = toSort[i-1]->strengthFromEmpty;
161 } else if(st < toSort[i-1]->strengthFromEmpty) {
162 prevStrength = st;
163 }
164 toSort[i]->strengthFromEmpty = prevStrength;
165 }
166 if(i < size-2) {
167 toSort[i+1]->strengthFromEmpty = probe.distanceFromEmptyString(*toSort[i+1]);
168 st = probe.getStrength(*toSort[i+1], *toSort[i]);
169 if(st == UCOL_OFF) {
170 logger->log("Cannot deduce distance from empty using next element. Something is very wrong! Line:");
171 logger->log(toSort[i]->toString(true), true);
172 } else if(st == UCOL_IDENTICAL || st < toSort[i+1]->strengthFromEmpty) {
173 nextStrength = toSort[i+1]->strengthFromEmpty;
174 } else if(st >= toSort[i+1]->strengthFromEmpty) {
175 nextStrength = st;
176 }
177 if(i) {
178 if(prevStrength != nextStrength) {
179 logger->log("Inconsistent results from interpolation! Results will most likely be wrong\n");
180 }
181 }
182 toSort[i]->strengthFromEmpty = nextStrength;
183 }
184 /*
185 UColAttributeValue problemStrength = UCOL_PRIMARY;
186 for(j = lastChange; j < i ; j++) {
187 if(toSort[j]->strength > problemStrength) {
188 problemStrength = toSort[j]->strength;
189 }
190 }
191 for(j = lastChange; j < i ; j++) {
192 toSort[j]->strengthFromEmpty = problemStrength;
193 }
194 strongestStrengthFromEmpty = toSort[i]->strengthFromEmpty;
195 lastChange = i;
196 debug->log("Problem detected in distances from empty. Most probably word sort is on\n");
197 */
198 wordSort = true;
199 }
200 i++;
201 }
202 debug->log("Distances from empty string\n");
203 debug->log(toStringFromEmpty(), true);
204 }
205
206 void
analyse(UErrorCode & status)207 SortedLines::analyse(UErrorCode &status) {
208 frenchSecondary = probe.isFrenchSecondary(status);
209 if(U_FAILURE(status)) {
210 logger->log("Test for French secondary failed. Bailing out!\n");
211 return;
212 }
213 logger->log("French secondary value is %i\n", frenchSecondary, frenchSecondary);
214 upperFirst = probe.isUpperFirst(status);
215 if(U_FAILURE(status)) {
216 logger->log("Test for upper first failed. Bailing out!\n");
217 return;
218 }
219 logger->log("upper first value is %i\n", upperFirst, upperFirst);
220 sort(true, true);
221 classifyRepertoire();
222 getBounds(status);
223 //sort(true, true);
224 addContractionsToRepertoire(status);
225 //sort(true, true);
226 debug->log("\n*** Order after detecting contractions\n\n");
227 calculateSortKeys();
228 debug->log(toPrettyString(false, true), true);
229 detectExpansions();
230 }
231
init()232 void SortedLines::init()
233 {
234 size = repertoire.size();
235 capacity = 5*size;
236 lines = new Line[capacity];
237 init(repertoire, lines);
238 }
239
init(UnicodeSet & rep,Line * lin)240 void SortedLines::init(UnicodeSet &rep, Line *lin)
241 {
242
243 UnicodeSetIterator exemplarUSetIter(rep);
244 int32_t size = 0;
245
246 while(exemplarUSetIter.next()) {
247 Line *currLine = lin+size;
248 if(exemplarUSetIter.isString()) { // process a string
249 currLine->setTo(exemplarUSetIter.getString());
250 } else { // process code point
251 currLine->setTo(exemplarUSetIter.getCodepoint());
252 }
253 currLine->name[currLine->len] = 0; // zero terminate, for our evil ways
254 //currLine->index = size;
255 size++;
256 }
257 }
258
259 void
setSortingArray(Line ** sortingArray,Line * elements,int32_t sizeToSort)260 SortedLines::setSortingArray(Line **sortingArray, Line *elements, int32_t sizeToSort) {
261 int32_t i = 0;
262 for(i = 0; i < sizeToSort; i++) {
263 sortingArray[i] = &elements[i];
264 }
265 }
266
267 int32_t
setSortingArray(Line ** sortingArray,Hashtable * table)268 SortedLines::setSortingArray(Line **sortingArray, Hashtable *table) {
269 int32_t size = table->count();
270 int32_t hashIndex = -1;
271 const UHashElement *hashElement = NULL;
272 int32_t count = 0;
273 while((hashElement = table->nextElement(hashIndex)) != NULL) {
274 sortingArray[count++] = (Line *)hashElement->value.pointer;
275 }
276 return size;
277 }
278
279 void
sort(Line ** sortingArray,int32_t sizeToSort,UBool setStrengths,UBool link)280 SortedLines::sort(Line **sortingArray, int32_t sizeToSort, UBool setStrengths, UBool link) {
281 int32_t i = 0;
282 int32_t equalStart = 0;
283 UColAttributeValue equalStrength = UCOL_OFF;
284
285 qsort(sortingArray, sizeToSort, sizeof(Line *), probe.comparer);
286
287 if(setStrengths) { // analyze strengths
288 for(i = 1; i < sizeToSort; i++) {
289 sortingArray[i]->strength = probe.getStrength(*sortingArray[i-1], *sortingArray[i]);
290 }
291 // for equal guys, do the code point ordering
292
293 i = 1;
294 while(i < sizeToSort)
295 {
296 if(sortingArray[i]->strength == UCOL_IDENTICAL) {
297 equalStart = i - 1;
298 equalStrength = sortingArray[equalStart]->strength;
299 sortingArray[equalStart]->strength = UCOL_IDENTICAL;
300 while(i < sizeToSort && sortingArray[i]->strength == UCOL_IDENTICAL) {
301 i++;
302 }
303 qsort(sortingArray+equalStart, i-equalStart, sizeof(Line *), codePointCmp);
304 sortingArray[equalStart]->strength = equalStrength;
305 } else {
306 i++;
307 }
308 }
309
310 }
311
312
313
314 if(link) { // do the linking
315 for(i = 0; i < sizeToSort - 1; i++) {
316 Line *curr = *(sortingArray+i);
317 curr->next = *(sortingArray+i+1);
318 (*(sortingArray+i+1))->previous = curr;
319 }
320 }
321 }
322
323 void
sort(UBool setStrengths,UBool link)324 SortedLines::sort(UBool setStrengths, UBool link) {
325 if(toSortCapacity < size || !toSort) {
326 if(toSort) {
327 delete[] toSort;
328 }
329 toSort = new Line*[size*2];
330 toSortCapacity = size*2;
331 }
332
333 setSortingArray(toSort, lines, size);
334 sort(toSort, size, setStrengths, link);
335
336 first = last = NULL;
337
338 if(link) { // do the linking
339 first = *toSort;
340 last = *(toSort+size-1);
341 }
342 }
343
344 void
updateBounds(UnicodeSet & set)345 SortedLines::updateBounds(UnicodeSet &set) {
346 Line line;
347 UnicodeString s1;
348 UnicodeSetIterator it1(set);
349 while(it1.next()) {
350 if(!debug->isOn()) {
351 logger->log(".");
352 }
353 if(it1.isString()) { // process a string
354 s1.setTo(it1.getString());
355 } else { // process code point
356 s1.setTo(it1.getCodepoint());
357 }
358 //line.setTo(s1);
359 UColAttributeValue strength = probe.distanceFromEmptyString(s1);
360 if(probe.compare(UnicodeString(UB[strength]->name), s1) < 0) {
361 // TODO: leak here - fixit!
362 UB[strength] = new Line(s1);
363 //u_strcpy(UB[strength], s1.getTerminatedBuffer());
364 }
365 }
366
367
368
369 }
370
addAll(Line * toAdd,int32_t toAddSize)371 void SortedLines::addAll(Line* toAdd, int32_t toAddSize)
372 {
373 if(size+toAddSize > capacity) {
374 int32_t doGrowingBreakpoint = 0;
375 // we need to do growing here
376 }
377 int32_t i = 0;
378
379 for(i = 0; i < toAddSize; i++) {
380 lines[size+i] = toAdd[i];
381 }
382 size += toAddSize;
383 }
384
setDistancesFromEmpty(Line * array,int32_t arraySize)385 void SortedLines::setDistancesFromEmpty(Line* array, int32_t arraySize)
386 {
387 int32_t i = 0;
388 for(i = 0; i < arraySize; i++) {
389 array[i].strengthFromEmpty = probe.distanceFromEmptyString(array[i]);
390 }
391 }
392
393
394 // adds contractions in to repertoire
addContractionsToRepertoire(UErrorCode & status)395 int32_t SortedLines::addContractionsToRepertoire(UErrorCode &status)
396 {
397 logger->log("\n*** Detecting contractions\n\n");
398 contractionsTable = new Hashtable();
399 int32_t noConts = 0;
400 int32_t allocateSize = 50*size;
401 // first check for simple contractions
402 Line* delta = new Line[allocateSize];
403 Line** deltaSorted = new Line*[allocateSize];
404 Line* lesserToAddTo = new Line[allocateSize];
405 Line* newDelta = new Line[allocateSize];
406 Line** newDeltaSorted = new Line*[allocateSize];
407 Line* deltaP = delta;
408 Line** deltaPP = deltaSorted;
409 Line* newDeltaP = newDelta;
410 int32_t deltaSize = 0, lesserToAddToSize = 0, newDeltaSize = 0;
411 logger->log("++ Contraction detection generation 0\n");
412 noConts = detectContractions(toSort, size, toSort, size,
413 delta, deltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
414 setSortingArray(deltaSorted, delta, deltaSize);
415 sort(deltaSorted, deltaSize, true);
416
417 setDistancesFromEmpty(delta, deltaSize);
418 int32_t deltaPSize = deltaSize;
419 //updateBounds(delta);
420
421 int32_t generation = 0;
422 // if we found any, we have to try multiple contractions
423 // However, we want to prevent the contractions explosion
424 // if the number of simple contractions is greater than the
425 // starting size, chances are that we either have an algorithmic
426 // contraction (like iteration marks on w2k) or something
427 // is seriosly wrong.
428 if(deltaPSize < size/2) {
429 while (deltaPSize && generation < 1) {
430 generation++;
431 logger->log("\n++ Contraction detection generation %i\n", generation, generation);
432 // find more, but avoid testing the combinations we already have
433 noConts += detectContractions(toSort, size, deltaPP, deltaPSize,
434 newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
435 noConts += detectContractions(deltaPP, deltaPSize, toSort, size,
436 newDeltaP, newDeltaSize, lesserToAddTo, lesserToAddToSize, 3*size, status);
437 calculateSortKeys();
438
439 addAll(deltaP, deltaPSize);
440 setSortingArray(toSort, lines, size);
441 sort(true, true);
442 setSortingArray(newDeltaSorted, newDeltaP, newDeltaSize);
443 sort(newDeltaSorted, newDeltaSize, true);
444
445 // if no new ones, bail
446 //if (newDeltaSize == 0) break;
447
448 deltaPSize = newDeltaSize;
449 newDeltaSize = 0;
450 if(deltaP == delta) {
451 deltaP = newDelta;
452 deltaPP = newDeltaSorted;
453 newDeltaP = delta;
454 } else {
455 deltaP = delta;
456 deltaPP = deltaSorted;
457 newDeltaP = newDelta;
458 }
459 setDistancesFromEmpty(deltaP, deltaPSize);
460 }
461 }
462 status = U_ZERO_ERROR;
463 // add stuff from the last batch
464 addAll(deltaP, deltaPSize);
465
466 // warning: we don't add the lesser ones in recursively, since they will
467 // infinitely loop
468 setDistancesFromEmpty(lesserToAddTo, lesserToAddToSize);
469 addAll(lesserToAddTo, lesserToAddToSize);
470 setSortingArray(toSort, lines, size);
471 sort(true, true);
472
473 delete[] deltaSorted;
474 delete[] delta;
475 delete[] lesserToAddTo;
476 delete[] newDeltaSorted;
477 delete[] newDelta;
478 return noConts;
479 }
480
481
detectContractions(Line ** firstRep,int32_t firstSize,Line ** secondRep,int32_t secondSize,Line * toAddTo,int32_t & toAddToSize,Line * lesserToAddTo,int32_t & lesserToAddToSize,int32_t capacity,UErrorCode & status)482 int32_t SortedLines::detectContractions(Line **firstRep, int32_t firstSize,
483 Line **secondRep, int32_t secondSize,
484 Line *toAddTo, int32_t &toAddToSize,
485 Line *lesserToAddTo, int32_t &lesserToAddToSize,
486 int32_t capacity, UErrorCode &status)
487 {
488 int32_t noConts = 0;
489 int i = 0, j = 0, k = 0;
490 Line lower, upper, trial, toAdd, helper;
491 UChar32 firstStart, firstEnd, secondStart;
492 UChar NFCTrial[256];
493 int32_t NFCTrialLen = 0;
494 UBool thai;
495 i = -1;
496 while(i < firstSize-1 && U_SUCCESS(status)) {
497 i++;
498 if(!debug->isOn()) {
499 logger->log("\rTesting %05i/%05i. Found %05i conts.", i, firstSize, noConts);
500 }
501 U16_GET(firstRep[i]->name, 0, 0, firstRep[i]->len, firstStart);
502 if(uscript_getScript(firstStart, &status) == USCRIPT_HAN || firstRep[i]->strengthFromEmpty > UCOL_PRIMARY) //UCOL_TERTIARY)
503 {
504 continue;
505 }
506 lower = *firstRep[i];
507 for(j = 0; j < secondSize; j++) {
508 if(noConts == capacity) {
509 return noConts;
510 }
511 U16_GET(secondRep[j]->name, 0, 0, secondRep[j]->len, secondStart);
512 if(firstStart == 0x41 && secondStart == 0x308) {
513 int32_t putBreakPointHere = 0;
514 }
515 if(uscript_getScript(secondStart, &status) == USCRIPT_HAN) // || secondRep[j]->strengthFromEmpty > UCOL_TERTIARY)
516 {
517 continue;
518 }
519 if(duplicators && duplicators->get(UnicodeString(secondRep[j]->name, secondRep[j]->len)) != NULL) {
520 debug->log("Skipping duplicator ");
521 debug->log(secondRep[j]->toString(), true);
522 continue;
523 }
524
525 if(firstRep[i]->name[0] == 0x61 && secondRep[j]->name[0] == 0x308) {
526 int32_t putBreakpointhere = 0;
527 }
528 upper.setToConcat(firstRep[i], UB[UCOL_PRIMARY]);
529 //upper.setToConcat(firstRep[i], UB[secondRep[j]->strengthFromEmpty]);
530 toAdd.setToConcat(firstRep[i], secondRep[j]);
531 U16_GET(firstRep[i]->name, 0, firstRep[i]->len-1, firstRep[i]->len, firstEnd);
532 if((thai = u_hasBinaryProperty(firstEnd, UCHAR_LOGICAL_ORDER_EXCEPTION))) {
533 // this means that the lower is single reordering character
534 // if we do the lower test without taking this into account,
535 // we'll comparing the secondRep directly to Thai. We add UB[UCOL_PRIMARY] to
536 // end of lower and in the middle of trial, so we will have
537 // lower = Thai + UB, trial Thai + UB + x, resolving to
538 // UB + Thai vs UB + Thai + x.
539 // for upper bound, we do the similar, so we have
540 // upper = Thai + UB + UB, trial = Thai + UB + x,
541 // resolving to UB + Thai + UB vs UB + Thai + x
542 if(secondRep[j]->firstCC) {
543 UChar32 UBChar;
544 U16_GET(UB[UCOL_SECONDARY]->name, 0, 0, UB[UCOL_SECONDARY]->len, UBChar);
545 if(secondRep[j]->firstCC > u_getCombiningClass(UBChar)) {
546 continue;
547 }
548 }
549 upper = *firstRep[i];
550 upper.append(*UB[UCOL_PRIMARY]);
551 //upper.append(*UB[secondRep[j]->strengthFromEmpty]);
552 upper.append(*UB[UCOL_PRIMARY]);
553 lower.append(*UB[UCOL_PRIMARY]);
554 trial = *firstRep[i];
555 trial.append(*UB[UCOL_PRIMARY]);
556 trial.append(*secondRep[j]);
557 } else if((firstRep[i]->lastCC > secondRep[j]->firstCC && secondRep[j]->firstCC && !frenchSecondary)
558 || (firstRep[i]->firstCC < secondRep[j]->lastCC && firstRep[i]->firstCC && frenchSecondary)) {
559 // Skip because normalization will reorder
560 // there will be a chance to check this again, since if we
561 // try a+b, we will also try b+a
562 continue;
563 } else if(frenchSecondary && (firstRep[i]->strengthFromEmpty > UCOL_PRIMARY && secondRep[j]->strengthFromEmpty > UCOL_PRIMARY)) {
564 continue;
565 }else if(firstRep[i]->lastCC && secondRep[j]->firstCC && frenchSecondary) {
566 trial.setToConcat(secondRep[j], firstRep[i]);
567 } else {
568 trial.setToConcat(firstRep[i], secondRep[j]);
569 }
570 // Now let's check the trial. The problem is that when you combine characters,
571 // you can end up with concatenation that is unknown for the examined API.
572 NFCTrialLen = unorm_normalize(trial.name, trial.len, UNORM_NFC, 0, NFCTrial, 256, &status);
573 if((u_strcmp(trial.name, NFCTrial) == 0) || u_strFindLast(NFCTrial, NFCTrialLen, secondRep[j]->name, secondRep[j]->len)) {
574 if(secondRep[j]->strengthFromEmpty > UCOL_TERTIARY) {
575 continue;
576 }
577 }
578 UChar32 c;
579 U16_GET(NFCTrial, 0, 0, NFCTrialLen, c);
580 helper.setTo(c);
581 if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
582 continue;
583 }
584 if(NFCTrialLen > 1) {
585 U16_GET(NFCTrial, 0, NFCTrialLen-1, NFCTrialLen, c);
586 helper.setTo(c);
587 if(probe.distanceFromEmptyString(helper) > UCOL_TERTIARY) {
588 continue;
589 }
590 }
591
592 if (probe.compare(lower, trial) >= 0) { // if lower is bigger than trial
593 // this might be ok, but I'm having doubts. Here is an additional check:
594 if(firstRep[i]->len == 1 || secondRep[j]->strengthFromEmpty == UCOL_PRIMARY) {
595 // I'm basically saying that I'll add this kind of contraction for cases where I combine
596 // one letter with an accent OR when I'm combining more than one symbol with a letter.
597 noteContraction("L", lesserToAddTo, lesserToAddToSize, firstRep[i], secondRep[j], noConts, status);
598 }
599 }
600 else if (probe.compare(trial, upper) > 0) { // trial is bigger than upper??
601 noteContraction("U", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
602 }
603 #if 0
604 else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY)
605 {
606 Line expansionLine;
607 if(getExpansionLine(trial, *firstRep[i], *secondRep[j], expansionLine) &&
608 expansionLine.len && !(expansionLine == *secondRep[j])) {
609 noteContraction("D", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
610 }
611 }
612 #endif
613 else if(firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && probe.getStrength(lower, trial) < secondRep[j]->strengthFromEmpty) {
614 noteContraction("D1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
615 }
616 else if (firstRep[i]->strengthFromEmpty == UCOL_PRIMARY && secondRep[j]->strengthFromEmpty == UCOL_PRIMARY)
617 {
618 // I have added an additional check. The checks versus upper and lower bound should be sufficient
619 // when the right side is a combining mark. There might be a reordering of combining marks, but
620 // that should be already visible in their order.
621 // compare the sequence
622 // Y- <? Y <? Y+
623 // and
624 // XY- <? XY <? XY+
625 Line xym, xyp, xy;
626 UBool xymIsContraction = false, toAddIsContraction = false;
627 if(j) {
628 if(((!secondRep[j-1]->firstCC || firstRep[i]->lastCC < secondRep[j-1]->firstCC) && !frenchSecondary)
629 ||((!firstRep[i]->firstCC || firstRep[i]->firstCC > secondRep[j-1]->lastCC) && frenchSecondary)) {
630 xym.setToConcat(firstRep[i], secondRep[j-1]);
631 toAdd.strength = probe.getStrength(xym, toAdd);
632 if(secondRep[j]->strength != toAdd.strength) {
633 // there is possibility that either xym or xy are contractions
634 // There are two situations:
635 // xym > xy or xym <n xy and ym <k y but n != k
636 // if they are reordered, we are going to see if each of them
637 // is further reordered
638 if(toAdd.strength == UCOL_OFF) {
639 // check whether toAdd shifted more down
640 k = j - 2;
641 while(k>=0 && secondRep[k]->strength > secondRep[j]->strength) {
642 k--;
643 }
644 while(!toAddIsContraction && k>=0) {
645 xyp.setToConcat(firstRep[i], secondRep[k]);
646 if(contractionsTable->get(UnicodeString(xyp.name, xyp.len)) != NULL) {
647 k--;
648 continue;
649 }
650 if(probe.compare(xyp, xym) >= 0) {
651 // xyp looks like a contraction
652 noteContraction("!1", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
653 toAddIsContraction = true;
654 } else {
655 break;
656 }
657 }
658 // first let's see if xym has moved beyond
659 if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
660 k = j+1;
661 // ignore weaker strengths
662 while(k < secondSize && secondRep[k]->strength > secondRep[j]->strength) {
663 k++;
664 }
665 // check if we skipped the following guy
666 if(k < secondSize) {
667 xyp.setToConcat(firstRep[i], secondRep[k]);
668 if(probe.compare(xyp, xym) <= 0) {
669 // xyp looks like a contraction
670 noteContraction("!2", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
671 xymIsContraction = true;
672 }
673 }
674 } else {
675 xymIsContraction = true;
676 }
677 // if they have reordered, but none has moved, then we add them both
678 // and hope for the best
679 if(!xymIsContraction && !toAddIsContraction) {
680 // it is possible that there is an NFC version version of one of the
681 // strings. If we have XY > XZ, but NFC(XZ) = W and X < W, we might have
682 // have a false contraction.
683 trial.len = unorm_normalize(toAdd.name, toAdd.len, UNORM_NFC, 0, trial.name, 25, &status);
684 //UColAttributeValue strength = probe.getStrength(*firstRep[i], trial);
685 if(trial == toAdd) {
686 noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j-1], noConts, status);
687 noteContraction("!3", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
688 } else {
689 noteContraction("!4", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
690 }
691 }
692 } else { // only the strength has changed
693 // check whether the previous is contraction and if not, add the current
694 if(contractionsTable->get(UnicodeString(xym.name, xym.len)) == NULL) {
695 noteContraction("!5", toAddTo, toAddToSize, firstRep[i], secondRep[j], noConts, status);
696 }
697 }
698 }
699 }
700 }
701 }
702 if(thai) { // restore lower
703 lower = *firstRep[i];
704 }
705 }
706 }
707 return noConts;
708 }
709
710 void
noteContraction(const char * msg,Line * toAddTo,int32_t & toAddToSize,Line * left,Line * right,int32_t & noConts,UErrorCode & status)711 SortedLines::noteContraction(const char* msg, Line *toAddTo, int32_t &toAddToSize, Line *left, Line *right, int32_t &noConts, UErrorCode &status)
712 {
713 Line toAdd;
714 toAdd.setToConcat(left, right);
715 toAdd.left = left;
716 toAdd.right = right;
717 // if we're adding an accent to an existing contraction, we want to check
718 #if 0
719 Line test, trial1, trial2;
720 if(right->strengthFromEmpty > UCOL_PRIMARY) {
721 if(left->right && left->right->previous && left->right->next) {
722 test.setToConcat(left->left, left->right->previous);
723 trial1.setToConcat(&test, right);
724
725 test.setToConcat(left->left, left->right->next);
726 trial2.setToConcat(&test, right);
727 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
728 // this means that the contraction has been broken by the newly added accent
729 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
730 debug->log("Con -");
731 debug->log(msg);
732 debug->log(toAdd.toString(false), true);
733 return;
734 }
735 } else {
736 if(right->previous && right->next) {
737 trial1.setToConcat(left, right->previous);
738 trial2.setToConcat(left, right->next);
739 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
740 // this means that the contraction has been broken by the newly added accent
741 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
742 debug->log("Con -");
743 debug->log(msg);
744 debug->log(toAdd.toString(false), true);
745 return;
746 }
747 }
748 if(left->previous && left->next) {
749 trial1.setToConcat(left->previous, right);
750 trial2.setToConcat(left->next, right);
751 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
752 // this means that the contraction has been broken by the newly added accent
753 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
754 debug->log("Con -");
755 debug->log(msg);
756 debug->log(toAdd.toString(false), true);
757 return;
758 }
759 }
760
761 }
762 }
763 if(right->right && right->right->strengthFromEmpty > UCOL_PRIMARY && right->left->previous && right->left->next) { // maybe we already had a contraction with an accent
764 test.setToConcat(right->left->previous, right->right);
765 trial1.setToConcat(left, &test);
766 test.setToConcat(right->left->next, right->right);
767 trial2.setToConcat(left, &test);
768 if(probe.compare(trial1, toAdd) < 0 && probe.compare(toAdd, trial2) < 0) {
769 // this means that the contraction has been broken by the newly added accent
770 // so while 'ch' is contraction, 'ch'+dot_above sorts between 'cg'+dot_above and 'ci'+dot_above
771 debug->log("Con -");
772 debug->log(msg);
773 debug->log(toAdd.toString(false), true);
774 return;
775 }
776 }
777 #endif
778 if(contractionsTable->get(UnicodeString(toAdd.name, toAdd.len)) == NULL) {
779 if(probe.distanceFromEmptyString(toAdd) <= UCOL_TERTIARY) {
780 toAddTo[toAddToSize++] = toAdd;
781 contractionsTable->put(UnicodeString(toAdd.name, toAdd.len), &toAdd, status);
782 noConts++;
783 debug->log(msg);
784 debug->log(" Con + ");
785 debug->log(toAdd.toString(false), true);
786
787 if(!left->sortKey) {
788 calculateSortKey(*left);
789 }
790 debug->log(left->dumpSortkey());
791 debug->log(" + ");
792
793 if(!right->sortKey) {
794 calculateSortKey(*right);
795 }
796 debug->log(right->dumpSortkey());
797 debug->log(" = ");
798
799 calculateSortKey(toAdd);
800 debug->log(toAdd.dumpSortkey(), true);
801 if(noConts > size/2) {
802 status = U_BUFFER_OVERFLOW_ERROR;
803 }
804 }
805 }
806 }
807
808
809 UBool
getExpansionLine(const Line & expansion,const Line & previous,const Line & exp,Line & expansionLine)810 SortedLines::getExpansionLine(const Line &expansion, const Line &previous, const Line &exp, Line &expansionLine)
811 {
812 int expIndexSize = 0;
813 UColAttributeValue expStrength = UCOL_OFF;
814 int32_t comparisonResult = 0;
815 int32_t i = 0, k = 0, prevK = 0;
816 Line trial;
817 UBool sequenceCompleted = false;
818 int32_t expIndexes[256];
819 int32_t expIndexesSize = 0;
820
821 if(!sequenceCompleted) {
822 expIndexSize = 0;
823 expansionLine.clear();
824
825 // we will start from strength between the expansion
826 // and the target (toSort[i] and toSort[j]. First we
827 // will add as many primaries as possible. Then we will
828 // try to add secondary pieces and then tertiary.
829 // found an expansion - what is the expanding sequence?
830
831 expStrength = UCOL_PRIMARY;
832 while(!sequenceCompleted) {
833 k = 0;
834 prevK = 0;
835 while(k < size) {
836 if(expansionLine.len > 15) {
837 sequenceCompleted = true;
838 break;
839 }
840 while(k < size && toSort[k]->strength != UCOL_PRIMARY)
841 {
842 k++;
843 }
844 // nothing found
845 if(k == size) {
846 break;
847 }
848 // we need to skip over reordering things. If they were worthy, they would
849 // have been detected in the previous iteration.
850 //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
851 //k++;
852 //continue;
853 //}
854 trial = previous;
855 trial.append(expansionLine);
856 trial.append(*toSort[k]);
857 if(toSort[k]->name[0] == 0x0067) {
858 int32_t putBreakPointHere = 0;
859 }
860 comparisonResult = probe.compare(trial, expansion);
861 if(comparisonResult == 0) {
862 expansionLine = *toSort[k];
863 return true;
864 } else if (comparisonResult > 0) {
865 if(prevK) {
866 if(exp == *toSort[prevK]) {
867 expansionLine = exp;
868 return true;
869 }
870 i = prevK;
871 while(i < k-1) {
872 i++;
873 if(toSort[i]->strength > exp.strength) {
874 continue;
875 }
876 trial = previous;
877 trial.append(expansionLine);
878 trial.append(*toSort[i]);
879 if(probe.compare(trial, expansion) > 0) {
880 break;
881 }
882 }
883 // we got into situation where we have ch > ch+dot-below
884 // however, ch is a contraction and therefore we cannot use
885 // it properly. If we have hit on a contraction, we'll just try
886 // to continue. Probably need more logic here.
887 if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) {
888 expansionLine.append(*toSort[i-1]);
889 expIndexes[expIndexSize++] = i-1;
890 break;
891 } else {
892 int32_t putBreakPointHere = 0;
893 }
894 } else {
895 sequenceCompleted = true;
896 break;
897 }
898 //break;
899 }
900 prevK = k;
901 k++;
902 }
903 if(!prevK || k == size) {
904 break;
905 }
906 }
907 }
908 return expIndexSize > 0;
909 }
910
911 int32_t
gooseUp(int32_t resetIndex,int32_t expansionIndex,Line & expLine,int32_t * expIndexes,int32_t & expIndexSize,UColAttributeValue strength)912 SortedLines::gooseUp(int32_t resetIndex, int32_t expansionIndex, Line &expLine, int32_t *expIndexes, int32_t &expIndexSize, UColAttributeValue strength)
913 {
914 int32_t i = expansionIndex, k = resetIndex+1, n = 0, m = 0, start = 0;
915 UBool haveChanges = false;
916 Line trial, prefix, suffix;
917 // we will first try goosing up the reset index
918 //while(toSort[k]->strength >= strength)
919 for( ; toSort[k]->strength == strength; k++)
920 {
921 //if(toSort[k]->strength > strength) {
922 //continue;
923 //}
924 trial.setToConcat(toSort[k], &expLine);
925 if(probe.compare(trial, *toSort[i]) > 0) {
926 break;
927 }
928 }
929 resetIndex = k-1;
930
931 // goose up individual characters
932 prefix = *toSort[resetIndex];
933 for(n = 0; n < expIndexSize; n++) {
934 suffix.clear();
935 for(m = n+1; m < expIndexSize; m++) {
936 suffix.append(*toSort[expIndexes[m]]);
937 }
938 k = expIndexes[n]+1;
939 //while(toSort[k]->strength >= strength)
940 for( ; toSort[k]->strength == strength; k++)
941 {
942 //if(toSort[k]->strength > strength) {
943 //continue;
944 //}
945 trial.setToConcat(&prefix, toSort[k]);
946 trial.append(suffix);
947 if(probe.compare(trial, *toSort[i]) > 0) {
948 break;
949 }
950 }
951 if(k > expIndexes[n]+1) {
952 haveChanges = true;
953 expIndexes[n] = k-1;
954 }
955 prefix.append(*toSort[expIndexes[n]]);
956 }
957
958 // try inserting ignorables
959 UColAttributeValue lastStr = UCOL_OFF;
960 k = 0;
961 while(toSort[k]->strengthFromEmpty > strength) {
962 k++;
963 }
964 if(toSort[k]->strengthFromEmpty == strength) {
965 start = k;
966 prefix = *toSort[resetIndex];
967 n = 0;
968 while(n <= expIndexSize) {
969 suffix.clear();
970 for(m = n; m < expIndexSize; m++) {
971 suffix.append(*toSort[expIndexes[m]]);
972 }
973 k = start;
974 while(toSort[k]->strengthFromEmpty == strength) {
975 trial.setToConcat(&prefix, toSort[k]);
976 trial.append(suffix);
977 lastStr = probe.getStrength(trial, *toSort[i]);
978 if(lastStr == UCOL_OFF) { // shot over - we won't find anything here
979 break;
980 } else if(lastStr > strength) {
981 for(m = expIndexSize; m > n; m--) {
982 expIndexes[m] = expIndexes[m-1];
983 }
984 expIndexes[n] = k;
985 expIndexSize++;
986 haveChanges = true;
987 break;
988 }
989 #if 0
990 if(probe.compare(trial, *toSort[i]) > 0) {
991 // if the first one skips, that means that
992 // this position doesn't work
993 if(k > start) {
994 // insert an ignorable on position n
995 for(m = expIndexSize; m > n; m--) {
996 expIndexes[m] = expIndexes[m-1];
997 }
998 expIndexes[n] = k-1;
999 expIndexSize++;
1000 haveChanges = true;
1001 if(n == expIndexSize-1) { // added to the end of the string
1002 UColAttributeValue str = probe.getStrength(trial, *toSort[i]);
1003 int32_t putBreakHere = 0;
1004 }
1005 }
1006 break;
1007 } else {
1008 lastStr = probe.getStrength(trial, *toSort[i]);
1009 }
1010 #endif
1011 k++;
1012 }
1013 prefix.append(*toSort[expIndexes[n]]);
1014 n++;
1015 }
1016 }
1017
1018 if(haveChanges) {
1019 expLine.clear();
1020 for(m = 0; m < expIndexSize; m++) {
1021 expLine.append(*toSort[expIndexes[m]]);
1022 }
1023 }
1024 return resetIndex;
1025 }
1026
1027 int32_t
detectExpansions()1028 SortedLines::detectExpansions()
1029 {
1030 logger->log("\n*** Detecting expansions\n\n");
1031 int32_t exCount = 0;
1032 int32_t i = 0, j = 0, k = 0, prevK = 0;
1033 Line *previous, trial, expansionLine;
1034 UBool foundExp = false, sequenceCompleted = false;
1035 UColAttributeValue strength = UCOL_OFF;
1036 UColAttributeValue maxStrength = UCOL_IDENTICAL;
1037 UColAttributeValue expStrength = UCOL_OFF;
1038 int32_t expIndexes[256];
1039 int32_t expIndexSize = 0;
1040 memset(expIndexes, 0, sizeof(expIndexes));
1041
1042 // for each element, we look back to find whether there is such a q for which
1043 // q <n x < qUBn. These are possible expansions. When going backwards we skip
1044 // over already detected expansions.
1045 i = 0;
1046 // it turns out that looking at accents as possible expansions is
1047 // quite a stupid thing to do, especially on non ICU platforms.
1048 // Previously this line skipped over identicals only, but
1049 // now we are going to skip all the way to non-ignorables.
1050 while(toSort[i]->strengthFromEmpty > UCOL_PRIMARY) {
1051 i++;
1052 }
1053 i++;
1054 for( ; i < size; i++) {
1055 if(toSort[i]->name[0]==0x0063 && toSort[i]->name[1] == 0x68) // && toSort[i]->name[1] == 0x308)0043 0043 0219
1056 {
1057 int32_t putBreakpointhere = 0;
1058 }
1059 foundExp = false;
1060 sequenceCompleted = false;
1061 strength = toSort[i]->strength;
1062 if(strength == UCOL_IDENTICAL && toSort[i-1]->isExpansion == true) {
1063 u_strcpy(toSort[i]->expansionString, toSort[i-1]->expansionString);
1064 toSort[i]->expLen = toSort[i-1]->expLen;
1065 toSort[i]->isExpansion = true;
1066 toSort[i]->expIndex = toSort[i-1]->expIndex;
1067 toSort[i]->expStrength = UCOL_IDENTICAL;
1068 //toSort[i]->expStrength = toSort[i-1]->expStrength;
1069 foundExp = true;
1070 sequenceCompleted = true;
1071 }
1072 //logger->log("%i %i\n", i, j);
1073 while(!foundExp && strength <= maxStrength) {
1074 j = i-1;
1075 while(j && (toSort[j]->isExpansion == true || toSort[j]->isRemoved == true)) {
1076 //if(toSort[j]->strength < strength) {
1077 //strength = toSort[j]->strength;
1078 //}
1079 j--;
1080 }
1081
1082 //while(j && toSort[j]->strength > strength)
1083 while(j && toSort[j]->strength > probe.getStrength(*toSort[j], *toSort[i]))
1084 {
1085 j--;
1086 }
1087 //if(toSort[j]->strength == strength) {
1088 previous = toSort[j];
1089 if(previous->strengthFromEmpty >= UCOL_IDENTICAL ||
1090 (previous->strengthFromEmpty == UCOL_SECONDARY
1091 && strength == UCOL_SECONDARY
1092 && previous->lastCC > UB[strength]->firstCC)) {
1093 break;
1094 //continue;
1095 }
1096 //trial.setToConcat(previous, UB[strength]);
1097 trial.setToConcat(previous, UB[probe.getStrength(*toSort[j], *toSort[i])]);
1098 if(probe.compare(trial, *toSort[i]) > 0) {
1099 foundExp = true;
1100 }
1101 //}
1102 if(strength == UCOL_QUATERNARY) {
1103 strength = UCOL_IDENTICAL;
1104 } else {
1105 strength = (UColAttributeValue)(strength + 1);
1106 }
1107 }
1108 // calculate the expanding sequence
1109 if(foundExp && !sequenceCompleted) {
1110 expIndexSize = 0;
1111 expansionLine.clear();
1112 exCount++;
1113 // we will start from strength between the expansion
1114 // and the target (toSort[i] and toSort[j]. First we
1115 // will add as many primaries as possible. Then we will
1116 // try to add secondary pieces and then tertiary.
1117 // found an expansion - what is the expanding sequence?
1118
1119 expStrength = UCOL_PRIMARY;
1120 while(!sequenceCompleted) {
1121 k = 0;
1122 prevK = 0;
1123 while(k < size) {
1124 if(expansionLine.len > 15) {
1125 sequenceCompleted = true;
1126 break;
1127 }
1128 while(k < size && toSort[k]->strength != UCOL_PRIMARY) {
1129 k++;
1130 }
1131 // nothing found
1132 if(k == size) {
1133 break;
1134 }
1135 // we need to skip over reordering things. If they were worthy, they would
1136 // have been detected in the previous iteration.
1137 //if(expansionLine.lastCC && toSort[k]->firstCC && expansionLine.lastCC > toSort[k]->firstCC) {
1138 //k++;
1139 //continue;
1140 //}
1141 trial = *previous;
1142 trial.append(expansionLine);
1143 trial.append(*toSort[k]);
1144 if(toSort[k]->name[0] == 0x0067) {
1145 int32_t putBreakPointHere = 0;
1146 }
1147 if(probe.compare(trial, *toSort[i]) > 0) {
1148 if(prevK) {
1149 // we got into situation where we have ch > ch+dot-below
1150 // however, ch is a contraction and therefore we cannot use
1151 // it properly. If we have hit on a contraction, we'll just try
1152 // to continue. Probably need more logic here.
1153 if(contractionsTable->get(UnicodeString(trial.name, trial.len)) == NULL) {
1154 expansionLine.append(*toSort[prevK]);
1155 expIndexes[expIndexSize++] = prevK;
1156 break;
1157 } else {
1158 int32_t putBreakPointHere = 0;
1159 }
1160 } else {
1161 sequenceCompleted = true;
1162 break;
1163 }
1164 //break;
1165 }
1166 prevK = k;
1167 k++;
1168 }
1169 if(!prevK || k == size) {
1170 break;
1171 }
1172 }
1173 // after this we have primaries lined up.
1174 // we are going to goose up with secondaries and
1175 // tertiaries
1176 trial.setToConcat(toSort[j], &expansionLine);
1177 expStrength = probe.getStrength(trial, *toSort[i]);
1178 if(expStrength > UCOL_PRIMARY) {
1179 if(expStrength == UCOL_SECONDARY || expStrength == UCOL_OFF) {
1180 j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_SECONDARY);
1181 trial.setToConcat(toSort[j], &expansionLine);
1182 expStrength = probe.getStrength(trial, *toSort[i]);
1183 if(expStrength == UCOL_TERTIARY) {
1184 j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
1185 }
1186 } else if(expStrength == UCOL_TERTIARY) {
1187 j = gooseUp(j, i, expansionLine, expIndexes, expIndexSize, UCOL_TERTIARY);
1188 }
1189 }
1190 trial.setToConcat(toSort[j], &expansionLine);
1191 expStrength = probe.getStrength(trial, *toSort[i]);
1192 if(expansionLine.len) {
1193 if(expansionLine.name[0] == 0x73 && expansionLine.name[1] == 0x7a) {
1194 int32_t putBreakpointhere = 0;
1195 }
1196 UBool isExpansionLineAContraction = (contractionsTable->get(UnicodeString(expansionLine.name, expansionLine.len)) != NULL);
1197 // we have an expansion line and an expansion. There could be some expansions where
1198 // the difference between expansion line and the end of expansion sequence is less or
1199 // equal than the expansion strength. These should probably be removed.
1200 int32_t diffLen = toSort[i]->len - expansionLine.len;
1201 if(diffLen > 0) {
1202 trial.setTo(UnicodeString(toSort[i]->name + diffLen, toSort[i]->len - diffLen));
1203 } else {
1204 trial = *toSort[i];
1205 }
1206 UColAttributeValue s1 = probe.getStrength(trial, expansionLine);
1207 if(s1 == UCOL_OFF) {
1208 s1 = probe.getStrength(expansionLine, trial);
1209 }
1210 if((!isExpansionLineAContraction && s1 >= expStrength) || (diffLen <= 0 && s1 == UCOL_IDENTICAL)) {
1211 contractionsTable->remove(UnicodeString(toSort[i]->name, toSort[i]->len));
1212 toSort[i]->isRemoved = true;
1213 if(toSort[i]->next && toSort[i]->previous) {
1214 toSort[i]->previous->next = toSort[i]->next;
1215 }
1216 if(toSort[i]->previous && toSort[i]->next) {
1217 toSort[i]->next->previous = toSort[i]->previous;
1218 }
1219 debug->log("Exp -N: ");
1220 debug->log(toSort[i]->toString(false));
1221 debug->log(" / ");
1222 debug->log(expansionLine.toString(false), true);
1223 }
1224 else
1225 {
1226 u_strncat(toSort[i]->expansionString, expansionLine.name, expansionLine.len);
1227 toSort[i]->isExpansion = true;
1228 toSort[i]->expStrength = expStrength;
1229 toSort[i]->expLen = expansionLine.len;
1230 toSort[i]->expansionString[toSort[i]->expLen] = 0;
1231 toSort[i]->expIndex = j;
1232 }
1233 }
1234 }
1235 if(toSort[i]->isExpansion == true) {
1236 if(debug->isOn()) {
1237 debug->log("Exp + : &");
1238 debug->log(toSort[j]->toString(false));
1239 debug->log(toSort[i]->strengthToString(toSort[i]->expStrength, true));
1240 debug->log(toSort[i]->toString(false));
1241 debug->log(" ");
1242 if(!toSort[j]->sortKey) {
1243 calculateSortKey(*toSort[j]);
1244 }
1245 debug->log(toSort[j]->dumpSortkey());
1246 debug->log(" ... ");
1247 if(!toSort[i]->sortKey) {
1248 calculateSortKey(*toSort[i]);
1249 }
1250 debug->log(toSort[i]->dumpSortkey());
1251 calculateSortKey(expansionLine);
1252 debug->log("/");
1253 debug->log(expansionLine.dumpSortkey(), true);
1254 }
1255
1256 }
1257 }
1258 // after detecting expansions, we want to position them.
1259 // it is better to position expansions after all have been detected,
1260 // since otherwise we will change the ordering.
1261 for(i = size-1; i >= 0; i--) {
1262 if(toSort[i]->isExpansion) {
1263 if(toSort[i]->name[0] == 0x2A3) {
1264 int32_t putBreakPointHere = 0;
1265 }
1266 if(i) {
1267 if(toSort[i]->previous) {
1268 toSort[i]->previous->next = toSort[i]->next;
1269 }
1270 }
1271 if(i < size-1) {
1272 if(toSort[i]->next) {
1273 toSort[i]->next->previous = toSort[i]->previous;
1274 }
1275 }
1276 j = toSort[i]->expIndex;
1277 toSort[i]->next = toSort[j]->next;
1278 toSort[i]->previous = toSort[j];
1279 toSort[j]->next = toSort[i];
1280 if(toSort[i]->next) {
1281 toSort[i]->next->previous = toSort[i];
1282 }
1283 toSort[i]->strength = toSort[i]->expStrength;
1284 }
1285 }
1286 return exCount;
1287 }
1288
1289
1290 Line *
getFirst()1291 SortedLines::getFirst() {
1292 current = first;
1293 return current;
1294 }
1295
1296 Line *
getLast()1297 SortedLines::getLast() {
1298 current = last;
1299 return current;
1300 }
1301
1302 void
add(Line * line,UBool linkIn)1303 SortedLines::add(Line *line, UBool linkIn) {
1304 if(size++ == capacity) {
1305 // grow
1306 }
1307 lines[size] = *line;
1308 Line *toAdd = &lines[size];
1309 if(linkIn && first) {
1310 Line *current = first;
1311 while(current != NULL && probe.comparer(¤t, &toAdd) < 0) {
1312 current = current->next;
1313 }
1314 if(current == NULL) {
1315 toAdd->previous = last;
1316 toAdd->next = NULL;
1317 if(last != NULL) {
1318 last->next = toAdd;
1319 }
1320 last = toAdd;
1321 if(first == NULL) {
1322 first = toAdd;
1323 }
1324 } else { // current != NULL
1325 toAdd->next = current;
1326 toAdd->previous = current->previous;
1327 if(current->previous) {
1328 current->previous->next = toAdd;
1329 } else {
1330 first = toAdd;
1331 }
1332 current->previous = toAdd;
1333 }
1334 }
1335 }
1336
1337
1338 Line *
getNext()1339 SortedLines::getNext()
1340 {
1341 if(current != NULL) {
1342 current=current->next;
1343 }
1344 return current;
1345 }
1346
1347 Line *
getPrevious()1348 SortedLines::getPrevious()
1349 {
1350 if(current != NULL) {
1351 current=current->previous;
1352 }
1353 return current;
1354 }
1355
1356 Line *
operator [](int32_t index)1357 SortedLines::operator[](int32_t index)
1358 {
1359 int32_t i = 0;
1360 Line *c = first;
1361 for(i = 0; i < index; i++) {
1362 if(c != NULL) {
1363 c = c->next;
1364 }
1365 }
1366 return c;
1367 }
1368
1369 UnicodeString
arrayToString(Line ** sortedLines,int32_t linesSize,UBool pretty,UBool useLinks,UBool printSortKeys)1370 SortedLines::arrayToString(Line** sortedLines, int32_t linesSize, UBool pretty, UBool useLinks, UBool printSortKeys) {
1371 UnicodeString result;
1372 int32_t i = 0;
1373
1374 Line *line = NULL;
1375 Line *previous = sortedLines[0];
1376 if(printSortKeys && !sortkeys) {
1377 printSortKeys = false;
1378 }
1379 if(previous->isReset) {
1380 result.append(" & ");
1381 result.append(previous->name, previous->len);
1382 if(pretty) {
1383 result.append(" # ");
1384 result.append(previous->stringToName(previous->name, previous->len));
1385 result.append("\n");
1386 }
1387 } else if(!previous->isRemoved) {
1388 result.append(previous->toString(pretty));
1389 if(pretty) {
1390 result.append("\n");
1391 }
1392 }
1393 i = 1;
1394 while((i < linesSize && !useLinks) || (previous->next && useLinks)) {
1395 if(useLinks) {
1396 line = previous->next;
1397 } else {
1398 line = sortedLines[i];
1399 }
1400 if(line->isReset) {
1401 result.append(" &");
1402 result.append(line->name, line->len);
1403 if(pretty) {
1404 result.append(" # ");
1405 result.append(line->stringToName(line->name, line->len));
1406 result.append("\n");
1407 }
1408 } else if(!line->isRemoved) {
1409 if(i > 0) {
1410 result.append(line->strengthToString(line->strength, pretty));
1411 }
1412 result.append(line->toString(pretty));
1413 if(printSortKeys) {
1414 result.append(line->dumpSortkey());
1415 }
1416 if(pretty) {
1417 result.append("\n");
1418 }
1419 }
1420 previous = line;
1421 i++;
1422 }
1423 return result;
1424 }
1425
SortedLines(FILE * file,UPrinter * logger,UPrinter * debug,UErrorCode & status)1426 SortedLines::SortedLines(FILE *file, UPrinter *logger, UPrinter *debug, UErrorCode &status) :
1427 toSort(NULL),
1428 toSortCapacity(0),
1429 lines(NULL),
1430 size(0),
1431 capacity(0),
1432 first(NULL),
1433 last(NULL),
1434 logger(logger),
1435 debug(debug),
1436 contractionsTable(NULL),
1437 duplicators(NULL),
1438 maxExpansionPrefixSize(0),
1439 wordSort(false),
1440 frenchSecondary(false),
1441 upperFirst(false),
1442 sortkeys(NULL),
1443 sortkeyOffset(0)
1444 {
1445 debug->log("*** loading a dump\n");
1446 memset(UB, 0, sizeof(UB));
1447 int32_t i = 0;
1448 for(i = 0; i < UCOL_OFF; i++) {
1449 UB[i] = ∅
1450 }
1451
1452 int32_t newFrench, newUpperFirst;
1453 fscanf(file, "%i,%i,%i\n", &size, &newFrench, &newUpperFirst);
1454 debug->log("Read size %i, frenchSecondary %i and upperFirst %i\n", size, newFrench, newUpperFirst);
1455 frenchSecondary = (UBool)newFrench;
1456 upperFirst = (UBool)newUpperFirst;
1457 capacity = size;
1458 lines = new Line[capacity];
1459 i = 0;
1460
1461 char buff[256];
1462
1463 while(fgets(buff, 256, file)) {
1464 if(i % 20 == 0) {
1465 logger->log("\rLine: %04i", i, buff);
1466 }
1467 lines[i].initFromString(buff, 256, status);
1468 if(i) {
1469 lines[i].previous = &lines[i-1];
1470 lines[i-1].next = &lines[i];
1471 }
1472 i++;
1473 }
1474 size = i;
1475 toSort = new Line*[size];
1476 setSortingArray(toSort, lines, size);
1477 first = &lines[0];
1478 last = &lines[size-1];
1479 }
1480
1481 void
toFile(FILE * file,UBool useLinks,UErrorCode & status)1482 SortedLines::toFile(FILE *file, UBool useLinks, UErrorCode &status)
1483 {
1484 fprintf(file, "%i,%i,%i\n", size, frenchSecondary, upperFirst);
1485 int32_t i = 1;
1486 Line *previous = toSort[0];
1487 Line *line = NULL;
1488 char buff[256];
1489 previous->write(buff, 256, status);
1490 fprintf(file, "%s\n", buff);
1491 fflush(file);
1492 while(previous->next) {
1493 if(useLinks) {
1494 line = previous->next;
1495 } else {
1496 line = toSort[i];
1497 }
1498 line->write(buff, 256, status);
1499 fprintf(file, "%s\n", buff);
1500 i++;
1501 previous = line;
1502 }
1503 }
1504
1505
1506
1507 UnicodeString
toStringFromEmpty()1508 SortedLines::toStringFromEmpty() {
1509 UBool useLinks = false;
1510 UBool pretty = false;
1511 UnicodeString result;
1512 int32_t i = 0;
1513
1514 Line *line = NULL;
1515 Line *previous = toSort[0];
1516 if(previous->isReset) {
1517 result.append(" & ");
1518 if(pretty) {
1519 result.append("\n");
1520 }
1521 result.append(previous->name, previous->len);
1522 } else if(!previous->isRemoved) {
1523 result.append(previous->toString(pretty));
1524 if(pretty) {
1525 result.append("\n");
1526 }
1527 }
1528 i = 1;
1529 while(i < size || previous->next) {
1530 if(useLinks) {
1531 line = previous->next;
1532 } else {
1533 line = toSort[i];
1534 }
1535 if(line->isReset) {
1536 result.append(" &");
1537 result.append(line->name, line->len);
1538 if(pretty) {
1539 result.append(" # ");
1540 result.append(line->stringToName(line->name, line->len));
1541 result.append("\n");
1542 }
1543 } else if(!line->isRemoved) {
1544 if(i > 0) {
1545 result.append(line->strengthToString(line->strengthFromEmpty, pretty));
1546 }
1547 result.append(line->toString(pretty));
1548 if(pretty) {
1549 result.append("\n");
1550 }
1551 }
1552 previous = line;
1553 i++;
1554 }
1555 return result;
1556 }
1557
1558 UnicodeString
toString(UBool useLinks)1559 SortedLines::toString(UBool useLinks)
1560 {
1561 return arrayToString(toSort, size, false, useLinks, false);
1562 }
1563
1564
1565 UnicodeString
toPrettyString(UBool useLinks,UBool printSortKeys)1566 SortedLines::toPrettyString(UBool useLinks, UBool printSortKeys)
1567 {
1568 return arrayToString(toSort, size, true, useLinks, printSortKeys);
1569 }
1570
1571 UnicodeString
toOutput(const char * format,const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1572 SortedLines::toOutput(const char *format,
1573 const char *locale, const char *platform, const char *reference,
1574 UBool useLinks, UBool initialize, UBool moreToCome) {
1575 if(strcmp(format, "HTML") == 0) {
1576 return toHTML(locale, platform, reference, useLinks, initialize, moreToCome);
1577 } else if(strcmp(format, "XML") == 0) {
1578 return toXML(locale, platform, reference, useLinks, initialize, moreToCome);
1579 } else {
1580 return toBundle(locale, platform, reference, useLinks, initialize, moreToCome);
1581 }
1582 }
1583
1584
1585 UnicodeString
toHTML(const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1586 SortedLines::toHTML(const char *locale,
1587 const char *platform, const char *reference,
1588 UBool useLinks, UBool initialize, UBool moreToCome)
1589 {
1590 UnicodeString result;
1591 int32_t i = 0;
1592 if(initialize) {
1593 result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n");
1594 result.append("# Collation data resource bundle generated for locale: ");
1595 result.append(locale);
1596 result.append("<br>\n# For platform ");
1597 result.append(platform);
1598 result.append(" reference platform ");
1599 result.append(reference);
1600 result.append("<br><br>\n\n\n");
1601
1602 result.append(locale);
1603 if(platform) {
1604 result.append("_");
1605 result.append(platform);
1606 }
1607 if(reference) {
1608 result.append("_vs_");
1609 result.append(reference);
1610 }
1611 result.append(" {<br>\n");
1612
1613 result.append(" collations {<br>\n standard {<br>\n Sequence {<br>\n");
1614 }
1615
1616 if(frenchSecondary) {
1617 result.append("[backwards 2]<br>\n");
1618 }
1619 if(upperFirst) {
1620 result.append("[casefirst upper]<br>\n");
1621 }
1622
1623 Line *line = toSort[0];
1624
1625 i = 0;
1626 while((i < size && !useLinks) || (line->next && useLinks)) {
1627 if(line->isReset || !line->isRemoved) {
1628 result.append(line->toHTMLString());
1629 }
1630 i++;
1631 if(useLinks) {
1632 line = line->next;
1633 } else {
1634 line = toSort[i];
1635 }
1636 }
1637 if(!moreToCome) {
1638 result.append(" }<br>\n }<br>\n }<br>\n}<br>\n");
1639
1640 result.append("</html>\n");
1641 }
1642
1643 return result;
1644 }
1645
1646 UnicodeString
toXML(const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1647 SortedLines::toXML(const char *locale,
1648 const char *platform, const char *reference,
1649 UBool useLinks, UBool initialize, UBool moreToCome)
1650 {
1651 UnicodeString result;
1652 int32_t i = 0;
1653 if(initialize) {
1654 result.append("<html>\n<head>\n<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">\n</head>\n");
1655 result.append("# Collation data resource bundle generated for locale: ");
1656 result.append(locale);
1657 result.append("<br>\n# For platform ");
1658 result.append(platform);
1659 result.append(" reference platform ");
1660 result.append(reference);
1661 result.append("<br><br>\n\n\n");
1662
1663 result.append(locale);
1664 if(platform) {
1665 result.append("_");
1666 result.append(platform);
1667 }
1668 if(reference) {
1669 result.append("_vs_");
1670 result.append(reference);
1671 }
1672 result.append(" {<br>\n");
1673
1674 result.append(" collations {<br>\n standard {<br>\n Sequence {<br>\n");
1675 }
1676
1677 if(frenchSecondary) {
1678 result.append("[backwards 2]<br>\n");
1679 }
1680 if(upperFirst) {
1681 result.append("[casefirst upper]<br>\n");
1682 }
1683
1684 Line *line = toSort[0];
1685
1686 i = 0;
1687 while((i < size && !useLinks) || (line->next && useLinks)) {
1688 if(line->isReset || !line->isRemoved) {
1689 result.append(line->toHTMLString());
1690 }
1691 i++;
1692 if(useLinks) {
1693 line = line->next;
1694 } else {
1695 line = toSort[i];
1696 }
1697 }
1698 if(!moreToCome) {
1699 result.append(" }<br>\n }<br>\n }<br>\n}<br>\n");
1700
1701 result.append("</html>\n");
1702 }
1703
1704 return result;
1705 }
1706
1707 UnicodeString
toBundle(const char * locale,const char * platform,const char * reference,UBool useLinks,UBool initialize,UBool moreToCome)1708 SortedLines::toBundle(const char *locale,
1709 const char *platform, const char *reference,
1710 UBool useLinks, UBool initialize, UBool moreToCome)
1711 {
1712 UnicodeString result;
1713 int32_t i = 0;
1714
1715 if(initialize) {
1716 result.append("// Collation data resource bundle generated for locale: ");
1717 result.append(locale);
1718 result.append("\n// For platform ");
1719 result.append(platform);
1720 result.append(" reference platform ");
1721 result.append(reference);
1722 result.append("\n\n\n");
1723
1724 result.append(locale);
1725 /*
1726 if(platform) {
1727 result.append("_");
1728 result.append(platform);
1729 }
1730 if(reference) {
1731 result.append("_vs_");
1732 result.append(reference);
1733 }
1734 */
1735 result.append(" {\n");
1736
1737 result.append(" collations {\n standard {\n Sequence {\n");
1738 }
1739
1740 if(frenchSecondary) {
1741 result.append("[backwards 2]\n");
1742 }
1743 if(upperFirst) {
1744 result.append("[casefirst upper]\n");
1745 }
1746
1747 Line *line = toSort[0];
1748
1749 i = 0;
1750 while((i < size && !useLinks) || (line->next && useLinks)) {
1751 if(line->isReset || !line->isRemoved) {
1752 result.append(line->toBundleString());
1753 }
1754 i++;
1755 if(useLinks) {
1756 line = line->next;
1757 } else {
1758 line = toSort[i];
1759 }
1760 }
1761
1762 if(!moreToCome) {
1763 result.append(" }\n }\n }\n}\n");
1764 }
1765
1766 return result;
1767 }
1768
1769
1770 int32_t
getSize() const1771 SortedLines::getSize() const {
1772 return repertoire.size();
1773 }
1774
1775 void
reduceDifference(SortedLines & reference)1776 SortedLines::reduceDifference(SortedLines& reference) {
1777 UErrorCode status = U_ZERO_ERROR;
1778 if(upperFirst) {
1779 swapCase();
1780 }
1781 // both sorted lines structures need to have established links and strengths
1782 // We walk down both structures and note differences. These
1783 // differences will modify this by removng elements, setting resets
1784 // etc...
1785 // we will prefer insertions from tailoring to reference, then deletions
1786 // there are two tables that keep seen elements.
1787 Hashtable *seenThis = new Hashtable();
1788 Hashtable *seenReference = new Hashtable();
1789
1790
1791 UBool found = false;
1792 UBool finished = false;
1793 const int32_t lookForward = 20;
1794 int32_t tailoringMove = 0;
1795 //int32_t referenceSize = reference.getSize();
1796 Line *refLine = reference.getFirst();
1797 Line *refLatestEqual = refLine;
1798 refLine = refLine->next;
1799 Line *myLine = getFirst();
1800 Line *myLatestEqual = myLine;
1801 myLatestEqual->isRemoved = true;
1802 myLine = myLine->next;
1803 while(myLine && refLine) {
1804 found = false;
1805 while(myLine && refLine && myLine->equals(*refLine)) {
1806 myLatestEqual = myLine;
1807 myLatestEqual->isRemoved = true;
1808 myLine = myLine->next;
1809 refLatestEqual = refLine;
1810 refLine = refLine->next;
1811 if(refLine == NULL && myLine == NULL) {
1812 finished = true;
1813 }
1814 }
1815 if(myLine) {
1816 myLine->cumulativeStrength = myLine->strength;
1817 }
1818 if(refLine) {
1819 refLine->cumulativeStrength = refLine->strength;
1820 }
1821
1822 // here is the difference
1823 while(!found && !finished) {
1824 tailoringMove = 0;
1825 if(myLine && refLine) {
1826 if(myLine->cumulativeStrength > refLine->cumulativeStrength) {
1827 // tailoring z <<< x, UCA z < y
1828 while(myLine->cumulativeStrength > refLine->cumulativeStrength) {
1829 myLine = myLine->next;
1830 if(myLine) {
1831 transferCumulativeStrength(myLine->previous, myLine);
1832 } else {
1833 break;
1834 }
1835 }
1836 } else if(myLine->cumulativeStrength < refLine->cumulativeStrength) {
1837 // tailoring z < x, UCA z <<< y
1838 while(myLine->cumulativeStrength < refLine->cumulativeStrength) {
1839 seenReference->put(UnicodeString(refLine->name, refLine->len), refLine, status);
1840 refLine = refLine->next;
1841 if(refLine) {
1842 transferCumulativeStrength(refLine->previous, refLine);
1843 } else {
1844 break;
1845 }
1846 }
1847 }
1848 // this is the interesting point. Now we search for character match
1849 while(myLine && refLine && (!myLine->equals(*refLine) || myLine->strength == UCOL_IDENTICAL)
1850 && tailoringMove < lookForward) {
1851 if(seenThis->get(UnicodeString(refLine->name, refLine->len))) {
1852 // we are not interested in stuff from the reference that is already accounted
1853 // for in the tailoring.
1854 refLine = refLine->next;
1855 if(refLine) {
1856 transferCumulativeStrength(refLine->previous, refLine);
1857 }
1858 } else {
1859 myLine = myLine->next;
1860 if(myLine) {
1861 transferCumulativeStrength(myLine->previous, myLine);
1862 if(!seenReference->get(UnicodeString(myLine->name, myLine->len))) {
1863 tailoringMove++;
1864 }
1865 }
1866 }
1867 }
1868 }
1869 if(refLine == NULL) { // ran out of reference
1870 // this is the tail of tailoring - the last insertion
1871 myLine = NULL;
1872 found = true;
1873 } else if(tailoringMove == lookForward || myLine == NULL) { // run over treshold or out of tailoring
1874 tailoringMove = 0;
1875 // we didn't find insertion after all
1876 // we will try substitution next
1877 // reset the tailoring pointer
1878 myLine = myLatestEqual->next;
1879 // move the reference
1880 refLine = refLine->next;
1881 if(refLine) {
1882 transferCumulativeStrength(refLine->previous, refLine);
1883 }
1884 } else { // we found an insertion
1885 tailoringMove = 0;
1886 if(myLine->strength != refLine->strength) {
1887 while(myLine && refLine && *myLine == *refLine
1888 && (myLine->strength != refLine->strength
1889 || myLine->strength == UCOL_IDENTICAL)) {
1890 myLine = myLine->next;
1891 refLine = refLine->next;
1892 }
1893 if(*myLine != *refLine) {
1894 continue;
1895 }
1896 }
1897 if(myLine && refLine && myLine->previous->strength < myLine->strength) {
1898 myLine = myLine->next;
1899 refLine = refLine->next;
1900 if(*myLine != *refLine) {
1901 continue;
1902 }
1903 }
1904 found = true;
1905 }
1906 if(found) {
1907 if(myLatestEqual->next != myLine || refLine == NULL) {
1908 Line *myStart = NULL;
1909 // this is a reset and a sequence
1910 // myLatestEqual points at the last point that was the same
1911 // This point will be a reset
1912 if(myLine && refLine) { // if there is anything more to do - it might be worth saving it
1913 myStart = myLatestEqual;
1914 while(myStart != myLine) {
1915 seenThis->put(UnicodeString(myStart->name, myStart->len), myStart, status);
1916 myStart = myStart->next;
1917 }
1918 }
1919 // Try to weed out stuff that is not affected, like:
1920 // Tailoring:
1921 // <<<S<<\u017F<\u0161<<<\u0160<t
1922 // UCA:
1923 // <<<S<<\u0161<<<\u0160<<\u017F<t
1924 // Result:
1925 // &S<<\u017F<\u0161<<<\u0160
1926 // we have a sequence that spans from myLatestEqual to myLine (that one could be NULL,
1927 // so we have to go down from myLatestEqual.
1928 // Basically, for every element, we want to see the strongest cumulative difference
1929 // from the reset point. If the cumulative difference is the same in both the reference and
1930 // tailoring, that element could be removed.
1931 calculateCumulativeStrengths(myLatestEqual, myLine);
1932 calculateCumulativeStrengths(refLatestEqual, refLine);
1933 myStart = myLatestEqual;
1934 int32_t removed = 0;
1935 int32_t traversed = 0;
1936 while(myStart && myStart != myLine) {
1937 Line *refStart = refLatestEqual;
1938 while(refStart && refStart != refLine) {
1939 if(*myStart == *refStart) {
1940 if(myStart->cumulativeStrength == refStart->cumulativeStrength) {
1941 myStart->isRemoved = true;
1942 removed++;
1943 }
1944 }
1945 refStart = refStart->next;
1946 }
1947 myStart = myStart->next;
1948 traversed++;
1949 }
1950 if(removed < traversed) {
1951 myLatestEqual->isReset = true;
1952 myLatestEqual->isRemoved = false;
1953 }
1954
1955 myLatestEqual = myLine;
1956 }
1957 }
1958 }
1959 }
1960
1961 if(upperFirst) {
1962 //swapCase();
1963 }
1964
1965 delete seenThis;
1966 delete seenReference;
1967
1968 }
1969
1970 void
transferCumulativeStrength(Line * previous,Line * that)1971 SortedLines::transferCumulativeStrength(Line *previous, Line *that) {
1972 if(that->strength > previous->cumulativeStrength) {
1973 that->cumulativeStrength = previous->cumulativeStrength;
1974 } else {
1975 that->cumulativeStrength = that->strength;
1976 }
1977 }
1978
1979 void
calculateCumulativeStrengths(Line * start,Line * end)1980 SortedLines::calculateCumulativeStrengths(Line *start, Line *end) {
1981 // start is a reset - end may be NULL
1982 start = start->next;
1983 UColAttributeValue cumulativeStrength = UCOL_OFF;
1984 while(start && start != end) {
1985 if(start->strength < cumulativeStrength) {
1986 cumulativeStrength = start->strength;
1987 }
1988 start->cumulativeStrength = cumulativeStrength;
1989 start = start->next;
1990 }
1991 }
1992
1993
1994 void
getRepertoire(UnicodeSet & fillIn)1995 SortedLines::getRepertoire(UnicodeSet &fillIn) {
1996 fillIn.clear();
1997 fillIn.addAll(repertoire);
1998 }
1999
2000
2001 void
removeDecompositionsFromRepertoire()2002 SortedLines::removeDecompositionsFromRepertoire() {
2003 UnicodeSetIterator repertoireIter(repertoire);
2004 UErrorCode status = U_ZERO_ERROR;
2005 UChar string[256];
2006 UChar composed[256];
2007 int32_t len = 0, compLen = 0;
2008 UnicodeString compString;
2009 UnicodeSet toRemove;
2010
2011 while(repertoireIter.next()) {
2012 len = 0;
2013 if(repertoireIter.isString()) { // process a string
2014 len = repertoireIter.getString().length();
2015 u_memcpy(string, repertoireIter.getString().getBuffer(), len);
2016 } else { // process code point
2017 UBool isError = false;
2018 U16_APPEND(string, len, 25, repertoireIter.getCodepoint(), isError);
2019 }
2020 string[len] = 0; // zero terminate, for our evil ways
2021 compLen = unorm_normalize(string, len, UNORM_NFC, 0, composed, 256, &status);
2022 if(compLen != len || u_strcmp(string, composed) != 0) {
2023 compString.setTo(composed, compLen);
2024 if(repertoire.contains(compString)) {
2025 toRemove.add(UnicodeString(string, len));
2026 }
2027 }
2028 }
2029 debug->log("\nRemoving\n");
2030 debug->log(toRemove.toPattern(compString, true), true);
2031 repertoire.removeAll(toRemove);
2032 }
2033
2034
2035 void
swapCase()2036 SortedLines::swapCase()
2037 {
2038 int32_t i = 0;
2039 for(i = 0; i < size; i++) {
2040 toSort[i]->swapCase();
2041 }
2042 }
2043
2044 void
calculateSortKey(Line & line)2045 SortedLines::calculateSortKey(Line &line)
2046 {
2047 if(!sortkeys) {
2048 sortkeys = new uint8_t[size*1024];
2049 memset(sortkeys, 0, size*1024);
2050 }
2051 line.sortKey = sortkeys+sortkeyOffset;
2052 sortkeyOffset += probe.getSortKey(line, sortkeys+sortkeyOffset, size*256-sortkeyOffset);
2053 }
2054
2055
2056 void
calculateSortKeys()2057 SortedLines::calculateSortKeys()
2058 {
2059 if(sortkeys) {
2060 delete[] sortkeys;
2061 }
2062 sortkeyOffset = 0;
2063 sortkeys = new uint8_t[size*256];
2064 memset(sortkeys, 0, size*256);
2065 int32_t i = 0;
2066 for(i = 0; i < size; i++) {
2067 calculateSortKey(*toSort[i]);
2068 }
2069 }
2070