• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucol_wgt.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2001mar08
14 *   created by: Markus W. Scherer
15 *
16 *   This file contains code for allocating n collation element weights
17 *   between two exclusive limits.
18 *   It is used only internally by ucol_bld.
19 */
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_COLLATION
24 
25 #include "ucol_imp.h"
26 #include "ucol_wgt.h"
27 #include "cmemory.h"
28 #include "uarrsort.h"
29 
30 #ifdef UCOL_DEBUG
31 #   include <stdio.h>
32 #endif
33 
34 /* collation element weight allocation -------------------------------------- */
35 
36 /* helper functions for CE weights */
37 
38 static U_INLINE int32_t
lengthOfWeight(uint32_t weight)39 lengthOfWeight(uint32_t weight) {
40     if((weight&0xffffff)==0) {
41         return 1;
42     } else if((weight&0xffff)==0) {
43         return 2;
44     } else if((weight&0xff)==0) {
45         return 3;
46     } else {
47         return 4;
48     }
49 }
50 
51 static U_INLINE uint32_t
getWeightTrail(uint32_t weight,int32_t length)52 getWeightTrail(uint32_t weight, int32_t length) {
53     return (uint32_t)(weight>>(8*(4-length)))&0xff;
54 }
55 
56 static U_INLINE uint32_t
setWeightTrail(uint32_t weight,int32_t length,uint32_t trail)57 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
58     length=8*(4-length);
59     return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
60 }
61 
62 static U_INLINE uint32_t
getWeightByte(uint32_t weight,int32_t idx)63 getWeightByte(uint32_t weight, int32_t idx) {
64     return getWeightTrail(weight, idx); /* same calculation */
65 }
66 
67 static U_INLINE uint32_t
setWeightByte(uint32_t weight,int32_t idx,uint32_t byte)68 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
69     uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
70 
71     idx*=8;
72     mask=((uint32_t)0xffffffff)>>idx;
73     idx=32-idx;
74     mask|=0xffffff00<<idx;
75     return (uint32_t)((weight&mask)|(byte<<idx));
76 }
77 
78 static U_INLINE uint32_t
truncateWeight(uint32_t weight,int32_t length)79 truncateWeight(uint32_t weight, int32_t length) {
80     return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
81 }
82 
83 static U_INLINE uint32_t
incWeightTrail(uint32_t weight,int32_t length)84 incWeightTrail(uint32_t weight, int32_t length) {
85     return (uint32_t)(weight+(1UL<<(8*(4-length))));
86 }
87 
88 static U_INLINE uint32_t
decWeightTrail(uint32_t weight,int32_t length)89 decWeightTrail(uint32_t weight, int32_t length) {
90     return (uint32_t)(weight-(1UL<<(8*(4-length))));
91 }
92 
93 static U_INLINE uint32_t
incWeight(uint32_t weight,int32_t length,uint32_t maxByte)94 incWeight(uint32_t weight, int32_t length, uint32_t maxByte) {
95     uint32_t byte;
96 
97     for(;;) {
98         byte=getWeightByte(weight, length);
99         if(byte<maxByte) {
100             return setWeightByte(weight, length, byte+1);
101         } else {
102             /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
103             weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED);
104             --length;
105         }
106     }
107 }
108 
109 static U_INLINE int32_t
lengthenRange(WeightRange * range,uint32_t maxByte,uint32_t countBytes)110 lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) {
111     int32_t length;
112 
113     length=range->length2+1;
114     range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED);
115     range->end=setWeightTrail(range->end, length, maxByte);
116     range->count2*=countBytes;
117     range->length2=length;
118     return length;
119 }
120 
121 /* for uprv_sortArray: sort ranges in weight order */
122 static int32_t U_CALLCONV
compareRanges(const void *,const void * left,const void * right)123 compareRanges(const void * /*context*/, const void *left, const void *right) {
124     uint32_t l, r;
125 
126     l=((const WeightRange *)left)->start;
127     r=((const WeightRange *)right)->start;
128     if(l<r) {
129         return -1;
130     } else if(l>r) {
131         return 1;
132     } else {
133         return 0;
134     }
135 }
136 
137 /*
138  * take two CE weights and calculate the
139  * possible ranges of weights between the two limits, excluding them
140  * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
141  */
142 static U_INLINE int32_t
getWeightRanges(uint32_t lowerLimit,uint32_t upperLimit,uint32_t maxByte,uint32_t countBytes,WeightRange ranges[7])143 getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit,
144                 uint32_t maxByte, uint32_t countBytes,
145                 WeightRange ranges[7]) {
146     WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
147     uint32_t weight, trail;
148     int32_t length, lowerLength, upperLength, rangeCount;
149 
150     /* assume that both lowerLimit & upperLimit are not 0 */
151 
152     /* get the lengths of the limits */
153     lowerLength=lengthOfWeight(lowerLimit);
154     upperLength=lengthOfWeight(upperLimit);
155 
156 #ifdef UCOL_DEBUG
157     printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
158     printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
159 #endif
160 
161     if(lowerLimit>=upperLimit) {
162 #ifdef UCOL_DEBUG
163         printf("error: no space between lower & upper limits\n");
164 #endif
165         return 0;
166     }
167 
168     /* check that neither is a prefix of the other */
169     if(lowerLength<upperLength) {
170         if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
171 #ifdef UCOL_DEBUG
172             printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
173 #endif
174             return 0;
175         }
176     }
177     /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
178 
179     /* reset local variables */
180     uprv_memset(lower, 0, sizeof(lower));
181     uprv_memset(&middle, 0, sizeof(middle));
182     uprv_memset(upper, 0, sizeof(upper));
183 
184     /*
185      * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
186      * range     minimum length
187      * lower[4]  4
188      * lower[3]  3
189      * lower[2]  2
190      * middle    1
191      * upper[2]  2
192      * upper[3]  3
193      * upper[4]  4
194      *
195      * We are now going to calculate up to 7 ranges.
196      * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
197      */
198     weight=lowerLimit;
199     for(length=lowerLength; length>=2; --length) {
200         trail=getWeightTrail(weight, length);
201         if(trail<maxByte) {
202             lower[length].start=incWeightTrail(weight, length);
203             lower[length].end=setWeightTrail(weight, length, maxByte);
204             lower[length].length=length;
205             lower[length].count=maxByte-trail;
206         }
207         weight=truncateWeight(weight, length-1);
208     }
209     middle.start=incWeightTrail(weight, 1);
210 
211     weight=upperLimit;
212     for(length=upperLength; length>=2; --length) {
213         trail=getWeightTrail(weight, length);
214         if(trail>UCOL_BYTE_FIRST_TAILORED) {
215             upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED);
216             upper[length].end=decWeightTrail(weight, length);
217             upper[length].length=length;
218             upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED;
219         }
220         weight=truncateWeight(weight, length-1);
221     }
222     middle.end=decWeightTrail(weight, 1);
223 
224     /* set the middle range */
225     middle.length=1;
226     if(middle.end>=middle.start) {
227         middle.count=(int32_t)((middle.end-middle.start)>>24)+1;
228     } else {
229         /* eliminate overlaps */
230         uint32_t start, end;
231 
232         /* remove the middle range */
233         middle.count=0;
234 
235         /* reduce or remove the lower ranges that go beyond upperLimit */
236         for(length=4; length>=2; --length) {
237             if(lower[length].count>0 && upper[length].count>0) {
238                 start=upper[length].start;
239                 end=lower[length].end;
240 
241                 if(end>=start || incWeight(end, length, maxByte)==start) {
242                     /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
243                     start=lower[length].start;
244                     end=lower[length].end=upper[length].end;
245                     /*
246                      * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
247                      * it may result in a range with count>countBytes
248                      */
249                     lower[length].count=
250                         (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
251                                   countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
252                     upper[length].count=0;
253                     while(--length>=2) {
254                         lower[length].count=upper[length].count=0;
255                     }
256                     break;
257                 }
258             }
259         }
260     }
261 
262 #ifdef UCOL_DEBUG
263     /* print ranges */
264     for(length=4; length>=2; --length) {
265         if(lower[length].count>0) {
266             printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
267         }
268     }
269     if(middle.count>0) {
270         printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
271     }
272     for(length=2; length<=4; ++length) {
273         if(upper[length].count>0) {
274             printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
275         }
276     }
277 #endif
278 
279     /* copy the ranges, shortest first, into the result array */
280     rangeCount=0;
281     if(middle.count>0) {
282         uprv_memcpy(ranges, &middle, sizeof(WeightRange));
283         rangeCount=1;
284     }
285     for(length=2; length<=4; ++length) {
286         /* copy upper first so that later the middle range is more likely the first one to use */
287         if(upper[length].count>0) {
288             uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
289             ++rangeCount;
290         }
291         if(lower[length].count>0) {
292             uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
293             ++rangeCount;
294         }
295     }
296     return rangeCount;
297 }
298 
299 /*
300  * call getWeightRanges and then determine heuristically
301  * which ranges to use for a given number of weights between (excluding)
302  * two limits
303  */
304 U_CFUNC int32_t
ucol_allocWeights(uint32_t lowerLimit,uint32_t upperLimit,uint32_t n,uint32_t maxByte,WeightRange ranges[7])305 ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
306                   uint32_t n,
307                   uint32_t maxByte,
308                   WeightRange ranges[7]) {
309     /* number of usable byte values 3..maxByte */
310     uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1;
311 
312     uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */
313     uint32_t maxCount;
314     int32_t i, rangeCount, minLength/*, maxLength*/;
315 
316     /* countBytes to the power of index */
317     uint32_t powers[5];
318     /* gcc requires explicit initialization */
319     powers[0] = 1;
320     powers[1] = countBytes;
321     powers[2] = countBytes*countBytes;
322     powers[3] = countBytes*countBytes*countBytes;
323     powers[4] = countBytes*countBytes*countBytes*countBytes;
324 
325 #ifdef UCOL_DEBUG
326     puts("");
327 #endif
328 
329     rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges);
330     if(rangeCount<=0) {
331 #ifdef UCOL_DEBUG
332         printf("error: unable to get Weight ranges\n");
333 #endif
334         return 0;
335     }
336 
337     /* what is the maximum number of weights with these ranges? */
338     maxCount=0;
339     for(i=0; i<rangeCount; ++i) {
340         maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length];
341     }
342     if(maxCount>=n) {
343 #ifdef UCOL_DEBUG
344         printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n);
345 #endif
346     } else {
347 #ifdef UCOL_DEBUG
348         printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n);
349 #endif
350         return 0;
351     }
352 
353     /* set the length2 and count2 fields */
354     for(i=0; i<rangeCount; ++i) {
355         ranges[i].length2=ranges[i].length;
356         ranges[i].count2=(uint32_t)ranges[i].count;
357     }
358 
359     /* try until we find suitably large ranges */
360     for(;;) {
361         /* get the smallest number of bytes in a range */
362         minLength=ranges[0].length2;
363 
364         /* sum up the number of elements that fit into ranges of each byte length */
365         uprv_memset(lengthCounts, 0, sizeof(lengthCounts));
366         for(i=0; i<rangeCount; ++i) {
367             lengthCounts[ranges[i].length2]+=ranges[i].count2;
368         }
369 
370         /* now try to allocate n elements in the available short ranges */
371         if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) {
372             /* trivial cases, use the first few ranges */
373             maxCount=0;
374             rangeCount=0;
375             do {
376                 maxCount+=ranges[rangeCount].count2;
377                 ++rangeCount;
378             } while(n>maxCount);
379 #ifdef UCOL_DEBUG
380             printf("take first %ld ranges\n", rangeCount);
381 #endif
382             break;
383         } else if(n<=ranges[0].count2*countBytes) {
384             /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
385             uint32_t count1, count2, power_1, power;
386 
387             /*maxLength=minLength+1;*/
388 
389             /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
390             power_1=powers[minLength-ranges[0].length];
391             power=power_1*countBytes;
392             count2=(n+power-1)/power;
393             count1=ranges[0].count-count2;
394 
395             /* split the range */
396 #ifdef UCOL_DEBUG
397             printf("split the first range %ld:%ld\n", count1, count2);
398 #endif
399             if(count1<1) {
400                 rangeCount=1;
401 
402                 /* lengthen the entire range to maxLength */
403                 lengthenRange(ranges, maxByte, countBytes);
404             } else {
405                 /* really split the range */
406                 uint32_t byte;
407 
408                 /* create a new range with the end and initial and current length of the old one */
409                 rangeCount=2;
410                 ranges[1].end=ranges[0].end;
411                 ranges[1].length=ranges[0].length;
412                 ranges[1].length2=minLength;
413 
414                 /* set the end of the first range according to count1 */
415                 i=ranges[0].length;
416                 byte=getWeightByte(ranges[0].start, i)+count1-1;
417 
418                 /*
419                  * ranges[0].count and count1 may be >countBytes
420                  * from merging adjacent ranges;
421                  * byte>maxByte is possible
422                  */
423                 if(byte<=maxByte) {
424                     ranges[0].end=setWeightByte(ranges[0].start, i, byte);
425                 } else /* byte>maxByte */ {
426                     ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes);
427                 }
428 
429                 /* set the bytes in the end weight at length+1..length2 to maxByte */
430                 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
431                 ranges[0].end=truncateWeight(ranges[0].end, i)|
432                               ((byte>>(8*i))&(byte<<(8*(4-minLength))));
433 
434                 /* set the start of the second range to immediately follow the end of the first one */
435                 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
436 
437                 /* set the count values (informational) */
438                 ranges[0].count=count1;
439                 ranges[1].count=count2;
440 
441                 ranges[0].count2=count1*power_1;
442                 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */
443 
444                 /* lengthen the second range to maxLength */
445                 lengthenRange(ranges+1, maxByte, countBytes);
446             }
447             break;
448         }
449 
450         /* no good match, lengthen all minLength ranges and iterate */
451 #ifdef UCOL_DEBUG
452         printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
453 #endif
454         for(i=0; ranges[i].length2==minLength; ++i) {
455             lengthenRange(ranges+i, maxByte, countBytes);
456         }
457     }
458 
459     if(rangeCount>1) {
460         /* sort the ranges by weight values */
461         UErrorCode errorCode=U_ZERO_ERROR;
462         uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode);
463         /* ignore error code: we know that the internal sort function will not fail here */
464     }
465 
466 #ifdef UCOL_DEBUG
467     puts("final ranges:");
468     for(i=0; i<rangeCount; ++i) {
469         printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
470                i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2);
471     }
472 #endif
473 
474     /* set maxByte in ranges[0] for ucol_nextWeight() */
475     ranges[0].count=maxByte;
476 
477     return rangeCount;
478 }
479 
480 /*
481  * given a set of ranges calculated by ucol_allocWeights(),
482  * iterate through the weights
483  */
484 U_CFUNC uint32_t
ucol_nextWeight(WeightRange ranges[],int32_t * pRangeCount)485 ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
486     if(*pRangeCount<=0) {
487         return 0xffffffff;
488     } else {
489         uint32_t weight, maxByte;
490 
491         /* get maxByte from the .count field */
492         maxByte=ranges[0].count;
493 
494         /* get the next weight */
495         weight=ranges[0].start;
496         if(weight==ranges[0].end) {
497             /* this range is finished, remove it and move the following ones up */
498             if(--*pRangeCount>0) {
499                 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange));
500                 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */
501             }
502         } else {
503             /* increment the weight for the next value */
504             ranges[0].start=incWeight(weight, ranges[0].length2, maxByte);
505         }
506 
507         return weight;
508     }
509 }
510 
511 #if 0 // #ifdef UCOL_DEBUG
512 
513 static void
514 testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
515     WeightRange ranges[8];
516     int32_t rangeCount;
517 
518     rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges);
519     if(enumerate) {
520         uint32_t weight;
521 
522         while(n>0) {
523             weight=ucol_nextWeight(ranges, &rangeCount);
524             if(weight==0xffffffff) {
525                 printf("error: 0xffffffff with %lu more weights to go\n", n);
526                 break;
527             }
528             printf("    0x%08lx\n", weight);
529             --n;
530         }
531     }
532 }
533 
534 extern int
535 main(int argc, const char *argv[]) {
536 #if 0
537 #endif
538     testAlloc(0x364214fc, 0x44b87d23, 5, FALSE);
539     testAlloc(0x36421500, 0x44b87d23, 5, FALSE);
540     testAlloc(0x36421500, 0x44b87d23, 20, FALSE);
541     testAlloc(0x36421500, 0x44b87d23, 13700, FALSE);
542     testAlloc(0x36421500, 0x38b87d23, 1, FALSE);
543     testAlloc(0x36421500, 0x38b87d23, 20, FALSE);
544     testAlloc(0x36421500, 0x38b87d23, 200, TRUE);
545     testAlloc(0x36421500, 0x38b87d23, 13700, FALSE);
546     testAlloc(0x36421500, 0x37b87d23, 13700, FALSE);
547     testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE);
548     testAlloc(0x36421500, 0x36b87d23, 13700, FALSE);
549     testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE);
550     testAlloc(0x49000000, 0x4a600000, 13700, FALSE);
551     testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE);
552     testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE);
553     testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE);
554     testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE);
555     testAlloc(0xa0000000, 0xa0030000, 40000, FALSE);
556     testAlloc(0xa0031100, 0xa0030000, 40000, FALSE);
557 #if 0
558 #endif
559     return 0;
560 }
561 
562 #endif
563 
564 #endif /* #if !UCONFIG_NO_COLLATION */
565