1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_wgt.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2001mar08
14 * created by: Markus W. Scherer
15 *
16 * This file contains code for allocating n collation element weights
17 * between two exclusive limits.
18 * It is used only internally by ucol_bld.
19 */
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_COLLATION
24
25 #include "ucol_imp.h"
26 #include "ucol_wgt.h"
27 #include "cmemory.h"
28 #include "uarrsort.h"
29
30 #ifdef UCOL_DEBUG
31 # include <stdio.h>
32 #endif
33
34 /* collation element weight allocation -------------------------------------- */
35
36 /* helper functions for CE weights */
37
38 static U_INLINE int32_t
lengthOfWeight(uint32_t weight)39 lengthOfWeight(uint32_t weight) {
40 if((weight&0xffffff)==0) {
41 return 1;
42 } else if((weight&0xffff)==0) {
43 return 2;
44 } else if((weight&0xff)==0) {
45 return 3;
46 } else {
47 return 4;
48 }
49 }
50
51 static U_INLINE uint32_t
getWeightTrail(uint32_t weight,int32_t length)52 getWeightTrail(uint32_t weight, int32_t length) {
53 return (uint32_t)(weight>>(8*(4-length)))&0xff;
54 }
55
56 static U_INLINE uint32_t
setWeightTrail(uint32_t weight,int32_t length,uint32_t trail)57 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
58 length=8*(4-length);
59 return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
60 }
61
62 static U_INLINE uint32_t
getWeightByte(uint32_t weight,int32_t idx)63 getWeightByte(uint32_t weight, int32_t idx) {
64 return getWeightTrail(weight, idx); /* same calculation */
65 }
66
67 static U_INLINE uint32_t
setWeightByte(uint32_t weight,int32_t idx,uint32_t byte)68 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
69 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
70
71 idx*=8;
72 mask=((uint32_t)0xffffffff)>>idx;
73 idx=32-idx;
74 mask|=0xffffff00<<idx;
75 return (uint32_t)((weight&mask)|(byte<<idx));
76 }
77
78 static U_INLINE uint32_t
truncateWeight(uint32_t weight,int32_t length)79 truncateWeight(uint32_t weight, int32_t length) {
80 return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
81 }
82
83 static U_INLINE uint32_t
incWeightTrail(uint32_t weight,int32_t length)84 incWeightTrail(uint32_t weight, int32_t length) {
85 return (uint32_t)(weight+(1UL<<(8*(4-length))));
86 }
87
88 static U_INLINE uint32_t
decWeightTrail(uint32_t weight,int32_t length)89 decWeightTrail(uint32_t weight, int32_t length) {
90 return (uint32_t)(weight-(1UL<<(8*(4-length))));
91 }
92
93 static U_INLINE uint32_t
incWeight(uint32_t weight,int32_t length,uint32_t maxByte)94 incWeight(uint32_t weight, int32_t length, uint32_t maxByte) {
95 uint32_t byte;
96
97 for(;;) {
98 byte=getWeightByte(weight, length);
99 if(byte<maxByte) {
100 return setWeightByte(weight, length, byte+1);
101 } else {
102 /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
103 weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED);
104 --length;
105 }
106 }
107 }
108
109 static U_INLINE int32_t
lengthenRange(WeightRange * range,uint32_t maxByte,uint32_t countBytes)110 lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) {
111 int32_t length;
112
113 length=range->length2+1;
114 range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED);
115 range->end=setWeightTrail(range->end, length, maxByte);
116 range->count2*=countBytes;
117 range->length2=length;
118 return length;
119 }
120
121 /* for uprv_sortArray: sort ranges in weight order */
122 static int32_t U_CALLCONV
compareRanges(const void *,const void * left,const void * right)123 compareRanges(const void * /*context*/, const void *left, const void *right) {
124 uint32_t l, r;
125
126 l=((const WeightRange *)left)->start;
127 r=((const WeightRange *)right)->start;
128 if(l<r) {
129 return -1;
130 } else if(l>r) {
131 return 1;
132 } else {
133 return 0;
134 }
135 }
136
137 /*
138 * take two CE weights and calculate the
139 * possible ranges of weights between the two limits, excluding them
140 * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
141 */
142 static U_INLINE int32_t
getWeightRanges(uint32_t lowerLimit,uint32_t upperLimit,uint32_t maxByte,uint32_t countBytes,WeightRange ranges[7])143 getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit,
144 uint32_t maxByte, uint32_t countBytes,
145 WeightRange ranges[7]) {
146 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
147 uint32_t weight, trail;
148 int32_t length, lowerLength, upperLength, rangeCount;
149
150 /* assume that both lowerLimit & upperLimit are not 0 */
151
152 /* get the lengths of the limits */
153 lowerLength=lengthOfWeight(lowerLimit);
154 upperLength=lengthOfWeight(upperLimit);
155
156 #ifdef UCOL_DEBUG
157 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
158 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
159 #endif
160
161 if(lowerLimit>=upperLimit) {
162 #ifdef UCOL_DEBUG
163 printf("error: no space between lower & upper limits\n");
164 #endif
165 return 0;
166 }
167
168 /* check that neither is a prefix of the other */
169 if(lowerLength<upperLength) {
170 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
171 #ifdef UCOL_DEBUG
172 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
173 #endif
174 return 0;
175 }
176 }
177 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
178
179 /* reset local variables */
180 uprv_memset(lower, 0, sizeof(lower));
181 uprv_memset(&middle, 0, sizeof(middle));
182 uprv_memset(upper, 0, sizeof(upper));
183
184 /*
185 * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
186 * range minimum length
187 * lower[4] 4
188 * lower[3] 3
189 * lower[2] 2
190 * middle 1
191 * upper[2] 2
192 * upper[3] 3
193 * upper[4] 4
194 *
195 * We are now going to calculate up to 7 ranges.
196 * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
197 */
198 weight=lowerLimit;
199 for(length=lowerLength; length>=2; --length) {
200 trail=getWeightTrail(weight, length);
201 if(trail<maxByte) {
202 lower[length].start=incWeightTrail(weight, length);
203 lower[length].end=setWeightTrail(weight, length, maxByte);
204 lower[length].length=length;
205 lower[length].count=maxByte-trail;
206 }
207 weight=truncateWeight(weight, length-1);
208 }
209 middle.start=incWeightTrail(weight, 1);
210
211 weight=upperLimit;
212 for(length=upperLength; length>=2; --length) {
213 trail=getWeightTrail(weight, length);
214 if(trail>UCOL_BYTE_FIRST_TAILORED) {
215 upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED);
216 upper[length].end=decWeightTrail(weight, length);
217 upper[length].length=length;
218 upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED;
219 }
220 weight=truncateWeight(weight, length-1);
221 }
222 middle.end=decWeightTrail(weight, 1);
223
224 /* set the middle range */
225 middle.length=1;
226 if(middle.end>=middle.start) {
227 middle.count=(int32_t)((middle.end-middle.start)>>24)+1;
228 } else {
229 /* eliminate overlaps */
230 uint32_t start, end;
231
232 /* remove the middle range */
233 middle.count=0;
234
235 /* reduce or remove the lower ranges that go beyond upperLimit */
236 for(length=4; length>=2; --length) {
237 if(lower[length].count>0 && upper[length].count>0) {
238 start=upper[length].start;
239 end=lower[length].end;
240
241 if(end>=start || incWeight(end, length, maxByte)==start) {
242 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
243 start=lower[length].start;
244 end=lower[length].end=upper[length].end;
245 /*
246 * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
247 * it may result in a range with count>countBytes
248 */
249 lower[length].count=
250 (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
251 countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
252 upper[length].count=0;
253 while(--length>=2) {
254 lower[length].count=upper[length].count=0;
255 }
256 break;
257 }
258 }
259 }
260 }
261
262 #ifdef UCOL_DEBUG
263 /* print ranges */
264 for(length=4; length>=2; --length) {
265 if(lower[length].count>0) {
266 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
267 }
268 }
269 if(middle.count>0) {
270 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
271 }
272 for(length=2; length<=4; ++length) {
273 if(upper[length].count>0) {
274 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
275 }
276 }
277 #endif
278
279 /* copy the ranges, shortest first, into the result array */
280 rangeCount=0;
281 if(middle.count>0) {
282 uprv_memcpy(ranges, &middle, sizeof(WeightRange));
283 rangeCount=1;
284 }
285 for(length=2; length<=4; ++length) {
286 /* copy upper first so that later the middle range is more likely the first one to use */
287 if(upper[length].count>0) {
288 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
289 ++rangeCount;
290 }
291 if(lower[length].count>0) {
292 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
293 ++rangeCount;
294 }
295 }
296 return rangeCount;
297 }
298
299 /*
300 * call getWeightRanges and then determine heuristically
301 * which ranges to use for a given number of weights between (excluding)
302 * two limits
303 */
304 U_CFUNC int32_t
ucol_allocWeights(uint32_t lowerLimit,uint32_t upperLimit,uint32_t n,uint32_t maxByte,WeightRange ranges[7])305 ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
306 uint32_t n,
307 uint32_t maxByte,
308 WeightRange ranges[7]) {
309 /* number of usable byte values 3..maxByte */
310 uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1;
311
312 uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */
313 uint32_t maxCount;
314 int32_t i, rangeCount, minLength/*, maxLength*/;
315
316 /* countBytes to the power of index */
317 uint32_t powers[5];
318 /* gcc requires explicit initialization */
319 powers[0] = 1;
320 powers[1] = countBytes;
321 powers[2] = countBytes*countBytes;
322 powers[3] = countBytes*countBytes*countBytes;
323 powers[4] = countBytes*countBytes*countBytes*countBytes;
324
325 #ifdef UCOL_DEBUG
326 puts("");
327 #endif
328
329 rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges);
330 if(rangeCount<=0) {
331 #ifdef UCOL_DEBUG
332 printf("error: unable to get Weight ranges\n");
333 #endif
334 return 0;
335 }
336
337 /* what is the maximum number of weights with these ranges? */
338 maxCount=0;
339 for(i=0; i<rangeCount; ++i) {
340 maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length];
341 }
342 if(maxCount>=n) {
343 #ifdef UCOL_DEBUG
344 printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n);
345 #endif
346 } else {
347 #ifdef UCOL_DEBUG
348 printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n);
349 #endif
350 return 0;
351 }
352
353 /* set the length2 and count2 fields */
354 for(i=0; i<rangeCount; ++i) {
355 ranges[i].length2=ranges[i].length;
356 ranges[i].count2=(uint32_t)ranges[i].count;
357 }
358
359 /* try until we find suitably large ranges */
360 for(;;) {
361 /* get the smallest number of bytes in a range */
362 minLength=ranges[0].length2;
363
364 /* sum up the number of elements that fit into ranges of each byte length */
365 uprv_memset(lengthCounts, 0, sizeof(lengthCounts));
366 for(i=0; i<rangeCount; ++i) {
367 lengthCounts[ranges[i].length2]+=ranges[i].count2;
368 }
369
370 /* now try to allocate n elements in the available short ranges */
371 if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) {
372 /* trivial cases, use the first few ranges */
373 maxCount=0;
374 rangeCount=0;
375 do {
376 maxCount+=ranges[rangeCount].count2;
377 ++rangeCount;
378 } while(n>maxCount);
379 #ifdef UCOL_DEBUG
380 printf("take first %ld ranges\n", rangeCount);
381 #endif
382 break;
383 } else if(n<=ranges[0].count2*countBytes) {
384 /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
385 uint32_t count1, count2, power_1, power;
386
387 /*maxLength=minLength+1;*/
388
389 /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
390 power_1=powers[minLength-ranges[0].length];
391 power=power_1*countBytes;
392 count2=(n+power-1)/power;
393 count1=ranges[0].count-count2;
394
395 /* split the range */
396 #ifdef UCOL_DEBUG
397 printf("split the first range %ld:%ld\n", count1, count2);
398 #endif
399 if(count1<1) {
400 rangeCount=1;
401
402 /* lengthen the entire range to maxLength */
403 lengthenRange(ranges, maxByte, countBytes);
404 } else {
405 /* really split the range */
406 uint32_t byte;
407
408 /* create a new range with the end and initial and current length of the old one */
409 rangeCount=2;
410 ranges[1].end=ranges[0].end;
411 ranges[1].length=ranges[0].length;
412 ranges[1].length2=minLength;
413
414 /* set the end of the first range according to count1 */
415 i=ranges[0].length;
416 byte=getWeightByte(ranges[0].start, i)+count1-1;
417
418 /*
419 * ranges[0].count and count1 may be >countBytes
420 * from merging adjacent ranges;
421 * byte>maxByte is possible
422 */
423 if(byte<=maxByte) {
424 ranges[0].end=setWeightByte(ranges[0].start, i, byte);
425 } else /* byte>maxByte */ {
426 ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes);
427 }
428
429 /* set the bytes in the end weight at length+1..length2 to maxByte */
430 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
431 ranges[0].end=truncateWeight(ranges[0].end, i)|
432 ((byte>>(8*i))&(byte<<(8*(4-minLength))));
433
434 /* set the start of the second range to immediately follow the end of the first one */
435 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
436
437 /* set the count values (informational) */
438 ranges[0].count=count1;
439 ranges[1].count=count2;
440
441 ranges[0].count2=count1*power_1;
442 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */
443
444 /* lengthen the second range to maxLength */
445 lengthenRange(ranges+1, maxByte, countBytes);
446 }
447 break;
448 }
449
450 /* no good match, lengthen all minLength ranges and iterate */
451 #ifdef UCOL_DEBUG
452 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
453 #endif
454 for(i=0; ranges[i].length2==minLength; ++i) {
455 lengthenRange(ranges+i, maxByte, countBytes);
456 }
457 }
458
459 if(rangeCount>1) {
460 /* sort the ranges by weight values */
461 UErrorCode errorCode=U_ZERO_ERROR;
462 uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode);
463 /* ignore error code: we know that the internal sort function will not fail here */
464 }
465
466 #ifdef UCOL_DEBUG
467 puts("final ranges:");
468 for(i=0; i<rangeCount; ++i) {
469 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
470 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2);
471 }
472 #endif
473
474 /* set maxByte in ranges[0] for ucol_nextWeight() */
475 ranges[0].count=maxByte;
476
477 return rangeCount;
478 }
479
480 /*
481 * given a set of ranges calculated by ucol_allocWeights(),
482 * iterate through the weights
483 */
484 U_CFUNC uint32_t
ucol_nextWeight(WeightRange ranges[],int32_t * pRangeCount)485 ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
486 if(*pRangeCount<=0) {
487 return 0xffffffff;
488 } else {
489 uint32_t weight, maxByte;
490
491 /* get maxByte from the .count field */
492 maxByte=ranges[0].count;
493
494 /* get the next weight */
495 weight=ranges[0].start;
496 if(weight==ranges[0].end) {
497 /* this range is finished, remove it and move the following ones up */
498 if(--*pRangeCount>0) {
499 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange));
500 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */
501 }
502 } else {
503 /* increment the weight for the next value */
504 ranges[0].start=incWeight(weight, ranges[0].length2, maxByte);
505 }
506
507 return weight;
508 }
509 }
510
511 #if 0 // #ifdef UCOL_DEBUG
512
513 static void
514 testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
515 WeightRange ranges[8];
516 int32_t rangeCount;
517
518 rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges);
519 if(enumerate) {
520 uint32_t weight;
521
522 while(n>0) {
523 weight=ucol_nextWeight(ranges, &rangeCount);
524 if(weight==0xffffffff) {
525 printf("error: 0xffffffff with %lu more weights to go\n", n);
526 break;
527 }
528 printf(" 0x%08lx\n", weight);
529 --n;
530 }
531 }
532 }
533
534 extern int
535 main(int argc, const char *argv[]) {
536 #if 0
537 #endif
538 testAlloc(0x364214fc, 0x44b87d23, 5, FALSE);
539 testAlloc(0x36421500, 0x44b87d23, 5, FALSE);
540 testAlloc(0x36421500, 0x44b87d23, 20, FALSE);
541 testAlloc(0x36421500, 0x44b87d23, 13700, FALSE);
542 testAlloc(0x36421500, 0x38b87d23, 1, FALSE);
543 testAlloc(0x36421500, 0x38b87d23, 20, FALSE);
544 testAlloc(0x36421500, 0x38b87d23, 200, TRUE);
545 testAlloc(0x36421500, 0x38b87d23, 13700, FALSE);
546 testAlloc(0x36421500, 0x37b87d23, 13700, FALSE);
547 testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE);
548 testAlloc(0x36421500, 0x36b87d23, 13700, FALSE);
549 testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE);
550 testAlloc(0x49000000, 0x4a600000, 13700, FALSE);
551 testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE);
552 testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE);
553 testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE);
554 testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE);
555 testAlloc(0xa0000000, 0xa0030000, 40000, FALSE);
556 testAlloc(0xa0031100, 0xa0030000, 40000, FALSE);
557 #if 0
558 #endif
559 return 0;
560 }
561
562 #endif
563
564 #endif /* #if !UCONFIG_NO_COLLATION */
565