1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: propsvec.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002feb22
14 * created by: Markus W. Scherer
15 *
16 * Store bits (Unicode character properties) in bit set vectors.
17 */
18
19 #include <stdlib.h>
20 #include "unicode/utypes.h"
21 #include "cmemory.h"
22 #include "utrie.h"
23 #include "utrie2.h"
24 #include "uarrsort.h"
25 #include "propsvec.h"
26
27 struct UPropsVectors {
28 uint32_t *v;
29 int32_t columns; /* number of columns, plus two for start & limit values */
30 int32_t maxRows;
31 int32_t rows;
32 int32_t prevRow; /* search optimization: remember last row seen */
33 UBool isCompacted;
34 };
35
36 #define UPVEC_INITIAL_ROWS (1<<12)
37 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
38 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
39
40 U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns,UErrorCode * pErrorCode)41 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
42 UPropsVectors *pv;
43 uint32_t *v, *row;
44 uint32_t cp;
45
46 if(U_FAILURE(*pErrorCode)) {
47 return NULL;
48 }
49 if(columns<1) {
50 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
51 return NULL;
52 }
53 columns+=2; /* count range start and limit columns */
54
55 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
56 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
57 if(pv==NULL || v==NULL) {
58 uprv_free(pv);
59 uprv_free(v);
60 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
61 return NULL;
62 }
63 uprv_memset(pv, 0, sizeof(UPropsVectors));
64 pv->v=v;
65 pv->columns=columns;
66 pv->maxRows=UPVEC_INITIAL_ROWS;
67 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
68
69 /* set the all-Unicode row and the special-value rows */
70 row=pv->v;
71 uprv_memset(row, 0, pv->rows*columns*4);
72 row[0]=0;
73 row[1]=0x110000;
74 row+=columns;
75 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
76 row[0]=cp;
77 row[1]=cp+1;
78 row+=columns;
79 }
80 return pv;
81 }
82
83 U_CAPI void U_EXPORT2
upvec_close(UPropsVectors * pv)84 upvec_close(UPropsVectors *pv) {
85 if(pv!=NULL) {
86 uprv_free(pv->v);
87 uprv_free(pv);
88 }
89 }
90
91 static uint32_t *
_findRow(UPropsVectors * pv,UChar32 rangeStart)92 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
93 uint32_t *row;
94 int32_t columns, i, start, limit, prevRow, rows;
95
96 columns=pv->columns;
97 rows=limit=pv->rows;
98 prevRow=pv->prevRow;
99
100 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
101 row=pv->v+prevRow*columns;
102 if(rangeStart>=(UChar32)row[0]) {
103 if(rangeStart<(UChar32)row[1]) {
104 /* same row as last seen */
105 return row;
106 } else if(rangeStart<(UChar32)(row+=columns)[1]) {
107 /* next row after the last one */
108 pv->prevRow=prevRow+1;
109 return row;
110 } else if(rangeStart<(UChar32)(row+=columns)[1]) {
111 /* second row after the last one */
112 pv->prevRow=prevRow+2;
113 return row;
114 } else if((rangeStart-(UChar32)row[1])<10) {
115 /* we are close, continue looping */
116 prevRow+=2;
117 do {
118 ++prevRow;
119 row+=columns;
120 } while(rangeStart>=(UChar32)row[1]);
121 pv->prevRow=prevRow;
122 return row;
123 }
124 } else if(rangeStart<(UChar32)pv->v[1]) {
125 /* the very first row */
126 pv->prevRow=0;
127 return pv->v;
128 }
129
130 /* do a binary search for the start of the range */
131 start=0;
132 while(start<limit-1) {
133 i=(start+limit)/2;
134 row=pv->v+i*columns;
135 if(rangeStart<(UChar32)row[0]) {
136 limit=i;
137 } else if(rangeStart<(UChar32)row[1]) {
138 pv->prevRow=i;
139 return row;
140 } else {
141 start=i;
142 }
143 }
144
145 /* must be found because all ranges together always cover all of Unicode */
146 pv->prevRow=start;
147 return pv->v+start*columns;
148 }
149
150 U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors * pv,UChar32 start,UChar32 end,int32_t column,uint32_t value,uint32_t mask,UErrorCode * pErrorCode)151 upvec_setValue(UPropsVectors *pv,
152 UChar32 start, UChar32 end,
153 int32_t column,
154 uint32_t value, uint32_t mask,
155 UErrorCode *pErrorCode) {
156 uint32_t *firstRow, *lastRow;
157 int32_t columns;
158 UChar32 limit;
159 UBool splitFirstRow, splitLastRow;
160
161 /* argument checking */
162 if(U_FAILURE(*pErrorCode)) {
163 return;
164 }
165 if( pv==NULL ||
166 start<0 || start>end || end>UPVEC_MAX_CP ||
167 column<0 || column>=(pv->columns-2)
168 ) {
169 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
170 return;
171 }
172 if(pv->isCompacted) {
173 *pErrorCode=U_NO_WRITE_PERMISSION;
174 return;
175 }
176 limit=end+1;
177
178 /* initialize */
179 columns=pv->columns;
180 column+=2; /* skip range start and limit columns */
181 value&=mask;
182
183 /* find the rows whose ranges overlap with the input range */
184
185 /* find the first and last rows, always successful */
186 firstRow=_findRow(pv, start);
187 lastRow=_findRow(pv, end);
188
189 /*
190 * Rows need to be split if they partially overlap with the
191 * input range (only possible for the first and last rows)
192 * and if their value differs from the input value.
193 */
194 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
195 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
196
197 /* split first/last rows if necessary */
198 if(splitFirstRow || splitLastRow) {
199 int32_t count, rows;
200
201 rows=pv->rows;
202 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
203 uint32_t *newVectors;
204 int32_t newMaxRows;
205
206 if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
207 newMaxRows=UPVEC_MEDIUM_ROWS;
208 } else if(pv->maxRows<UPVEC_MAX_ROWS) {
209 newMaxRows=UPVEC_MAX_ROWS;
210 } else {
211 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
212 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
213 return;
214 }
215 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
216 if(newVectors==NULL) {
217 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
218 return;
219 }
220 uprv_memcpy(newVectors, pv->v, rows*columns*4);
221 firstRow=newVectors+(firstRow-pv->v);
222 lastRow=newVectors+(lastRow-pv->v);
223 uprv_free(pv->v);
224 pv->v=newVectors;
225 pv->maxRows=newMaxRows;
226 }
227
228 /* count the number of row cells to move after the last row, and move them */
229 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
230 if(count>0) {
231 uprv_memmove(
232 lastRow+(1+splitFirstRow+splitLastRow)*columns,
233 lastRow+columns,
234 count*4);
235 }
236 pv->rows=rows+splitFirstRow+splitLastRow;
237
238 /* split the first row, and move the firstRow pointer to the second part */
239 if(splitFirstRow) {
240 /* copy all affected rows up one and move the lastRow pointer */
241 count = (int32_t)((lastRow-firstRow)+columns);
242 uprv_memmove(firstRow+columns, firstRow, count*4);
243 lastRow+=columns;
244
245 /* split the range and move the firstRow pointer */
246 firstRow[1]=firstRow[columns]=(uint32_t)start;
247 firstRow+=columns;
248 }
249
250 /* split the last row */
251 if(splitLastRow) {
252 /* copy the last row data */
253 uprv_memcpy(lastRow+columns, lastRow, columns*4);
254
255 /* split the range and move the firstRow pointer */
256 lastRow[1]=lastRow[columns]=(uint32_t)limit;
257 }
258 }
259
260 /* set the "row last seen" to the last row for the range */
261 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
262
263 /* set the input value in all remaining rows */
264 firstRow+=column;
265 lastRow+=column;
266 mask=~mask;
267 for(;;) {
268 *firstRow=(*firstRow&mask)|value;
269 if(firstRow==lastRow) {
270 break;
271 }
272 firstRow+=columns;
273 }
274 }
275
276 U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors * pv,UChar32 c,int32_t column)277 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
278 uint32_t *row;
279 UPropsVectors *ncpv;
280
281 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
282 return 0;
283 }
284 ncpv=(UPropsVectors *)pv;
285 row=_findRow(ncpv, c);
286 return row[2+column];
287 }
288
289 U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors * pv,int32_t rowIndex,UChar32 * pRangeStart,UChar32 * pRangeEnd)290 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
291 UChar32 *pRangeStart, UChar32 *pRangeEnd) {
292 uint32_t *row;
293 int32_t columns;
294
295 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
296 return NULL;
297 }
298
299 columns=pv->columns;
300 row=pv->v+rowIndex*columns;
301 if(pRangeStart!=NULL) {
302 *pRangeStart=(UChar32)row[0];
303 }
304 if(pRangeEnd!=NULL) {
305 *pRangeEnd=(UChar32)row[1]-1;
306 }
307 return row+2;
308 }
309
310 static int32_t U_CALLCONV
upvec_compareRows(const void * context,const void * l,const void * r)311 upvec_compareRows(const void *context, const void *l, const void *r) {
312 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
313 const UPropsVectors *pv=(const UPropsVectors *)context;
314 int32_t i, count, columns;
315
316 count=columns=pv->columns; /* includes start/limit columns */
317
318 /* start comparing after start/limit but wrap around to them */
319 i=2;
320 do {
321 if(left[i]!=right[i]) {
322 return left[i]<right[i] ? -1 : 1;
323 }
324 if(++i==columns) {
325 i=0;
326 }
327 } while(--count>0);
328
329 return 0;
330 }
331
332 U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors * pv,UPVecCompactHandler * handler,void * context,UErrorCode * pErrorCode)333 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
334 uint32_t *row;
335 int32_t i, columns, valueColumns, rows, count;
336 UChar32 start, limit;
337
338 /* argument checking */
339 if(U_FAILURE(*pErrorCode)) {
340 return;
341 }
342 if(handler==NULL) {
343 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
344 return;
345 }
346 if(pv->isCompacted) {
347 return;
348 }
349
350 /* Set the flag now: Sorting and compacting destroys the builder data structure. */
351 pv->isCompacted=TRUE;
352
353 rows=pv->rows;
354 columns=pv->columns;
355 valueColumns=columns-2; /* not counting start & limit */
356
357 /* sort the properties vectors to find unique vector values */
358 uprv_sortArray(pv->v, rows, columns*4,
359 upvec_compareRows, pv, FALSE, pErrorCode);
360 if(U_FAILURE(*pErrorCode)) {
361 return;
362 }
363
364 /*
365 * Find and set the special values.
366 * This has to do almost the same work as the compaction below,
367 * to find the indexes where the special-value rows will move.
368 */
369 row=pv->v;
370 count=-valueColumns;
371 for(i=0; i<rows; ++i) {
372 start=(UChar32)row[0];
373
374 /* count a new values vector if it is different from the current one */
375 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
376 count+=valueColumns;
377 }
378
379 if(start>=UPVEC_FIRST_SPECIAL_CP) {
380 handler(context, start, start, count, row+2, valueColumns, pErrorCode);
381 if(U_FAILURE(*pErrorCode)) {
382 return;
383 }
384 }
385
386 row+=columns;
387 }
388
389 /* count is at the beginning of the last vector, add valueColumns to include that last vector */
390 count+=valueColumns;
391
392 /* Call the handler once more to signal the start of delivering real values. */
393 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
394 count, row-valueColumns, valueColumns, pErrorCode);
395 if(U_FAILURE(*pErrorCode)) {
396 return;
397 }
398
399 /*
400 * Move vector contents up to a contiguous array with only unique
401 * vector values, and call the handler function for each vector.
402 *
403 * This destroys the Properties Vector structure and replaces it
404 * with an array of just vector values.
405 */
406 row=pv->v;
407 count=-valueColumns;
408 for(i=0; i<rows; ++i) {
409 /* fetch these first before memmove() may overwrite them */
410 start=(UChar32)row[0];
411 limit=(UChar32)row[1];
412
413 /* add a new values vector if it is different from the current one */
414 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
415 count+=valueColumns;
416 uprv_memmove(pv->v+count, row+2, valueColumns*4);
417 }
418
419 if(start<UPVEC_FIRST_SPECIAL_CP) {
420 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
421 if(U_FAILURE(*pErrorCode)) {
422 return;
423 }
424 }
425
426 row+=columns;
427 }
428
429 /* count is at the beginning of the last vector, add one to include that last vector */
430 pv->rows=count/valueColumns+1;
431 }
432
433 U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns)434 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
435 if(!pv->isCompacted) {
436 return NULL;
437 }
438 if(pRows!=NULL) {
439 *pRows=pv->rows;
440 }
441 if(pColumns!=NULL) {
442 *pColumns=pv->columns-2;
443 }
444 return pv->v;
445 }
446
447 U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns,UErrorCode * pErrorCode)448 upvec_cloneArray(const UPropsVectors *pv,
449 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
450 uint32_t *clonedArray;
451 int32_t byteLength;
452
453 if(U_FAILURE(*pErrorCode)) {
454 return NULL;
455 }
456 if(!pv->isCompacted) {
457 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
458 return NULL;
459 }
460 byteLength=pv->rows*(pv->columns-2)*4;
461 clonedArray=(uint32_t *)uprv_malloc(byteLength);
462 if(clonedArray==NULL) {
463 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
464 return NULL;
465 }
466 uprv_memcpy(clonedArray, pv->v, byteLength);
467 if(pRows!=NULL) {
468 *pRows=pv->rows;
469 }
470 if(pColumns!=NULL) {
471 *pColumns=pv->columns-2;
472 }
473 return clonedArray;
474 }
475
476 U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors * pv,UErrorCode * pErrorCode)477 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
478 UPVecToUTrie2Context toUTrie2={ NULL };
479 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
480 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
481 if(U_FAILURE(*pErrorCode)) {
482 utrie2_close(toUTrie2.trie);
483 toUTrie2.trie=NULL;
484 }
485 return toUTrie2.trie;
486 }
487
488 /*
489 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
490 * some 16-bit field and builds and returns a UTrie2.
491 */
492
493 U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void * context,UChar32 start,UChar32 end,int32_t rowIndex,uint32_t * row,int32_t columns,UErrorCode * pErrorCode)494 upvec_compactToUTrie2Handler(void *context,
495 UChar32 start, UChar32 end,
496 int32_t rowIndex, uint32_t *row, int32_t columns,
497 UErrorCode *pErrorCode) {
498 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
499 if(start<UPVEC_FIRST_SPECIAL_CP) {
500 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
501 } else {
502 switch(start) {
503 case UPVEC_INITIAL_VALUE_CP:
504 toUTrie2->initialValue=rowIndex;
505 break;
506 case UPVEC_ERROR_VALUE_CP:
507 toUTrie2->errorValue=rowIndex;
508 break;
509 case UPVEC_START_REAL_VALUES_CP:
510 toUTrie2->maxValue=rowIndex;
511 if(rowIndex>0xffff) {
512 /* too many rows for a 16-bit trie */
513 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
514 } else {
515 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
516 toUTrie2->errorValue, pErrorCode);
517 }
518 break;
519 default:
520 break;
521 }
522 }
523 }
524