1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: propsvec.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002feb22
14 * created by: Markus W. Scherer
15 *
16 * Store bits (Unicode character properties) in bit set vectors.
17 */
18
19 #include <stdlib.h>
20 #include "unicode/utypes.h"
21 #include "cmemory.h"
22 #include "utrie.h"
23 #include "utrie2.h"
24 #include "uarrsort.h"
25 #include "propsvec.h"
26 #include "uassert.h"
27
28 struct UPropsVectors {
29 uint32_t *v;
30 int32_t columns; /* number of columns, plus two for start & limit values */
31 int32_t maxRows;
32 int32_t rows;
33 int32_t prevRow; /* search optimization: remember last row seen */
34 UBool isCompacted;
35 };
36
37 #define UPVEC_INITIAL_ROWS (1<<12)
38 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
39 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
40
41 U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns,UErrorCode * pErrorCode)42 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
43 UPropsVectors *pv;
44 uint32_t *v, *row;
45 uint32_t cp;
46
47 if(U_FAILURE(*pErrorCode)) {
48 return NULL;
49 }
50 if(columns<1) {
51 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
52 return NULL;
53 }
54 columns+=2; /* count range start and limit columns */
55
56 pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
57 v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
58 if(pv==NULL || v==NULL) {
59 uprv_free(pv);
60 uprv_free(v);
61 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
62 return NULL;
63 }
64 uprv_memset(pv, 0, sizeof(UPropsVectors));
65 pv->v=v;
66 pv->columns=columns;
67 pv->maxRows=UPVEC_INITIAL_ROWS;
68 pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
69
70 /* set the all-Unicode row and the special-value rows */
71 row=pv->v;
72 uprv_memset(row, 0, pv->rows*columns*4);
73 row[0]=0;
74 row[1]=0x110000;
75 row+=columns;
76 for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
77 row[0]=cp;
78 row[1]=cp+1;
79 row+=columns;
80 }
81 return pv;
82 }
83
84 U_CAPI void U_EXPORT2
upvec_close(UPropsVectors * pv)85 upvec_close(UPropsVectors *pv) {
86 if(pv!=NULL) {
87 uprv_free(pv->v);
88 uprv_free(pv);
89 }
90 }
91
92 static uint32_t *
_findRow(UPropsVectors * pv,UChar32 rangeStart)93 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
94 uint32_t *row;
95 int32_t columns, i, start, limit, prevRow;
96
97 columns=pv->columns;
98 limit=pv->rows;
99 prevRow=pv->prevRow;
100
101 /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
102 row=pv->v+prevRow*columns;
103 if(rangeStart>=(UChar32)row[0]) {
104 if(rangeStart<(UChar32)row[1]) {
105 /* same row as last seen */
106 return row;
107 } else if(rangeStart<(UChar32)(row+=columns)[1]) {
108 /* next row after the last one */
109 pv->prevRow=prevRow+1;
110 return row;
111 } else if(rangeStart<(UChar32)(row+=columns)[1]) {
112 /* second row after the last one */
113 pv->prevRow=prevRow+2;
114 return row;
115 } else if((rangeStart-(UChar32)row[1])<10) {
116 /* we are close, continue looping */
117 prevRow+=2;
118 do {
119 ++prevRow;
120 row+=columns;
121 } while(rangeStart>=(UChar32)row[1]);
122 pv->prevRow=prevRow;
123 return row;
124 }
125 } else if(rangeStart<(UChar32)pv->v[1]) {
126 /* the very first row */
127 pv->prevRow=0;
128 return pv->v;
129 }
130
131 /* do a binary search for the start of the range */
132 start=0;
133 while(start<limit-1) {
134 i=(start+limit)/2;
135 row=pv->v+i*columns;
136 if(rangeStart<(UChar32)row[0]) {
137 limit=i;
138 } else if(rangeStart<(UChar32)row[1]) {
139 pv->prevRow=i;
140 return row;
141 } else {
142 start=i;
143 }
144 }
145
146 /* must be found because all ranges together always cover all of Unicode */
147 pv->prevRow=start;
148 return pv->v+start*columns;
149 }
150
151 U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors * pv,UChar32 start,UChar32 end,int32_t column,uint32_t value,uint32_t mask,UErrorCode * pErrorCode)152 upvec_setValue(UPropsVectors *pv,
153 UChar32 start, UChar32 end,
154 int32_t column,
155 uint32_t value, uint32_t mask,
156 UErrorCode *pErrorCode) {
157 uint32_t *firstRow, *lastRow;
158 int32_t columns;
159 UChar32 limit;
160 UBool splitFirstRow, splitLastRow;
161
162 /* argument checking */
163 if(U_FAILURE(*pErrorCode)) {
164 return;
165 }
166 if( pv==NULL ||
167 start<0 || start>end || end>UPVEC_MAX_CP ||
168 column<0 || column>=(pv->columns-2)
169 ) {
170 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
171 return;
172 }
173 if(pv->isCompacted) {
174 *pErrorCode=U_NO_WRITE_PERMISSION;
175 return;
176 }
177 limit=end+1;
178
179 /* initialize */
180 columns=pv->columns;
181 column+=2; /* skip range start and limit columns */
182 value&=mask;
183
184 /* find the rows whose ranges overlap with the input range */
185
186 /* find the first and last rows, always successful */
187 firstRow=_findRow(pv, start);
188 lastRow=_findRow(pv, end);
189
190 /*
191 * Rows need to be split if they partially overlap with the
192 * input range (only possible for the first and last rows)
193 * and if their value differs from the input value.
194 */
195 splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
196 splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
197
198 /* split first/last rows if necessary */
199 if(splitFirstRow || splitLastRow) {
200 int32_t count, rows;
201
202 rows=pv->rows;
203 if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
204 uint32_t *newVectors;
205 int32_t newMaxRows;
206
207 if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
208 newMaxRows=UPVEC_MEDIUM_ROWS;
209 } else if(pv->maxRows<UPVEC_MAX_ROWS) {
210 newMaxRows=UPVEC_MAX_ROWS;
211 } else {
212 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
213 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
214 return;
215 }
216 newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
217 if(newVectors==NULL) {
218 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
219 return;
220 }
221 uprv_memcpy(newVectors, pv->v, rows*columns*4);
222 firstRow=newVectors+(firstRow-pv->v);
223 lastRow=newVectors+(lastRow-pv->v);
224 uprv_free(pv->v);
225 pv->v=newVectors;
226 pv->maxRows=newMaxRows;
227 }
228
229 /* count the number of row cells to move after the last row, and move them */
230 count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
231 if(count>0) {
232 uprv_memmove(
233 lastRow+(1+splitFirstRow+splitLastRow)*columns,
234 lastRow+columns,
235 count*4);
236 }
237 pv->rows=rows+splitFirstRow+splitLastRow;
238
239 /* split the first row, and move the firstRow pointer to the second part */
240 if(splitFirstRow) {
241 /* copy all affected rows up one and move the lastRow pointer */
242 count = (int32_t)((lastRow-firstRow)+columns);
243 uprv_memmove(firstRow+columns, firstRow, count*4);
244 lastRow+=columns;
245
246 /* split the range and move the firstRow pointer */
247 firstRow[1]=firstRow[columns]=(uint32_t)start;
248 firstRow+=columns;
249 }
250
251 /* split the last row */
252 if(splitLastRow) {
253 /* copy the last row data */
254 uprv_memcpy(lastRow+columns, lastRow, columns*4);
255
256 /* split the range and move the firstRow pointer */
257 lastRow[1]=lastRow[columns]=(uint32_t)limit;
258 }
259 }
260
261 /* set the "row last seen" to the last row for the range */
262 pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
263
264 /* set the input value in all remaining rows */
265 firstRow+=column;
266 lastRow+=column;
267 mask=~mask;
268 for(;;) {
269 *firstRow=(*firstRow&mask)|value;
270 if(firstRow==lastRow) {
271 break;
272 }
273 firstRow+=columns;
274 }
275 }
276
277 U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors * pv,UChar32 c,int32_t column)278 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
279 uint32_t *row;
280 UPropsVectors *ncpv;
281
282 if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
283 return 0;
284 }
285 ncpv=(UPropsVectors *)pv;
286 row=_findRow(ncpv, c);
287 return row[2+column];
288 }
289
290 U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors * pv,int32_t rowIndex,UChar32 * pRangeStart,UChar32 * pRangeEnd)291 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
292 UChar32 *pRangeStart, UChar32 *pRangeEnd) {
293 uint32_t *row;
294 int32_t columns;
295
296 if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
297 return NULL;
298 }
299
300 columns=pv->columns;
301 row=pv->v+rowIndex*columns;
302 if(pRangeStart!=NULL) {
303 *pRangeStart=(UChar32)row[0];
304 }
305 if(pRangeEnd!=NULL) {
306 *pRangeEnd=(UChar32)row[1]-1;
307 }
308 return row+2;
309 }
310
311 static int32_t U_CALLCONV
upvec_compareRows(const void * context,const void * l,const void * r)312 upvec_compareRows(const void *context, const void *l, const void *r) {
313 const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
314 const UPropsVectors *pv=(const UPropsVectors *)context;
315 int32_t i, count, columns;
316
317 count=columns=pv->columns; /* includes start/limit columns */
318
319 /* start comparing after start/limit but wrap around to them */
320 i=2;
321 do {
322 if(left[i]!=right[i]) {
323 return left[i]<right[i] ? -1 : 1;
324 }
325 if(++i==columns) {
326 i=0;
327 }
328 } while(--count>0);
329
330 return 0;
331 }
332
333 U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors * pv,UPVecCompactHandler * handler,void * context,UErrorCode * pErrorCode)334 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
335 uint32_t *row;
336 int32_t i, columns, valueColumns, rows, count;
337 UChar32 start, limit;
338
339 /* argument checking */
340 if(U_FAILURE(*pErrorCode)) {
341 return;
342 }
343 if(handler==NULL) {
344 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
345 return;
346 }
347 if(pv->isCompacted) {
348 return;
349 }
350
351 /* Set the flag now: Sorting and compacting destroys the builder data structure. */
352 pv->isCompacted=TRUE;
353
354 rows=pv->rows;
355 columns=pv->columns;
356 U_ASSERT(columns>=3); /* upvec_open asserts this */
357 valueColumns=columns-2; /* not counting start & limit */
358
359 /* sort the properties vectors to find unique vector values */
360 uprv_sortArray(pv->v, rows, columns*4,
361 upvec_compareRows, pv, FALSE, pErrorCode);
362 if(U_FAILURE(*pErrorCode)) {
363 return;
364 }
365
366 /*
367 * Find and set the special values.
368 * This has to do almost the same work as the compaction below,
369 * to find the indexes where the special-value rows will move.
370 */
371 row=pv->v;
372 count=-valueColumns;
373 for(i=0; i<rows; ++i) {
374 start=(UChar32)row[0];
375
376 /* count a new values vector if it is different from the current one */
377 if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
378 count+=valueColumns;
379 }
380
381 if(start>=UPVEC_FIRST_SPECIAL_CP) {
382 handler(context, start, start, count, row+2, valueColumns, pErrorCode);
383 if(U_FAILURE(*pErrorCode)) {
384 return;
385 }
386 }
387
388 row+=columns;
389 }
390
391 /* count is at the beginning of the last vector, add valueColumns to include that last vector */
392 count+=valueColumns;
393
394 /* Call the handler once more to signal the start of delivering real values. */
395 handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
396 count, row-valueColumns, valueColumns, pErrorCode);
397 if(U_FAILURE(*pErrorCode)) {
398 return;
399 }
400
401 /*
402 * Move vector contents up to a contiguous array with only unique
403 * vector values, and call the handler function for each vector.
404 *
405 * This destroys the Properties Vector structure and replaces it
406 * with an array of just vector values.
407 */
408 row=pv->v;
409 count=-valueColumns;
410 for(i=0; i<rows; ++i) {
411 /* fetch these first before memmove() may overwrite them */
412 start=(UChar32)row[0];
413 limit=(UChar32)row[1];
414
415 /* add a new values vector if it is different from the current one */
416 if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
417 count+=valueColumns;
418 uprv_memmove(pv->v+count, row+2, valueColumns*4);
419 }
420
421 if(start<UPVEC_FIRST_SPECIAL_CP) {
422 handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
423 if(U_FAILURE(*pErrorCode)) {
424 return;
425 }
426 }
427
428 row+=columns;
429 }
430
431 /* count is at the beginning of the last vector, add one to include that last vector */
432 pv->rows=count/valueColumns+1;
433 }
434
435 U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns)436 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
437 if(!pv->isCompacted) {
438 return NULL;
439 }
440 if(pRows!=NULL) {
441 *pRows=pv->rows;
442 }
443 if(pColumns!=NULL) {
444 *pColumns=pv->columns-2;
445 }
446 return pv->v;
447 }
448
449 U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns,UErrorCode * pErrorCode)450 upvec_cloneArray(const UPropsVectors *pv,
451 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
452 uint32_t *clonedArray;
453 int32_t byteLength;
454
455 if(U_FAILURE(*pErrorCode)) {
456 return NULL;
457 }
458 if(!pv->isCompacted) {
459 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
460 return NULL;
461 }
462 byteLength=pv->rows*(pv->columns-2)*4;
463 clonedArray=(uint32_t *)uprv_malloc(byteLength);
464 if(clonedArray==NULL) {
465 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
466 return NULL;
467 }
468 uprv_memcpy(clonedArray, pv->v, byteLength);
469 if(pRows!=NULL) {
470 *pRows=pv->rows;
471 }
472 if(pColumns!=NULL) {
473 *pColumns=pv->columns-2;
474 }
475 return clonedArray;
476 }
477
478 U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors * pv,UErrorCode * pErrorCode)479 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
480 UPVecToUTrie2Context toUTrie2={ NULL };
481 upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
482 utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
483 if(U_FAILURE(*pErrorCode)) {
484 utrie2_close(toUTrie2.trie);
485 toUTrie2.trie=NULL;
486 }
487 return toUTrie2.trie;
488 }
489
490 /*
491 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
492 * some 16-bit field and builds and returns a UTrie2.
493 */
494
495 U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void * context,UChar32 start,UChar32 end,int32_t rowIndex,uint32_t * row,int32_t columns,UErrorCode * pErrorCode)496 upvec_compactToUTrie2Handler(void *context,
497 UChar32 start, UChar32 end,
498 int32_t rowIndex, uint32_t *row, int32_t columns,
499 UErrorCode *pErrorCode) {
500 UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
501 if(start<UPVEC_FIRST_SPECIAL_CP) {
502 utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
503 } else {
504 switch(start) {
505 case UPVEC_INITIAL_VALUE_CP:
506 toUTrie2->initialValue=rowIndex;
507 break;
508 case UPVEC_ERROR_VALUE_CP:
509 toUTrie2->errorValue=rowIndex;
510 break;
511 case UPVEC_START_REAL_VALUES_CP:
512 toUTrie2->maxValue=rowIndex;
513 if(rowIndex>0xffff) {
514 /* too many rows for a 16-bit trie */
515 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
516 } else {
517 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
518 toUTrie2->errorValue, pErrorCode);
519 }
520 break;
521 default:
522 break;
523 }
524 }
525 }
526