• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2002-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  propsvec.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2002feb22
14 *   created by: Markus W. Scherer
15 *
16 *   Store bits (Unicode character properties) in bit set vectors.
17 */
18 
19 #include <stdlib.h>
20 #include "unicode/utypes.h"
21 #include "cmemory.h"
22 #include "utrie.h"
23 #include "utrie2.h"
24 #include "uarrsort.h"
25 #include "propsvec.h"
26 
27 struct UPropsVectors {
28     uint32_t *v;
29     int32_t columns;  /* number of columns, plus two for start & limit values */
30     int32_t maxRows;
31     int32_t rows;
32     int32_t prevRow;  /* search optimization: remember last row seen */
33     UBool isCompacted;
34 };
35 
36 #define UPVEC_INITIAL_ROWS (1<<12)
37 #define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
38 #define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
39 
40 U_CAPI UPropsVectors * U_EXPORT2
upvec_open(int32_t columns,UErrorCode * pErrorCode)41 upvec_open(int32_t columns, UErrorCode *pErrorCode) {
42     UPropsVectors *pv;
43     uint32_t *v, *row;
44     uint32_t cp;
45 
46     if(U_FAILURE(*pErrorCode)) {
47         return NULL;
48     }
49     if(columns<1) {
50         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
51         return NULL;
52     }
53     columns+=2; /* count range start and limit columns */
54 
55     pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
56     v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
57     if(pv==NULL || v==NULL) {
58         uprv_free(pv);
59         uprv_free(v);
60         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
61         return NULL;
62     }
63     uprv_memset(pv, 0, sizeof(UPropsVectors));
64     pv->v=v;
65     pv->columns=columns;
66     pv->maxRows=UPVEC_INITIAL_ROWS;
67     pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
68 
69     /* set the all-Unicode row and the special-value rows */
70     row=pv->v;
71     uprv_memset(row, 0, pv->rows*columns*4);
72     row[0]=0;
73     row[1]=0x110000;
74     row+=columns;
75     for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
76         row[0]=cp;
77         row[1]=cp+1;
78         row+=columns;
79     }
80     return pv;
81 }
82 
83 U_CAPI void U_EXPORT2
upvec_close(UPropsVectors * pv)84 upvec_close(UPropsVectors *pv) {
85     if(pv!=NULL) {
86         uprv_free(pv->v);
87         uprv_free(pv);
88     }
89 }
90 
91 static uint32_t *
_findRow(UPropsVectors * pv,UChar32 rangeStart)92 _findRow(UPropsVectors *pv, UChar32 rangeStart) {
93     uint32_t *row;
94     int32_t columns, i, start, limit, prevRow, rows;
95 
96     columns=pv->columns;
97     rows=limit=pv->rows;
98     prevRow=pv->prevRow;
99 
100     /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
101     row=pv->v+prevRow*columns;
102     if(rangeStart>=(UChar32)row[0]) {
103         if(rangeStart<(UChar32)row[1]) {
104             /* same row as last seen */
105             return row;
106         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
107             /* next row after the last one */
108             pv->prevRow=prevRow+1;
109             return row;
110         } else if(rangeStart<(UChar32)(row+=columns)[1]) {
111             /* second row after the last one */
112             pv->prevRow=prevRow+2;
113             return row;
114         } else if((rangeStart-(UChar32)row[1])<10) {
115             /* we are close, continue looping */
116             prevRow+=2;
117             do {
118                 ++prevRow;
119                 row+=columns;
120             } while(rangeStart>=(UChar32)row[1]);
121             pv->prevRow=prevRow;
122             return row;
123         }
124     } else if(rangeStart<(UChar32)pv->v[1]) {
125         /* the very first row */
126         pv->prevRow=0;
127         return pv->v;
128     }
129 
130     /* do a binary search for the start of the range */
131     start=0;
132     while(start<limit-1) {
133         i=(start+limit)/2;
134         row=pv->v+i*columns;
135         if(rangeStart<(UChar32)row[0]) {
136             limit=i;
137         } else if(rangeStart<(UChar32)row[1]) {
138             pv->prevRow=i;
139             return row;
140         } else {
141             start=i;
142         }
143     }
144 
145     /* must be found because all ranges together always cover all of Unicode */
146     pv->prevRow=start;
147     return pv->v+start*columns;
148 }
149 
150 U_CAPI void U_EXPORT2
upvec_setValue(UPropsVectors * pv,UChar32 start,UChar32 end,int32_t column,uint32_t value,uint32_t mask,UErrorCode * pErrorCode)151 upvec_setValue(UPropsVectors *pv,
152                UChar32 start, UChar32 end,
153                int32_t column,
154                uint32_t value, uint32_t mask,
155                UErrorCode *pErrorCode) {
156     uint32_t *firstRow, *lastRow;
157     int32_t columns;
158     UChar32 limit;
159     UBool splitFirstRow, splitLastRow;
160 
161     /* argument checking */
162     if(U_FAILURE(*pErrorCode)) {
163         return;
164     }
165     if( pv==NULL ||
166         start<0 || start>end || end>UPVEC_MAX_CP ||
167         column<0 || column>=(pv->columns-2)
168     ) {
169         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
170         return;
171     }
172     if(pv->isCompacted) {
173         *pErrorCode=U_NO_WRITE_PERMISSION;
174         return;
175     }
176     limit=end+1;
177 
178     /* initialize */
179     columns=pv->columns;
180     column+=2; /* skip range start and limit columns */
181     value&=mask;
182 
183     /* find the rows whose ranges overlap with the input range */
184 
185     /* find the first and last rows, always successful */
186     firstRow=_findRow(pv, start);
187     lastRow=_findRow(pv, end);
188 
189     /*
190      * Rows need to be split if they partially overlap with the
191      * input range (only possible for the first and last rows)
192      * and if their value differs from the input value.
193      */
194     splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
195     splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
196 
197     /* split first/last rows if necessary */
198     if(splitFirstRow || splitLastRow) {
199         int32_t count, rows;
200 
201         rows=pv->rows;
202         if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
203             uint32_t *newVectors;
204             int32_t newMaxRows;
205 
206             if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
207                 newMaxRows=UPVEC_MEDIUM_ROWS;
208             } else if(pv->maxRows<UPVEC_MAX_ROWS) {
209                 newMaxRows=UPVEC_MAX_ROWS;
210             } else {
211                 /* Implementation bug, or UPVEC_MAX_ROWS too low. */
212                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
213                 return;
214             }
215             newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
216             if(newVectors==NULL) {
217                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
218                 return;
219             }
220             uprv_memcpy(newVectors, pv->v, rows*columns*4);
221             firstRow=newVectors+(firstRow-pv->v);
222             lastRow=newVectors+(lastRow-pv->v);
223             uprv_free(pv->v);
224             pv->v=newVectors;
225             pv->maxRows=newMaxRows;
226         }
227 
228         /* count the number of row cells to move after the last row, and move them */
229         count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
230         if(count>0) {
231             uprv_memmove(
232                 lastRow+(1+splitFirstRow+splitLastRow)*columns,
233                 lastRow+columns,
234                 count*4);
235         }
236         pv->rows=rows+splitFirstRow+splitLastRow;
237 
238         /* split the first row, and move the firstRow pointer to the second part */
239         if(splitFirstRow) {
240             /* copy all affected rows up one and move the lastRow pointer */
241             count = (int32_t)((lastRow-firstRow)+columns);
242             uprv_memmove(firstRow+columns, firstRow, count*4);
243             lastRow+=columns;
244 
245             /* split the range and move the firstRow pointer */
246             firstRow[1]=firstRow[columns]=(uint32_t)start;
247             firstRow+=columns;
248         }
249 
250         /* split the last row */
251         if(splitLastRow) {
252             /* copy the last row data */
253             uprv_memcpy(lastRow+columns, lastRow, columns*4);
254 
255             /* split the range and move the firstRow pointer */
256             lastRow[1]=lastRow[columns]=(uint32_t)limit;
257         }
258     }
259 
260     /* set the "row last seen" to the last row for the range */
261     pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
262 
263     /* set the input value in all remaining rows */
264     firstRow+=column;
265     lastRow+=column;
266     mask=~mask;
267     for(;;) {
268         *firstRow=(*firstRow&mask)|value;
269         if(firstRow==lastRow) {
270             break;
271         }
272         firstRow+=columns;
273     }
274 }
275 
276 U_CAPI uint32_t U_EXPORT2
upvec_getValue(const UPropsVectors * pv,UChar32 c,int32_t column)277 upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
278     uint32_t *row;
279     UPropsVectors *ncpv;
280 
281     if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
282         return 0;
283     }
284     ncpv=(UPropsVectors *)pv;
285     row=_findRow(ncpv, c);
286     return row[2+column];
287 }
288 
289 U_CAPI uint32_t * U_EXPORT2
upvec_getRow(const UPropsVectors * pv,int32_t rowIndex,UChar32 * pRangeStart,UChar32 * pRangeEnd)290 upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
291              UChar32 *pRangeStart, UChar32 *pRangeEnd) {
292     uint32_t *row;
293     int32_t columns;
294 
295     if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
296         return NULL;
297     }
298 
299     columns=pv->columns;
300     row=pv->v+rowIndex*columns;
301     if(pRangeStart!=NULL) {
302         *pRangeStart=(UChar32)row[0];
303     }
304     if(pRangeEnd!=NULL) {
305         *pRangeEnd=(UChar32)row[1]-1;
306     }
307     return row+2;
308 }
309 
310 static int32_t U_CALLCONV
upvec_compareRows(const void * context,const void * l,const void * r)311 upvec_compareRows(const void *context, const void *l, const void *r) {
312     const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
313     const UPropsVectors *pv=(const UPropsVectors *)context;
314     int32_t i, count, columns;
315 
316     count=columns=pv->columns; /* includes start/limit columns */
317 
318     /* start comparing after start/limit but wrap around to them */
319     i=2;
320     do {
321         if(left[i]!=right[i]) {
322             return left[i]<right[i] ? -1 : 1;
323         }
324         if(++i==columns) {
325             i=0;
326         }
327     } while(--count>0);
328 
329     return 0;
330 }
331 
332 U_CAPI void U_EXPORT2
upvec_compact(UPropsVectors * pv,UPVecCompactHandler * handler,void * context,UErrorCode * pErrorCode)333 upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
334     uint32_t *row;
335     int32_t i, columns, valueColumns, rows, count;
336     UChar32 start, limit;
337 
338     /* argument checking */
339     if(U_FAILURE(*pErrorCode)) {
340         return;
341     }
342     if(handler==NULL) {
343         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
344         return;
345     }
346     if(pv->isCompacted) {
347         return;
348     }
349 
350     /* Set the flag now: Sorting and compacting destroys the builder data structure. */
351     pv->isCompacted=TRUE;
352 
353     rows=pv->rows;
354     columns=pv->columns;
355     valueColumns=columns-2; /* not counting start & limit */
356 
357     /* sort the properties vectors to find unique vector values */
358     uprv_sortArray(pv->v, rows, columns*4,
359                    upvec_compareRows, pv, FALSE, pErrorCode);
360     if(U_FAILURE(*pErrorCode)) {
361         return;
362     }
363 
364     /*
365      * Find and set the special values.
366      * This has to do almost the same work as the compaction below,
367      * to find the indexes where the special-value rows will move.
368      */
369     row=pv->v;
370     count=-valueColumns;
371     for(i=0; i<rows; ++i) {
372         start=(UChar32)row[0];
373 
374         /* count a new values vector if it is different from the current one */
375         if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
376             count+=valueColumns;
377         }
378 
379         if(start>=UPVEC_FIRST_SPECIAL_CP) {
380             handler(context, start, start, count, row+2, valueColumns, pErrorCode);
381             if(U_FAILURE(*pErrorCode)) {
382                 return;
383             }
384         }
385 
386         row+=columns;
387     }
388 
389     /* count is at the beginning of the last vector, add valueColumns to include that last vector */
390     count+=valueColumns;
391 
392     /* Call the handler once more to signal the start of delivering real values. */
393     handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
394             count, row-valueColumns, valueColumns, pErrorCode);
395     if(U_FAILURE(*pErrorCode)) {
396         return;
397     }
398 
399     /*
400      * Move vector contents up to a contiguous array with only unique
401      * vector values, and call the handler function for each vector.
402      *
403      * This destroys the Properties Vector structure and replaces it
404      * with an array of just vector values.
405      */
406     row=pv->v;
407     count=-valueColumns;
408     for(i=0; i<rows; ++i) {
409         /* fetch these first before memmove() may overwrite them */
410         start=(UChar32)row[0];
411         limit=(UChar32)row[1];
412 
413         /* add a new values vector if it is different from the current one */
414         if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
415             count+=valueColumns;
416             uprv_memmove(pv->v+count, row+2, valueColumns*4);
417         }
418 
419         if(start<UPVEC_FIRST_SPECIAL_CP) {
420             handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
421             if(U_FAILURE(*pErrorCode)) {
422                 return;
423             }
424         }
425 
426         row+=columns;
427     }
428 
429     /* count is at the beginning of the last vector, add one to include that last vector */
430     pv->rows=count/valueColumns+1;
431 }
432 
433 U_CAPI const uint32_t * U_EXPORT2
upvec_getArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns)434 upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
435     if(!pv->isCompacted) {
436         return NULL;
437     }
438     if(pRows!=NULL) {
439         *pRows=pv->rows;
440     }
441     if(pColumns!=NULL) {
442         *pColumns=pv->columns-2;
443     }
444     return pv->v;
445 }
446 
447 U_CAPI uint32_t * U_EXPORT2
upvec_cloneArray(const UPropsVectors * pv,int32_t * pRows,int32_t * pColumns,UErrorCode * pErrorCode)448 upvec_cloneArray(const UPropsVectors *pv,
449                  int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
450     uint32_t *clonedArray;
451     int32_t byteLength;
452 
453     if(U_FAILURE(*pErrorCode)) {
454         return NULL;
455     }
456     if(!pv->isCompacted) {
457         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
458         return NULL;
459     }
460     byteLength=pv->rows*(pv->columns-2)*4;
461     clonedArray=(uint32_t *)uprv_malloc(byteLength);
462     if(clonedArray==NULL) {
463         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
464         return NULL;
465     }
466     uprv_memcpy(clonedArray, pv->v, byteLength);
467     if(pRows!=NULL) {
468         *pRows=pv->rows;
469     }
470     if(pColumns!=NULL) {
471         *pColumns=pv->columns-2;
472     }
473     return clonedArray;
474 }
475 
476 U_CAPI UTrie2 * U_EXPORT2
upvec_compactToUTrie2WithRowIndexes(UPropsVectors * pv,UErrorCode * pErrorCode)477 upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
478     UPVecToUTrie2Context toUTrie2={ NULL };
479     upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
480     utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
481     if(U_FAILURE(*pErrorCode)) {
482         utrie2_close(toUTrie2.trie);
483         toUTrie2.trie=NULL;
484     }
485     return toUTrie2.trie;
486 }
487 
488 /*
489  * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
490  * some 16-bit field and builds and returns a UTrie2.
491  */
492 
493 U_CAPI void U_CALLCONV
upvec_compactToUTrie2Handler(void * context,UChar32 start,UChar32 end,int32_t rowIndex,uint32_t * row,int32_t columns,UErrorCode * pErrorCode)494 upvec_compactToUTrie2Handler(void *context,
495                              UChar32 start, UChar32 end,
496                              int32_t rowIndex, uint32_t *row, int32_t columns,
497                              UErrorCode *pErrorCode) {
498     UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
499     if(start<UPVEC_FIRST_SPECIAL_CP) {
500         utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
501     } else {
502         switch(start) {
503         case UPVEC_INITIAL_VALUE_CP:
504             toUTrie2->initialValue=rowIndex;
505             break;
506         case UPVEC_ERROR_VALUE_CP:
507             toUTrie2->errorValue=rowIndex;
508             break;
509         case UPVEC_START_REAL_VALUES_CP:
510             toUTrie2->maxValue=rowIndex;
511             if(rowIndex>0xffff) {
512                 /* too many rows for a 16-bit trie */
513                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
514             } else {
515                 toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
516                                            toUTrie2->errorValue, pErrorCode);
517             }
518             break;
519         default:
520             break;
521         }
522     }
523 }
524