• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2003-2008, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  unorm_it.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2003jan21
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
20 
21 #include "unicode/uiter.h"
22 #include "unicode/unorm.h"
23 #include "unorm_it.h"
24 #include "cmemory.h"
25 
26 /* UNormIterator ------------------------------------------------------------ */
27 
28 enum {
29     INITIAL_CAPACITY=100
30 };
31 
32 struct UNormIterator {
33     UCharIterator api;
34     UCharIterator *iter;
35 
36     /*
37      * chars and states either use the static buffers
38      * or are allocated in the same memory block
39      *
40      * They are parallel arrays with states[] holding the getState() values
41      * from normalization boundaries, and UITER_NO_STATE in between.
42      */
43     UChar *chars;
44     uint32_t *states;
45 
46     /*
47      * api.start: first valid character & state in the arrays
48      * api.index: current position
49      * api.limit: one past the last valid character in chars[], but states[limit] is valid
50      * capacity: length of allocated arrays
51      */
52     int32_t capacity;
53 
54     /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
55     uint32_t state;
56 
57     /* there are UChars available before start or after limit? */
58     UBool hasPrevious, hasNext, isStackAllocated;
59 
60     UNormalizationMode mode;
61 
62     UChar charsBuffer[INITIAL_CAPACITY];
63     uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
64 };
65 
66 static void
initIndexes(UNormIterator * uni,UCharIterator * iter)67 initIndexes(UNormIterator *uni, UCharIterator *iter) {
68     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
69     UCharIterator *api=&uni->api;
70 
71     if(!iter->hasPrevious(iter)) {
72         /* set indexes to the beginning of the arrays */
73         api->start=api->index=api->limit=0;
74         uni->hasPrevious=FALSE;
75         uni->hasNext=iter->hasNext(iter);
76     } else if(!iter->hasNext(iter)) {
77         /* set indexes to the end of the arrays */
78         api->start=api->index=api->limit=uni->capacity;
79         uni->hasNext=FALSE;
80         uni->hasPrevious=iter->hasPrevious(iter);
81     } else {
82         /* set indexes into the middle of the arrays */
83         api->start=api->index=api->limit=uni->capacity/2;
84         uni->hasPrevious=uni->hasNext=TRUE;
85     }
86 }
87 
88 static UBool
reallocArrays(UNormIterator * uni,int32_t capacity,UBool addAtStart)89 reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
90     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
91     UCharIterator *api=&uni->api;
92 
93     uint32_t *states;
94     UChar *chars;
95     int32_t start, limit;
96 
97     states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2);
98     if(states==NULL) {
99         return FALSE;
100     }
101 
102     chars=(UChar *)(states+(capacity+1));
103     uni->capacity=capacity;
104 
105     start=api->start;
106     limit=api->limit;
107 
108     if(addAtStart) {
109         /* copy old contents to the end of the new arrays */
110         int32_t delta;
111 
112         delta=capacity-uni->capacity;
113         uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4);
114         uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4);
115 
116         api->start=start+delta;
117         api->index+=delta;
118         api->limit=limit+delta;
119     } else {
120         /* copy old contents to the beginning of the new arrays */
121         uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4);
122         uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4);
123     }
124 
125     uni->chars=chars;
126     uni->states=states;
127 
128     return TRUE;
129 }
130 
131 static void
moveContentsTowardStart(UCharIterator * api,UChar chars[],uint32_t states[],int32_t delta)132 moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
133     /* move array contents up to make room */
134     int32_t srcIndex, destIndex, limit;
135 
136     limit=api->limit;
137     srcIndex=delta;
138     if(srcIndex>api->start) {
139         /* look for a position in the arrays with a known state */
140         while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
141             ++srcIndex;
142         }
143     }
144 
145     /* now actually move the array contents */
146     api->start=destIndex=0;
147     while(srcIndex<limit) {
148         chars[destIndex]=chars[srcIndex];
149         states[destIndex++]=states[srcIndex++];
150     }
151 
152     /* copy states[limit] as well! */
153     states[destIndex]=states[srcIndex];
154 
155     api->limit=destIndex;
156 }
157 
158 static void
moveContentsTowardEnd(UCharIterator * api,UChar chars[],uint32_t states[],int32_t delta)159 moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
160     /* move array contents up to make room */
161     int32_t srcIndex, destIndex, start;
162 
163     start=api->start;
164     destIndex=((UNormIterator *)api)->capacity;
165     srcIndex=destIndex-delta;
166     if(srcIndex<api->limit) {
167         /* look for a position in the arrays with a known state */
168         while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
169             --srcIndex;
170         }
171     }
172 
173     /* now actually move the array contents */
174     api->limit=destIndex;
175 
176     /* copy states[limit] as well! */
177     states[destIndex]=states[srcIndex];
178 
179     while(srcIndex>start) {
180         chars[--destIndex]=chars[--srcIndex];
181         states[destIndex]=states[srcIndex];
182     }
183 
184     api->start=destIndex;
185 }
186 
187 /* normalize forward from the limit, assume hasNext is true */
188 static UBool
readNext(UNormIterator * uni,UCharIterator * iter)189 readNext(UNormIterator *uni, UCharIterator *iter) {
190     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
191     UCharIterator *api=&uni->api;
192 
193     /* make capacity/4 room at the end of the arrays */
194     int32_t limit, capacity, room;
195     UErrorCode errorCode;
196 
197     limit=api->limit;
198     capacity=uni->capacity;
199     room=capacity/4;
200     if(room>(capacity-limit)) {
201         /* move array contents to make room */
202         moveContentsTowardStart(api, uni->chars, uni->states, room);
203         api->index=limit=api->limit;
204         uni->hasPrevious=TRUE;
205     }
206 
207     /* normalize starting from the limit position */
208     errorCode=U_ZERO_ERROR;
209     if(uni->state!=uni->states[limit]) {
210         uiter_setState(iter, uni->states[limit], &errorCode);
211         if(U_FAILURE(errorCode)) {
212             uni->state=UITER_NO_STATE;
213             uni->hasNext=FALSE;
214             return FALSE;
215         }
216     }
217 
218     room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
219     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
220         if(room<=capacity) {
221             /* empty and re-use the arrays */
222             uni->states[0]=uni->states[limit];
223             api->start=api->index=api->limit=limit=0;
224             uni->hasPrevious=TRUE;
225         } else {
226             capacity+=room+100;
227             if(!reallocArrays(uni, capacity, FALSE)) {
228                 uni->state=UITER_NO_STATE;
229                 uni->hasNext=FALSE;
230                 return FALSE;
231             }
232             limit=api->limit;
233         }
234 
235         errorCode=U_ZERO_ERROR;
236         uiter_setState(iter, uni->states[limit], &errorCode);
237         room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
238     }
239     if(U_FAILURE(errorCode) || room==0) {
240         uni->state=UITER_NO_STATE;
241         uni->hasNext=FALSE;
242         return FALSE;
243     }
244 
245     /* room>0 */
246     ++limit; /* leave the known states[limit] alone */
247     for(--room; room>0; --room) {
248         /* set unknown states for all but the normalization boundaries */
249         uni->states[limit++]=UITER_NO_STATE;
250     }
251     uni->states[limit]=uni->state=uiter_getState(iter);
252     uni->hasNext=iter->hasNext(iter);
253     api->limit=limit;
254     return TRUE;
255 }
256 
257 /* normalize backward from the start, assume hasPrevious is true */
258 static UBool
readPrevious(UNormIterator * uni,UCharIterator * iter)259 readPrevious(UNormIterator *uni, UCharIterator *iter) {
260     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
261     UCharIterator *api=&uni->api;
262 
263     /* make capacity/4 room at the start of the arrays */
264     int32_t start, capacity, room;
265     UErrorCode errorCode;
266 
267     start=api->start;
268     capacity=uni->capacity;
269     room=capacity/4;
270     if(room>start) {
271         /* move array contents to make room */
272         moveContentsTowardEnd(api, uni->chars, uni->states, room);
273         api->index=start=api->start;
274         uni->hasNext=TRUE;
275     }
276 
277     /* normalize ending at the start position */
278     errorCode=U_ZERO_ERROR;
279     if(uni->state!=uni->states[start]) {
280         uiter_setState(iter, uni->states[start], &errorCode);
281         if(U_FAILURE(errorCode)) {
282             uni->state=UITER_NO_STATE;
283             uni->hasPrevious=FALSE;
284             return FALSE;
285         }
286     }
287 
288     room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
289     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
290         if(room<=capacity) {
291             /* empty and re-use the arrays */
292             uni->states[capacity]=uni->states[start];
293             api->start=api->index=api->limit=start=capacity;
294             uni->hasNext=TRUE;
295         } else {
296             capacity+=room+100;
297             if(!reallocArrays(uni, capacity, TRUE)) {
298                 uni->state=UITER_NO_STATE;
299                 uni->hasPrevious=FALSE;
300                 return FALSE;
301             }
302             start=api->start;
303         }
304 
305         errorCode=U_ZERO_ERROR;
306         uiter_setState(iter, uni->states[start], &errorCode);
307         room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
308     }
309     if(U_FAILURE(errorCode) || room==0) {
310         uni->state=UITER_NO_STATE;
311         uni->hasPrevious=FALSE;
312         return FALSE;
313     }
314 
315     /* room>0 */
316     do {
317         /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
318         uni->chars[--start]=uni->chars[--room];
319         /* set unknown states for all but the normalization boundaries */
320         uni->states[start]=UITER_NO_STATE;
321     } while(room>0);
322     uni->states[start]=uni->state=uiter_getState(iter);
323     uni->hasPrevious=iter->hasPrevious(iter);
324     api->start=start;
325     return TRUE;
326 }
327 
328 /* Iterator runtime API functions ------------------------------------------- */
329 
330 static int32_t U_CALLCONV
unormIteratorGetIndex(UCharIterator * api,UCharIteratorOrigin origin)331 unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
332     switch(origin) {
333     case UITER_ZERO:
334     case UITER_START:
335         return 0;
336     case UITER_CURRENT:
337     case UITER_LIMIT:
338     case UITER_LENGTH:
339         return UITER_UNKNOWN_INDEX;
340     default:
341         /* not a valid origin */
342         /* Should never get here! */
343         return -1;
344     }
345 }
346 
347 static int32_t U_CALLCONV
unormIteratorMove(UCharIterator * api,int32_t delta,UCharIteratorOrigin origin)348 unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
349     UNormIterator *uni=(UNormIterator *)api;
350     UCharIterator *iter=uni->iter;
351     int32_t pos;
352 
353     switch(origin) {
354     case UITER_ZERO:
355     case UITER_START:
356         /* restart from the beginning */
357         if(uni->hasPrevious) {
358             iter->move(iter, 0, UITER_START);
359             api->start=api->index=api->limit=0;
360             uni->states[api->limit]=uni->state=uiter_getState(iter);
361             uni->hasPrevious=FALSE;
362             uni->hasNext=iter->hasNext(iter);
363         } else {
364             /* we already have the beginning of the normalized text */
365             api->index=api->start;
366         }
367         break;
368     case UITER_CURRENT:
369         break;
370     case UITER_LIMIT:
371     case UITER_LENGTH:
372         /* restart from the end */
373         if(uni->hasNext) {
374             iter->move(iter, 0, UITER_LIMIT);
375             api->start=api->index=api->limit=uni->capacity;
376             uni->states[api->limit]=uni->state=uiter_getState(iter);
377             uni->hasPrevious=iter->hasPrevious(iter);
378             uni->hasNext=FALSE;
379         } else {
380             /* we already have the end of the normalized text */
381             api->index=api->limit;
382         }
383         break;
384     default:
385         return -1;  /* Error */
386     }
387 
388     /* move relative to the current position by delta normalized UChars */
389     if(delta==0) {
390         /* nothing to do */
391     } else if(delta>0) {
392         /* go forward until the requested position is in the buffer */
393         for(;;) {
394             pos=api->index+delta;   /* requested position */
395             delta=pos-api->limit;   /* remainder beyond buffered text */
396             if(delta<=0) {
397                 api->index=pos;     /* position reached */
398                 break;
399             }
400 
401             /* go to end of buffer and normalize further */
402             api->index=api->limit;
403             if(!uni->hasNext || !readNext(uni, iter)) {
404                 break;              /* reached end of text */
405             }
406         }
407     } else /* delta<0 */ {
408         /* go backward until the requested position is in the buffer */
409         for(;;) {
410             pos=api->index+delta;   /* requested position */
411             delta=pos-api->start;   /* remainder beyond buffered text */
412             if(delta>=0) {
413                 api->index=pos;     /* position reached */
414                 break;
415             }
416 
417             /* go to start of buffer and normalize further */
418             api->index=api->start;
419             if(!uni->hasPrevious || !readPrevious(uni, iter)) {
420                 break;              /* reached start of text */
421             }
422         }
423     }
424 
425     if(api->index==api->start && !uni->hasPrevious) {
426         return 0;
427     } else {
428         return UITER_UNKNOWN_INDEX;
429     }
430 }
431 
432 static UBool U_CALLCONV
unormIteratorHasNext(UCharIterator * api)433 unormIteratorHasNext(UCharIterator *api) {
434     return api->index<api->limit || ((UNormIterator *)api)->hasNext;
435 }
436 
437 static UBool U_CALLCONV
unormIteratorHasPrevious(UCharIterator * api)438 unormIteratorHasPrevious(UCharIterator *api) {
439     return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
440 }
441 
442 static UChar32 U_CALLCONV
unormIteratorCurrent(UCharIterator * api)443 unormIteratorCurrent(UCharIterator *api) {
444     UNormIterator *uni=(UNormIterator *)api;
445 
446     if( api->index<api->limit ||
447         (uni->hasNext && readNext(uni, uni->iter))
448     ) {
449         return uni->chars[api->index];
450     } else {
451         return U_SENTINEL;
452     }
453 }
454 
455 static UChar32 U_CALLCONV
unormIteratorNext(UCharIterator * api)456 unormIteratorNext(UCharIterator *api) {
457     UNormIterator *uni=(UNormIterator *)api;
458 
459     if( api->index<api->limit ||
460         (uni->hasNext && readNext(uni, uni->iter))
461     ) {
462         return uni->chars[api->index++];
463     } else {
464         return U_SENTINEL;
465     }
466 }
467 
468 static UChar32 U_CALLCONV
unormIteratorPrevious(UCharIterator * api)469 unormIteratorPrevious(UCharIterator *api) {
470     UNormIterator *uni=(UNormIterator *)api;
471 
472     if( api->index>api->start ||
473         (uni->hasPrevious && readPrevious(uni, uni->iter))
474     ) {
475         return uni->chars[--api->index];
476     } else {
477         return U_SENTINEL;
478     }
479 }
480 
481 static uint32_t U_CALLCONV
unormIteratorGetState(const UCharIterator * api)482 unormIteratorGetState(const UCharIterator *api) {
483     /* not uni->state because that may not be at api->index */
484     return ((UNormIterator *)api)->states[api->index];
485 }
486 
487 static void U_CALLCONV
unormIteratorSetState(UCharIterator * api,uint32_t state,UErrorCode * pErrorCode)488 unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
489     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
490         /* do nothing */
491     } else if(api==NULL) {
492         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
493     } else if(state==UITER_NO_STATE) {
494         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
495     } else {
496         UNormIterator *uni=(UNormIterator *)api;
497         UCharIterator *iter=((UNormIterator *)api)->iter;
498         if(state!=uni->state) {
499             uni->state=state;
500             uiter_setState(iter, state, pErrorCode);
501         }
502 
503         /*
504          * Try shortcuts: If the requested state is in the array contents
505          * then just set the index there.
506          *
507          * We assume that the state is unique per position!
508          */
509         if(state==uni->states[api->index]) {
510             return;
511         } else if(state==uni->states[api->limit]) {
512             api->index=api->limit;
513             return;
514         } else {
515             /* search for the index with this state */
516             int32_t i;
517 
518             for(i=api->start; i<api->limit; ++i) {
519                 if(state==uni->states[i]) {
520                     api->index=i;
521                     return;
522                 }
523             }
524         }
525 
526         /* there is no array index for this state, reset for fresh contents */
527         initIndexes((UNormIterator *)api, iter);
528         uni->states[api->limit]=state;
529     }
530 }
531 
532 static const UCharIterator unormIterator={
533     NULL, 0, 0, 0, 0, 0,
534     unormIteratorGetIndex,
535     unormIteratorMove,
536     unormIteratorHasNext,
537     unormIteratorHasPrevious,
538     unormIteratorCurrent,
539     unormIteratorNext,
540     unormIteratorPrevious,
541     NULL,
542     unormIteratorGetState,
543     unormIteratorSetState
544 };
545 
546 /* Setup functions ---------------------------------------------------------- */
547 
548 U_CAPI UNormIterator * U_EXPORT2
unorm_openIter(void * stackMem,int32_t stackMemSize,UErrorCode * pErrorCode)549 unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
550     UNormIterator *uni;
551 
552     /* argument checking */
553     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
554         return NULL;
555     }
556 
557     /* allocate */
558     uni=NULL;
559     if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) {
560         if(U_ALIGNMENT_OFFSET(stackMem)==0) {
561             /* already aligned */
562             uni=(UNormIterator *)stackMem;
563         } else {
564             int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
565             if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
566                 /* needs alignment */
567                 uni=(UNormIterator *)((char *)stackMem+align);
568             }
569         }
570         /* else does not fit */
571     }
572 
573     if(uni!=NULL) {
574         uni->isStackAllocated=TRUE;
575     } else {
576         uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
577         if(uni==NULL) {
578             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
579             return NULL;
580         }
581         uni->isStackAllocated=FALSE;
582     }
583 
584     /*
585      * initialize
586      * do not memset because that would unnecessarily initialize the arrays
587      */
588     uni->iter=NULL;
589     uni->chars=uni->charsBuffer;
590     uni->states=uni->statesBuffer;
591     uni->capacity=INITIAL_CAPACITY;
592     uni->state=UITER_NO_STATE;
593     uni->hasPrevious=uni->hasNext=FALSE;
594     uni->mode=UNORM_NONE;
595 
596     /* set a no-op iterator into the api */
597     uiter_setString(&uni->api, NULL, 0);
598     return uni;
599 }
600 
601 U_CAPI void U_EXPORT2
unorm_closeIter(UNormIterator * uni)602 unorm_closeIter(UNormIterator *uni) {
603     if(uni!=NULL) {
604         if(uni->states!=uni->statesBuffer) {
605             /* chars and states are allocated in the same memory block */
606             uprv_free(uni->states);
607         }
608         if(!uni->isStackAllocated) {
609             uprv_free(uni);
610         }
611     }
612 }
613 
614 U_CAPI UCharIterator * U_EXPORT2
unorm_setIter(UNormIterator * uni,UCharIterator * iter,UNormalizationMode mode,UErrorCode * pErrorCode)615 unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
616     /* argument checking */
617     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
618         return NULL;
619     }
620     if(uni==NULL) {
621         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
622         return NULL;
623     }
624     if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
625         mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
626     ) {
627         /* set a no-op iterator into the api */
628         uiter_setString(&uni->api, NULL, 0);
629         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
630         return NULL;
631     }
632 
633     /* set the iterator and initialize */
634     uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
635 
636     uni->iter=iter;
637     uni->mode=mode;
638 
639     initIndexes(uni, iter);
640     uni->states[uni->api.limit]=uni->state=uiter_getState(iter);
641 
642     return &uni->api;
643 }
644 
645 #endif /* uconfig.h switches */
646