• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2003-2011, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  unorm_it.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2003jan21
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
20 
21 #include "unicode/uiter.h"
22 #include "unicode/unorm.h"
23 #include "unicode/utf.h"
24 #include "unorm_it.h"
25 #include "cmemory.h"
26 
27 /* UNormIterator ------------------------------------------------------------ */
28 
29 enum {
30     INITIAL_CAPACITY=100
31 };
32 
33 struct UNormIterator {
34     UCharIterator api;
35     UCharIterator *iter;
36 
37     /*
38      * chars and states either use the static buffers
39      * or are allocated in the same memory block
40      *
41      * They are parallel arrays with states[] holding the getState() values
42      * from normalization boundaries, and UITER_NO_STATE in between.
43      */
44     UChar *chars;
45     uint32_t *states;
46 
47     /*
48      * api.start: first valid character & state in the arrays
49      * api.index: current position
50      * api.limit: one past the last valid character in chars[], but states[limit] is valid
51      * capacity: length of allocated arrays
52      */
53     int32_t capacity;
54 
55     /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
56     uint32_t state;
57 
58     /* there are UChars available before start or after limit? */
59     UBool hasPrevious, hasNext, isStackAllocated;
60 
61     UNormalizationMode mode;
62 
63     UChar charsBuffer[INITIAL_CAPACITY];
64     uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
65 };
66 
67 static void
initIndexes(UNormIterator * uni,UCharIterator * iter)68 initIndexes(UNormIterator *uni, UCharIterator *iter) {
69     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
70     UCharIterator *api=&uni->api;
71 
72     if(!iter->hasPrevious(iter)) {
73         /* set indexes to the beginning of the arrays */
74         api->start=api->index=api->limit=0;
75         uni->hasPrevious=FALSE;
76         uni->hasNext=iter->hasNext(iter);
77     } else if(!iter->hasNext(iter)) {
78         /* set indexes to the end of the arrays */
79         api->start=api->index=api->limit=uni->capacity;
80         uni->hasNext=FALSE;
81         uni->hasPrevious=iter->hasPrevious(iter);
82     } else {
83         /* set indexes into the middle of the arrays */
84         api->start=api->index=api->limit=uni->capacity/2;
85         uni->hasPrevious=uni->hasNext=TRUE;
86     }
87 }
88 
89 static UBool
reallocArrays(UNormIterator * uni,int32_t capacity,UBool addAtStart)90 reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
91     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
92     UCharIterator *api=&uni->api;
93 
94     uint32_t *states;
95     UChar *chars;
96     int32_t start, limit;
97 
98     states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2);
99     if(states==NULL) {
100         return FALSE;
101     }
102 
103     chars=(UChar *)(states+(capacity+1));
104     uni->capacity=capacity;
105 
106     start=api->start;
107     limit=api->limit;
108 
109     if(addAtStart) {
110         /* copy old contents to the end of the new arrays */
111         int32_t delta;
112 
113         delta=capacity-uni->capacity;
114         uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4);
115         uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4);
116 
117         api->start=start+delta;
118         api->index+=delta;
119         api->limit=limit+delta;
120     } else {
121         /* copy old contents to the beginning of the new arrays */
122         uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4);
123         uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4);
124     }
125 
126     uni->chars=chars;
127     uni->states=states;
128 
129     return TRUE;
130 }
131 
132 static void
moveContentsTowardStart(UCharIterator * api,UChar chars[],uint32_t states[],int32_t delta)133 moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
134     /* move array contents up to make room */
135     int32_t srcIndex, destIndex, limit;
136 
137     limit=api->limit;
138     srcIndex=delta;
139     if(srcIndex>api->start) {
140         /* look for a position in the arrays with a known state */
141         while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
142             ++srcIndex;
143         }
144     }
145 
146     /* now actually move the array contents */
147     api->start=destIndex=0;
148     while(srcIndex<limit) {
149         chars[destIndex]=chars[srcIndex];
150         states[destIndex++]=states[srcIndex++];
151     }
152 
153     /* copy states[limit] as well! */
154     states[destIndex]=states[srcIndex];
155 
156     api->limit=destIndex;
157 }
158 
159 static void
moveContentsTowardEnd(UCharIterator * api,UChar chars[],uint32_t states[],int32_t delta)160 moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
161     /* move array contents up to make room */
162     int32_t srcIndex, destIndex, start;
163 
164     start=api->start;
165     destIndex=((UNormIterator *)api)->capacity;
166     srcIndex=destIndex-delta;
167     if(srcIndex<api->limit) {
168         /* look for a position in the arrays with a known state */
169         while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
170             --srcIndex;
171         }
172     }
173 
174     /* now actually move the array contents */
175     api->limit=destIndex;
176 
177     /* copy states[limit] as well! */
178     states[destIndex]=states[srcIndex];
179 
180     while(srcIndex>start) {
181         chars[--destIndex]=chars[--srcIndex];
182         states[destIndex]=states[srcIndex];
183     }
184 
185     api->start=destIndex;
186 }
187 
188 /* normalize forward from the limit, assume hasNext is true */
189 static UBool
readNext(UNormIterator * uni,UCharIterator * iter)190 readNext(UNormIterator *uni, UCharIterator *iter) {
191     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
192     UCharIterator *api=&uni->api;
193 
194     /* make capacity/4 room at the end of the arrays */
195     int32_t limit, capacity, room;
196     UErrorCode errorCode;
197 
198     limit=api->limit;
199     capacity=uni->capacity;
200     room=capacity/4;
201     if(room>(capacity-limit)) {
202         /* move array contents to make room */
203         moveContentsTowardStart(api, uni->chars, uni->states, room);
204         api->index=limit=api->limit;
205         uni->hasPrevious=TRUE;
206     }
207 
208     /* normalize starting from the limit position */
209     errorCode=U_ZERO_ERROR;
210     if(uni->state!=uni->states[limit]) {
211         uiter_setState(iter, uni->states[limit], &errorCode);
212         if(U_FAILURE(errorCode)) {
213             uni->state=UITER_NO_STATE;
214             uni->hasNext=FALSE;
215             return FALSE;
216         }
217     }
218 
219     room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
220     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
221         if(room<=capacity) {
222             /* empty and re-use the arrays */
223             uni->states[0]=uni->states[limit];
224             api->start=api->index=api->limit=limit=0;
225             uni->hasPrevious=TRUE;
226         } else {
227             capacity+=room+100;
228             if(!reallocArrays(uni, capacity, FALSE)) {
229                 uni->state=UITER_NO_STATE;
230                 uni->hasNext=FALSE;
231                 return FALSE;
232             }
233             limit=api->limit;
234         }
235 
236         errorCode=U_ZERO_ERROR;
237         uiter_setState(iter, uni->states[limit], &errorCode);
238         room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
239     }
240     if(U_FAILURE(errorCode) || room==0) {
241         uni->state=UITER_NO_STATE;
242         uni->hasNext=FALSE;
243         return FALSE;
244     }
245 
246     /* room>0 */
247     ++limit; /* leave the known states[limit] alone */
248     for(--room; room>0; --room) {
249         /* set unknown states for all but the normalization boundaries */
250         uni->states[limit++]=UITER_NO_STATE;
251     }
252     uni->states[limit]=uni->state=uiter_getState(iter);
253     uni->hasNext=iter->hasNext(iter);
254     api->limit=limit;
255     return TRUE;
256 }
257 
258 /* normalize backward from the start, assume hasPrevious is true */
259 static UBool
readPrevious(UNormIterator * uni,UCharIterator * iter)260 readPrevious(UNormIterator *uni, UCharIterator *iter) {
261     /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
262     UCharIterator *api=&uni->api;
263 
264     /* make capacity/4 room at the start of the arrays */
265     int32_t start, capacity, room;
266     UErrorCode errorCode;
267 
268     start=api->start;
269     capacity=uni->capacity;
270     room=capacity/4;
271     if(room>start) {
272         /* move array contents to make room */
273         moveContentsTowardEnd(api, uni->chars, uni->states, room);
274         api->index=start=api->start;
275         uni->hasNext=TRUE;
276     }
277 
278     /* normalize ending at the start position */
279     errorCode=U_ZERO_ERROR;
280     if(uni->state!=uni->states[start]) {
281         uiter_setState(iter, uni->states[start], &errorCode);
282         if(U_FAILURE(errorCode)) {
283             uni->state=UITER_NO_STATE;
284             uni->hasPrevious=FALSE;
285             return FALSE;
286         }
287     }
288 
289     room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
290     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
291         if(room<=capacity) {
292             /* empty and re-use the arrays */
293             uni->states[capacity]=uni->states[start];
294             api->start=api->index=api->limit=start=capacity;
295             uni->hasNext=TRUE;
296         } else {
297             capacity+=room+100;
298             if(!reallocArrays(uni, capacity, TRUE)) {
299                 uni->state=UITER_NO_STATE;
300                 uni->hasPrevious=FALSE;
301                 return FALSE;
302             }
303             start=api->start;
304         }
305 
306         errorCode=U_ZERO_ERROR;
307         uiter_setState(iter, uni->states[start], &errorCode);
308         room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
309     }
310     if(U_FAILURE(errorCode) || room==0) {
311         uni->state=UITER_NO_STATE;
312         uni->hasPrevious=FALSE;
313         return FALSE;
314     }
315 
316     /* room>0 */
317     do {
318         /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
319         uni->chars[--start]=uni->chars[--room];
320         /* set unknown states for all but the normalization boundaries */
321         uni->states[start]=UITER_NO_STATE;
322     } while(room>0);
323     uni->states[start]=uni->state=uiter_getState(iter);
324     uni->hasPrevious=iter->hasPrevious(iter);
325     api->start=start;
326     return TRUE;
327 }
328 
329 /* Iterator runtime API functions ------------------------------------------- */
330 
331 static int32_t U_CALLCONV
unormIteratorGetIndex(UCharIterator * api,UCharIteratorOrigin origin)332 unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
333     switch(origin) {
334     case UITER_ZERO:
335     case UITER_START:
336         return 0;
337     case UITER_CURRENT:
338     case UITER_LIMIT:
339     case UITER_LENGTH:
340         return UITER_UNKNOWN_INDEX;
341     default:
342         /* not a valid origin */
343         /* Should never get here! */
344         return -1;
345     }
346 }
347 
348 static int32_t U_CALLCONV
unormIteratorMove(UCharIterator * api,int32_t delta,UCharIteratorOrigin origin)349 unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
350     UNormIterator *uni=(UNormIterator *)api;
351     UCharIterator *iter=uni->iter;
352     int32_t pos;
353 
354     switch(origin) {
355     case UITER_ZERO:
356     case UITER_START:
357         /* restart from the beginning */
358         if(uni->hasPrevious) {
359             iter->move(iter, 0, UITER_START);
360             api->start=api->index=api->limit=0;
361             uni->states[api->limit]=uni->state=uiter_getState(iter);
362             uni->hasPrevious=FALSE;
363             uni->hasNext=iter->hasNext(iter);
364         } else {
365             /* we already have the beginning of the normalized text */
366             api->index=api->start;
367         }
368         break;
369     case UITER_CURRENT:
370         break;
371     case UITER_LIMIT:
372     case UITER_LENGTH:
373         /* restart from the end */
374         if(uni->hasNext) {
375             iter->move(iter, 0, UITER_LIMIT);
376             api->start=api->index=api->limit=uni->capacity;
377             uni->states[api->limit]=uni->state=uiter_getState(iter);
378             uni->hasPrevious=iter->hasPrevious(iter);
379             uni->hasNext=FALSE;
380         } else {
381             /* we already have the end of the normalized text */
382             api->index=api->limit;
383         }
384         break;
385     default:
386         return -1;  /* Error */
387     }
388 
389     /* move relative to the current position by delta normalized UChars */
390     if(delta==0) {
391         /* nothing to do */
392     } else if(delta>0) {
393         /* go forward until the requested position is in the buffer */
394         for(;;) {
395             pos=api->index+delta;   /* requested position */
396             delta=pos-api->limit;   /* remainder beyond buffered text */
397             if(delta<=0) {
398                 api->index=pos;     /* position reached */
399                 break;
400             }
401 
402             /* go to end of buffer and normalize further */
403             api->index=api->limit;
404             if(!uni->hasNext || !readNext(uni, iter)) {
405                 break;              /* reached end of text */
406             }
407         }
408     } else /* delta<0 */ {
409         /* go backward until the requested position is in the buffer */
410         for(;;) {
411             pos=api->index+delta;   /* requested position */
412             delta=pos-api->start;   /* remainder beyond buffered text */
413             if(delta>=0) {
414                 api->index=pos;     /* position reached */
415                 break;
416             }
417 
418             /* go to start of buffer and normalize further */
419             api->index=api->start;
420             if(!uni->hasPrevious || !readPrevious(uni, iter)) {
421                 break;              /* reached start of text */
422             }
423         }
424     }
425 
426     if(api->index==api->start && !uni->hasPrevious) {
427         return 0;
428     } else {
429         return UITER_UNKNOWN_INDEX;
430     }
431 }
432 
433 static UBool U_CALLCONV
unormIteratorHasNext(UCharIterator * api)434 unormIteratorHasNext(UCharIterator *api) {
435     return api->index<api->limit || ((UNormIterator *)api)->hasNext;
436 }
437 
438 static UBool U_CALLCONV
unormIteratorHasPrevious(UCharIterator * api)439 unormIteratorHasPrevious(UCharIterator *api) {
440     return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
441 }
442 
443 static UChar32 U_CALLCONV
unormIteratorCurrent(UCharIterator * api)444 unormIteratorCurrent(UCharIterator *api) {
445     UNormIterator *uni=(UNormIterator *)api;
446 
447     if( api->index<api->limit ||
448         (uni->hasNext && readNext(uni, uni->iter))
449     ) {
450         return uni->chars[api->index];
451     } else {
452         return U_SENTINEL;
453     }
454 }
455 
456 static UChar32 U_CALLCONV
unormIteratorNext(UCharIterator * api)457 unormIteratorNext(UCharIterator *api) {
458     UNormIterator *uni=(UNormIterator *)api;
459 
460     if( api->index<api->limit ||
461         (uni->hasNext && readNext(uni, uni->iter))
462     ) {
463         return uni->chars[api->index++];
464     } else {
465         return U_SENTINEL;
466     }
467 }
468 
469 static UChar32 U_CALLCONV
unormIteratorPrevious(UCharIterator * api)470 unormIteratorPrevious(UCharIterator *api) {
471     UNormIterator *uni=(UNormIterator *)api;
472 
473     if( api->index>api->start ||
474         (uni->hasPrevious && readPrevious(uni, uni->iter))
475     ) {
476         return uni->chars[--api->index];
477     } else {
478         return U_SENTINEL;
479     }
480 }
481 
482 static uint32_t U_CALLCONV
unormIteratorGetState(const UCharIterator * api)483 unormIteratorGetState(const UCharIterator *api) {
484     /* not uni->state because that may not be at api->index */
485     return ((UNormIterator *)api)->states[api->index];
486 }
487 
488 static void U_CALLCONV
unormIteratorSetState(UCharIterator * api,uint32_t state,UErrorCode * pErrorCode)489 unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
490     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
491         /* do nothing */
492     } else if(api==NULL) {
493         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
494     } else if(state==UITER_NO_STATE) {
495         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
496     } else {
497         UNormIterator *uni=(UNormIterator *)api;
498         UCharIterator *iter=((UNormIterator *)api)->iter;
499         if(state!=uni->state) {
500             uni->state=state;
501             uiter_setState(iter, state, pErrorCode);
502         }
503 
504         /*
505          * Try shortcuts: If the requested state is in the array contents
506          * then just set the index there.
507          *
508          * We assume that the state is unique per position!
509          */
510         if(state==uni->states[api->index]) {
511             return;
512         } else if(state==uni->states[api->limit]) {
513             api->index=api->limit;
514             return;
515         } else {
516             /* search for the index with this state */
517             int32_t i;
518 
519             for(i=api->start; i<api->limit; ++i) {
520                 if(state==uni->states[i]) {
521                     api->index=i;
522                     return;
523                 }
524             }
525         }
526 
527         /* there is no array index for this state, reset for fresh contents */
528         initIndexes((UNormIterator *)api, iter);
529         uni->states[api->limit]=state;
530     }
531 }
532 
533 static const UCharIterator unormIterator={
534     NULL, 0, 0, 0, 0, 0,
535     unormIteratorGetIndex,
536     unormIteratorMove,
537     unormIteratorHasNext,
538     unormIteratorHasPrevious,
539     unormIteratorCurrent,
540     unormIteratorNext,
541     unormIteratorPrevious,
542     NULL,
543     unormIteratorGetState,
544     unormIteratorSetState
545 };
546 
547 /* Setup functions ---------------------------------------------------------- */
548 
549 U_CAPI UNormIterator * U_EXPORT2
unorm_openIter(void * stackMem,int32_t stackMemSize,UErrorCode * pErrorCode)550 unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
551     UNormIterator *uni;
552 
553     /* argument checking */
554     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
555         return NULL;
556     }
557 
558     /* allocate */
559     uni=NULL;
560     if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) {
561         if(U_ALIGNMENT_OFFSET(stackMem)==0) {
562             /* already aligned */
563             uni=(UNormIterator *)stackMem;
564         } else {
565             int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
566             if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
567                 /* needs alignment */
568                 uni=(UNormIterator *)((char *)stackMem+align);
569             }
570         }
571         /* else does not fit */
572     }
573 
574     if(uni!=NULL) {
575         uni->isStackAllocated=TRUE;
576     } else {
577         uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
578         if(uni==NULL) {
579             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
580             return NULL;
581         }
582         uni->isStackAllocated=FALSE;
583     }
584 
585     /*
586      * initialize
587      * do not memset because that would unnecessarily initialize the arrays
588      */
589     uni->iter=NULL;
590     uni->chars=uni->charsBuffer;
591     uni->states=uni->statesBuffer;
592     uni->capacity=INITIAL_CAPACITY;
593     uni->state=UITER_NO_STATE;
594     uni->hasPrevious=uni->hasNext=FALSE;
595     uni->mode=UNORM_NONE;
596 
597     /* set a no-op iterator into the api */
598     uiter_setString(&uni->api, NULL, 0);
599     return uni;
600 }
601 
602 U_CAPI void U_EXPORT2
unorm_closeIter(UNormIterator * uni)603 unorm_closeIter(UNormIterator *uni) {
604     if(uni!=NULL) {
605         if(uni->states!=uni->statesBuffer) {
606             /* chars and states are allocated in the same memory block */
607             uprv_free(uni->states);
608         }
609         if(!uni->isStackAllocated) {
610             uprv_free(uni);
611         }
612     }
613 }
614 
615 U_CAPI UCharIterator * U_EXPORT2
unorm_setIter(UNormIterator * uni,UCharIterator * iter,UNormalizationMode mode,UErrorCode * pErrorCode)616 unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
617     /* argument checking */
618     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
619         return NULL;
620     }
621     if(uni==NULL) {
622         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
623         return NULL;
624     }
625     if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
626         mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
627     ) {
628         /* set a no-op iterator into the api */
629         uiter_setString(&uni->api, NULL, 0);
630         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
631         return NULL;
632     }
633 
634     /* set the iterator and initialize */
635     uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
636 
637     uni->iter=iter;
638     uni->mode=mode;
639 
640     initIndexes(uni, iter);
641     uni->states[uni->api.limit]=uni->state=uiter_getState(iter);
642 
643     return &uni->api;
644 }
645 
646 #endif /* uconfig.h switches */
647