1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: unorm_it.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003jan21
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/uiter.h"
22 #include "unicode/unorm.h"
23 #include "unorm_it.h"
24 #include "cmemory.h"
25
26 /* UNormIterator ------------------------------------------------------------ */
27
28 enum {
29 INITIAL_CAPACITY=100
30 };
31
32 struct UNormIterator {
33 UCharIterator api;
34 UCharIterator *iter;
35
36 /*
37 * chars and states either use the static buffers
38 * or are allocated in the same memory block
39 *
40 * They are parallel arrays with states[] holding the getState() values
41 * from normalization boundaries, and UITER_NO_STATE in between.
42 */
43 UChar *chars;
44 uint32_t *states;
45
46 /*
47 * api.start: first valid character & state in the arrays
48 * api.index: current position
49 * api.limit: one past the last valid character in chars[], but states[limit] is valid
50 * capacity: length of allocated arrays
51 */
52 int32_t capacity;
53
54 /* the current iter->getState(), saved to avoid unnecessary setState() calls; may not correspond to api->index! */
55 uint32_t state;
56
57 /* there are UChars available before start or after limit? */
58 UBool hasPrevious, hasNext, isStackAllocated;
59
60 UNormalizationMode mode;
61
62 UChar charsBuffer[INITIAL_CAPACITY];
63 uint32_t statesBuffer[INITIAL_CAPACITY+1]; /* one more than charsBuffer[]! */
64 };
65
66 static void
initIndexes(UNormIterator * uni,UCharIterator * iter)67 initIndexes(UNormIterator *uni, UCharIterator *iter) {
68 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
69 UCharIterator *api=&uni->api;
70
71 if(!iter->hasPrevious(iter)) {
72 /* set indexes to the beginning of the arrays */
73 api->start=api->index=api->limit=0;
74 uni->hasPrevious=FALSE;
75 uni->hasNext=iter->hasNext(iter);
76 } else if(!iter->hasNext(iter)) {
77 /* set indexes to the end of the arrays */
78 api->start=api->index=api->limit=uni->capacity;
79 uni->hasNext=FALSE;
80 uni->hasPrevious=iter->hasPrevious(iter);
81 } else {
82 /* set indexes into the middle of the arrays */
83 api->start=api->index=api->limit=uni->capacity/2;
84 uni->hasPrevious=uni->hasNext=TRUE;
85 }
86 }
87
88 static UBool
reallocArrays(UNormIterator * uni,int32_t capacity,UBool addAtStart)89 reallocArrays(UNormIterator *uni, int32_t capacity, UBool addAtStart) {
90 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
91 UCharIterator *api=&uni->api;
92
93 uint32_t *states;
94 UChar *chars;
95 int32_t start, limit;
96
97 states=(uint32_t *)uprv_malloc((capacity+1)*4+capacity*2);
98 if(states==NULL) {
99 return FALSE;
100 }
101
102 chars=(UChar *)(states+(capacity+1));
103 uni->capacity=capacity;
104
105 start=api->start;
106 limit=api->limit;
107
108 if(addAtStart) {
109 /* copy old contents to the end of the new arrays */
110 int32_t delta;
111
112 delta=capacity-uni->capacity;
113 uprv_memcpy(states+delta+start, uni->states+start, (limit-start+1)*4);
114 uprv_memcpy(chars+delta+start, uni->chars+start, (limit-start)*4);
115
116 api->start=start+delta;
117 api->index+=delta;
118 api->limit=limit+delta;
119 } else {
120 /* copy old contents to the beginning of the new arrays */
121 uprv_memcpy(states+start, uni->states+start, (limit-start+1)*4);
122 uprv_memcpy(chars+start, uni->chars+start, (limit-start)*4);
123 }
124
125 uni->chars=chars;
126 uni->states=states;
127
128 return TRUE;
129 }
130
131 static void
moveContentsTowardStart(UCharIterator * api,UChar chars[],uint32_t states[],int32_t delta)132 moveContentsTowardStart(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
133 /* move array contents up to make room */
134 int32_t srcIndex, destIndex, limit;
135
136 limit=api->limit;
137 srcIndex=delta;
138 if(srcIndex>api->start) {
139 /* look for a position in the arrays with a known state */
140 while(srcIndex<limit && states[srcIndex]==UITER_NO_STATE) {
141 ++srcIndex;
142 }
143 }
144
145 /* now actually move the array contents */
146 api->start=destIndex=0;
147 while(srcIndex<limit) {
148 chars[destIndex]=chars[srcIndex];
149 states[destIndex++]=states[srcIndex++];
150 }
151
152 /* copy states[limit] as well! */
153 states[destIndex]=states[srcIndex];
154
155 api->limit=destIndex;
156 }
157
158 static void
moveContentsTowardEnd(UCharIterator * api,UChar chars[],uint32_t states[],int32_t delta)159 moveContentsTowardEnd(UCharIterator *api, UChar chars[], uint32_t states[], int32_t delta) {
160 /* move array contents up to make room */
161 int32_t srcIndex, destIndex, start;
162
163 start=api->start;
164 destIndex=((UNormIterator *)api)->capacity;
165 srcIndex=destIndex-delta;
166 if(srcIndex<api->limit) {
167 /* look for a position in the arrays with a known state */
168 while(srcIndex>start && states[srcIndex]==UITER_NO_STATE) {
169 --srcIndex;
170 }
171 }
172
173 /* now actually move the array contents */
174 api->limit=destIndex;
175
176 /* copy states[limit] as well! */
177 states[destIndex]=states[srcIndex];
178
179 while(srcIndex>start) {
180 chars[--destIndex]=chars[--srcIndex];
181 states[destIndex]=states[srcIndex];
182 }
183
184 api->start=destIndex;
185 }
186
187 /* normalize forward from the limit, assume hasNext is true */
188 static UBool
readNext(UNormIterator * uni,UCharIterator * iter)189 readNext(UNormIterator *uni, UCharIterator *iter) {
190 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
191 UCharIterator *api=&uni->api;
192
193 /* make capacity/4 room at the end of the arrays */
194 int32_t limit, capacity, room;
195 UErrorCode errorCode;
196
197 limit=api->limit;
198 capacity=uni->capacity;
199 room=capacity/4;
200 if(room>(capacity-limit)) {
201 /* move array contents to make room */
202 moveContentsTowardStart(api, uni->chars, uni->states, room);
203 api->index=limit=api->limit;
204 uni->hasPrevious=TRUE;
205 }
206
207 /* normalize starting from the limit position */
208 errorCode=U_ZERO_ERROR;
209 if(uni->state!=uni->states[limit]) {
210 uiter_setState(iter, uni->states[limit], &errorCode);
211 if(U_FAILURE(errorCode)) {
212 uni->state=UITER_NO_STATE;
213 uni->hasNext=FALSE;
214 return FALSE;
215 }
216 }
217
218 room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
219 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
220 if(room<=capacity) {
221 /* empty and re-use the arrays */
222 uni->states[0]=uni->states[limit];
223 api->start=api->index=api->limit=limit=0;
224 uni->hasPrevious=TRUE;
225 } else {
226 capacity+=room+100;
227 if(!reallocArrays(uni, capacity, FALSE)) {
228 uni->state=UITER_NO_STATE;
229 uni->hasNext=FALSE;
230 return FALSE;
231 }
232 limit=api->limit;
233 }
234
235 errorCode=U_ZERO_ERROR;
236 uiter_setState(iter, uni->states[limit], &errorCode);
237 room=unorm_next(iter, uni->chars+limit, capacity-limit, uni->mode, 0, TRUE, NULL, &errorCode);
238 }
239 if(U_FAILURE(errorCode) || room==0) {
240 uni->state=UITER_NO_STATE;
241 uni->hasNext=FALSE;
242 return FALSE;
243 }
244
245 /* room>0 */
246 ++limit; /* leave the known states[limit] alone */
247 for(--room; room>0; --room) {
248 /* set unknown states for all but the normalization boundaries */
249 uni->states[limit++]=UITER_NO_STATE;
250 }
251 uni->states[limit]=uni->state=uiter_getState(iter);
252 uni->hasNext=iter->hasNext(iter);
253 api->limit=limit;
254 return TRUE;
255 }
256
257 /* normalize backward from the start, assume hasPrevious is true */
258 static UBool
readPrevious(UNormIterator * uni,UCharIterator * iter)259 readPrevious(UNormIterator *uni, UCharIterator *iter) {
260 /* do not pass api so that the compiler knows it's an alias pointer to uni itself */
261 UCharIterator *api=&uni->api;
262
263 /* make capacity/4 room at the start of the arrays */
264 int32_t start, capacity, room;
265 UErrorCode errorCode;
266
267 start=api->start;
268 capacity=uni->capacity;
269 room=capacity/4;
270 if(room>start) {
271 /* move array contents to make room */
272 moveContentsTowardEnd(api, uni->chars, uni->states, room);
273 api->index=start=api->start;
274 uni->hasNext=TRUE;
275 }
276
277 /* normalize ending at the start position */
278 errorCode=U_ZERO_ERROR;
279 if(uni->state!=uni->states[start]) {
280 uiter_setState(iter, uni->states[start], &errorCode);
281 if(U_FAILURE(errorCode)) {
282 uni->state=UITER_NO_STATE;
283 uni->hasPrevious=FALSE;
284 return FALSE;
285 }
286 }
287
288 room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
289 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
290 if(room<=capacity) {
291 /* empty and re-use the arrays */
292 uni->states[capacity]=uni->states[start];
293 api->start=api->index=api->limit=start=capacity;
294 uni->hasNext=TRUE;
295 } else {
296 capacity+=room+100;
297 if(!reallocArrays(uni, capacity, TRUE)) {
298 uni->state=UITER_NO_STATE;
299 uni->hasPrevious=FALSE;
300 return FALSE;
301 }
302 start=api->start;
303 }
304
305 errorCode=U_ZERO_ERROR;
306 uiter_setState(iter, uni->states[start], &errorCode);
307 room=unorm_previous(iter, uni->chars, start, uni->mode, 0, TRUE, NULL, &errorCode);
308 }
309 if(U_FAILURE(errorCode) || room==0) {
310 uni->state=UITER_NO_STATE;
311 uni->hasPrevious=FALSE;
312 return FALSE;
313 }
314
315 /* room>0 */
316 do {
317 /* copy the UChars from chars[0..room[ to chars[(start-room)..start[ */
318 uni->chars[--start]=uni->chars[--room];
319 /* set unknown states for all but the normalization boundaries */
320 uni->states[start]=UITER_NO_STATE;
321 } while(room>0);
322 uni->states[start]=uni->state=uiter_getState(iter);
323 uni->hasPrevious=iter->hasPrevious(iter);
324 api->start=start;
325 return TRUE;
326 }
327
328 /* Iterator runtime API functions ------------------------------------------- */
329
330 static int32_t U_CALLCONV
unormIteratorGetIndex(UCharIterator * api,UCharIteratorOrigin origin)331 unormIteratorGetIndex(UCharIterator *api, UCharIteratorOrigin origin) {
332 switch(origin) {
333 case UITER_ZERO:
334 case UITER_START:
335 return 0;
336 case UITER_CURRENT:
337 case UITER_LIMIT:
338 case UITER_LENGTH:
339 return UITER_UNKNOWN_INDEX;
340 default:
341 /* not a valid origin */
342 /* Should never get here! */
343 return -1;
344 }
345 }
346
347 static int32_t U_CALLCONV
unormIteratorMove(UCharIterator * api,int32_t delta,UCharIteratorOrigin origin)348 unormIteratorMove(UCharIterator *api, int32_t delta, UCharIteratorOrigin origin) {
349 UNormIterator *uni=(UNormIterator *)api;
350 UCharIterator *iter=uni->iter;
351 int32_t pos;
352
353 switch(origin) {
354 case UITER_ZERO:
355 case UITER_START:
356 /* restart from the beginning */
357 if(uni->hasPrevious) {
358 iter->move(iter, 0, UITER_START);
359 api->start=api->index=api->limit=0;
360 uni->states[api->limit]=uni->state=uiter_getState(iter);
361 uni->hasPrevious=FALSE;
362 uni->hasNext=iter->hasNext(iter);
363 } else {
364 /* we already have the beginning of the normalized text */
365 api->index=api->start;
366 }
367 break;
368 case UITER_CURRENT:
369 break;
370 case UITER_LIMIT:
371 case UITER_LENGTH:
372 /* restart from the end */
373 if(uni->hasNext) {
374 iter->move(iter, 0, UITER_LIMIT);
375 api->start=api->index=api->limit=uni->capacity;
376 uni->states[api->limit]=uni->state=uiter_getState(iter);
377 uni->hasPrevious=iter->hasPrevious(iter);
378 uni->hasNext=FALSE;
379 } else {
380 /* we already have the end of the normalized text */
381 api->index=api->limit;
382 }
383 break;
384 default:
385 return -1; /* Error */
386 }
387
388 /* move relative to the current position by delta normalized UChars */
389 if(delta==0) {
390 /* nothing to do */
391 } else if(delta>0) {
392 /* go forward until the requested position is in the buffer */
393 for(;;) {
394 pos=api->index+delta; /* requested position */
395 delta=pos-api->limit; /* remainder beyond buffered text */
396 if(delta<=0) {
397 api->index=pos; /* position reached */
398 break;
399 }
400
401 /* go to end of buffer and normalize further */
402 api->index=api->limit;
403 if(!uni->hasNext || !readNext(uni, iter)) {
404 break; /* reached end of text */
405 }
406 }
407 } else /* delta<0 */ {
408 /* go backward until the requested position is in the buffer */
409 for(;;) {
410 pos=api->index+delta; /* requested position */
411 delta=pos-api->start; /* remainder beyond buffered text */
412 if(delta>=0) {
413 api->index=pos; /* position reached */
414 break;
415 }
416
417 /* go to start of buffer and normalize further */
418 api->index=api->start;
419 if(!uni->hasPrevious || !readPrevious(uni, iter)) {
420 break; /* reached start of text */
421 }
422 }
423 }
424
425 if(api->index==api->start && !uni->hasPrevious) {
426 return 0;
427 } else {
428 return UITER_UNKNOWN_INDEX;
429 }
430 }
431
432 static UBool U_CALLCONV
unormIteratorHasNext(UCharIterator * api)433 unormIteratorHasNext(UCharIterator *api) {
434 return api->index<api->limit || ((UNormIterator *)api)->hasNext;
435 }
436
437 static UBool U_CALLCONV
unormIteratorHasPrevious(UCharIterator * api)438 unormIteratorHasPrevious(UCharIterator *api) {
439 return api->index>api->start || ((UNormIterator *)api)->hasPrevious;
440 }
441
442 static UChar32 U_CALLCONV
unormIteratorCurrent(UCharIterator * api)443 unormIteratorCurrent(UCharIterator *api) {
444 UNormIterator *uni=(UNormIterator *)api;
445
446 if( api->index<api->limit ||
447 (uni->hasNext && readNext(uni, uni->iter))
448 ) {
449 return uni->chars[api->index];
450 } else {
451 return U_SENTINEL;
452 }
453 }
454
455 static UChar32 U_CALLCONV
unormIteratorNext(UCharIterator * api)456 unormIteratorNext(UCharIterator *api) {
457 UNormIterator *uni=(UNormIterator *)api;
458
459 if( api->index<api->limit ||
460 (uni->hasNext && readNext(uni, uni->iter))
461 ) {
462 return uni->chars[api->index++];
463 } else {
464 return U_SENTINEL;
465 }
466 }
467
468 static UChar32 U_CALLCONV
unormIteratorPrevious(UCharIterator * api)469 unormIteratorPrevious(UCharIterator *api) {
470 UNormIterator *uni=(UNormIterator *)api;
471
472 if( api->index>api->start ||
473 (uni->hasPrevious && readPrevious(uni, uni->iter))
474 ) {
475 return uni->chars[--api->index];
476 } else {
477 return U_SENTINEL;
478 }
479 }
480
481 static uint32_t U_CALLCONV
unormIteratorGetState(const UCharIterator * api)482 unormIteratorGetState(const UCharIterator *api) {
483 /* not uni->state because that may not be at api->index */
484 return ((UNormIterator *)api)->states[api->index];
485 }
486
487 static void U_CALLCONV
unormIteratorSetState(UCharIterator * api,uint32_t state,UErrorCode * pErrorCode)488 unormIteratorSetState(UCharIterator *api, uint32_t state, UErrorCode *pErrorCode) {
489 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
490 /* do nothing */
491 } else if(api==NULL) {
492 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
493 } else if(state==UITER_NO_STATE) {
494 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
495 } else {
496 UNormIterator *uni=(UNormIterator *)api;
497 UCharIterator *iter=((UNormIterator *)api)->iter;
498 if(state!=uni->state) {
499 uni->state=state;
500 uiter_setState(iter, state, pErrorCode);
501 }
502
503 /*
504 * Try shortcuts: If the requested state is in the array contents
505 * then just set the index there.
506 *
507 * We assume that the state is unique per position!
508 */
509 if(state==uni->states[api->index]) {
510 return;
511 } else if(state==uni->states[api->limit]) {
512 api->index=api->limit;
513 return;
514 } else {
515 /* search for the index with this state */
516 int32_t i;
517
518 for(i=api->start; i<api->limit; ++i) {
519 if(state==uni->states[i]) {
520 api->index=i;
521 return;
522 }
523 }
524 }
525
526 /* there is no array index for this state, reset for fresh contents */
527 initIndexes((UNormIterator *)api, iter);
528 uni->states[api->limit]=state;
529 }
530 }
531
532 static const UCharIterator unormIterator={
533 NULL, 0, 0, 0, 0, 0,
534 unormIteratorGetIndex,
535 unormIteratorMove,
536 unormIteratorHasNext,
537 unormIteratorHasPrevious,
538 unormIteratorCurrent,
539 unormIteratorNext,
540 unormIteratorPrevious,
541 NULL,
542 unormIteratorGetState,
543 unormIteratorSetState
544 };
545
546 /* Setup functions ---------------------------------------------------------- */
547
548 U_CAPI UNormIterator * U_EXPORT2
unorm_openIter(void * stackMem,int32_t stackMemSize,UErrorCode * pErrorCode)549 unorm_openIter(void *stackMem, int32_t stackMemSize, UErrorCode *pErrorCode) {
550 UNormIterator *uni;
551
552 /* argument checking */
553 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
554 return NULL;
555 }
556
557 /* allocate */
558 uni=NULL;
559 if(stackMem!=NULL && stackMemSize>=sizeof(UNormIterator)) {
560 if(U_ALIGNMENT_OFFSET(stackMem)==0) {
561 /* already aligned */
562 uni=(UNormIterator *)stackMem;
563 } else {
564 int32_t align=(int32_t)U_ALIGNMENT_OFFSET_UP(stackMem);
565 if((stackMemSize-=align)>=(int32_t)sizeof(UNormIterator)) {
566 /* needs alignment */
567 uni=(UNormIterator *)((char *)stackMem+align);
568 }
569 }
570 /* else does not fit */
571 }
572
573 if(uni!=NULL) {
574 uni->isStackAllocated=TRUE;
575 } else {
576 uni=(UNormIterator *)uprv_malloc(sizeof(UNormIterator));
577 if(uni==NULL) {
578 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
579 return NULL;
580 }
581 uni->isStackAllocated=FALSE;
582 }
583
584 /*
585 * initialize
586 * do not memset because that would unnecessarily initialize the arrays
587 */
588 uni->iter=NULL;
589 uni->chars=uni->charsBuffer;
590 uni->states=uni->statesBuffer;
591 uni->capacity=INITIAL_CAPACITY;
592 uni->state=UITER_NO_STATE;
593 uni->hasPrevious=uni->hasNext=FALSE;
594 uni->mode=UNORM_NONE;
595
596 /* set a no-op iterator into the api */
597 uiter_setString(&uni->api, NULL, 0);
598 return uni;
599 }
600
601 U_CAPI void U_EXPORT2
unorm_closeIter(UNormIterator * uni)602 unorm_closeIter(UNormIterator *uni) {
603 if(uni!=NULL) {
604 if(uni->states!=uni->statesBuffer) {
605 /* chars and states are allocated in the same memory block */
606 uprv_free(uni->states);
607 }
608 if(!uni->isStackAllocated) {
609 uprv_free(uni);
610 }
611 }
612 }
613
614 U_CAPI UCharIterator * U_EXPORT2
unorm_setIter(UNormIterator * uni,UCharIterator * iter,UNormalizationMode mode,UErrorCode * pErrorCode)615 unorm_setIter(UNormIterator *uni, UCharIterator *iter, UNormalizationMode mode, UErrorCode *pErrorCode) {
616 /* argument checking */
617 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
618 return NULL;
619 }
620 if(uni==NULL) {
621 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
622 return NULL;
623 }
624 if( iter==NULL || iter->getState==NULL || iter->setState==NULL ||
625 mode<UNORM_NONE || UNORM_MODE_COUNT<=mode
626 ) {
627 /* set a no-op iterator into the api */
628 uiter_setString(&uni->api, NULL, 0);
629 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
630 return NULL;
631 }
632
633 /* set the iterator and initialize */
634 uprv_memcpy(&uni->api, &unormIterator, sizeof(unormIterator));
635
636 uni->iter=iter;
637 uni->mode=mode;
638
639 initIndexes(uni, iter);
640 uni->states[uni->api.limit]=uni->state=uiter_getState(iter);
641
642 return &uni->api;
643 }
644
645 #endif /* uconfig.h switches */
646