1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2005-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: utext.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2005apr12
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/chariter.h"
21 #include "unicode/utext.h"
22 #include "ustr_imp.h"
23 #include "cmemory.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "putilimp.h"
27
28 U_NAMESPACE_USE
29
30 #define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
31
32
33 static UBool
utext_access(UText * ut,int64_t index,UBool forward)34 utext_access(UText *ut, int64_t index, UBool forward) {
35 return ut->pFuncs->access(ut, index, forward);
36 }
37
38
39
40 U_CAPI UBool U_EXPORT2
utext_moveIndex32(UText * ut,int32_t delta)41 utext_moveIndex32(UText *ut, int32_t delta) {
42 UChar32 c;
43 if (delta > 0) {
44 do {
45 if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
46 return FALSE;
47 }
48 c = ut->chunkContents[ut->chunkOffset];
49 if (U16_IS_SURROGATE(c)) {
50 c = utext_next32(ut);
51 if (c == U_SENTINEL) {
52 return FALSE;
53 }
54 } else {
55 ut->chunkOffset++;
56 }
57 } while(--delta>0);
58
59 } else if (delta<0) {
60 do {
61 if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
62 return FALSE;
63 }
64 c = ut->chunkContents[ut->chunkOffset-1];
65 if (U16_IS_SURROGATE(c)) {
66 c = utext_previous32(ut);
67 if (c == U_SENTINEL) {
68 return FALSE;
69 }
70 } else {
71 ut->chunkOffset--;
72 }
73 } while(++delta<0);
74 }
75
76 return TRUE;
77 }
78
79
80 U_CAPI int64_t U_EXPORT2
utext_nativeLength(UText * ut)81 utext_nativeLength(UText *ut) {
82 return ut->pFuncs->nativeLength(ut);
83 }
84
85
86 U_CAPI UBool U_EXPORT2
utext_isLengthExpensive(const UText * ut)87 utext_isLengthExpensive(const UText *ut) {
88 UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
89 return r;
90 }
91
92
93 U_CAPI int64_t U_EXPORT2
utext_getNativeIndex(const UText * ut)94 utext_getNativeIndex(const UText *ut) {
95 if(ut->chunkOffset <= ut->nativeIndexingLimit) {
96 return ut->chunkNativeStart+ut->chunkOffset;
97 } else {
98 return ut->pFuncs->mapOffsetToNative(ut);
99 }
100 }
101
102
103 U_CAPI void U_EXPORT2
utext_setNativeIndex(UText * ut,int64_t index)104 utext_setNativeIndex(UText *ut, int64_t index) {
105 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
106 // The desired position is outside of the current chunk.
107 // Access the new position. Assume a forward iteration from here,
108 // which will also be optimimum for a single random access.
109 // Reverse iterations may suffer slightly.
110 ut->pFuncs->access(ut, index, TRUE);
111 } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
112 // utf-16 indexing.
113 ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
114 } else {
115 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
116 }
117 // The convention is that the index must always be on a code point boundary.
118 // Adjust the index position if it is in the middle of a surrogate pair.
119 if (ut->chunkOffset<ut->chunkLength) {
120 UChar c= ut->chunkContents[ut->chunkOffset];
121 if (UTF16_IS_TRAIL(c)) {
122 if (ut->chunkOffset==0) {
123 ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
124 }
125 if (ut->chunkOffset>0) {
126 UChar lead = ut->chunkContents[ut->chunkOffset-1];
127 if (UTF16_IS_LEAD(lead)) {
128 ut->chunkOffset--;
129 }
130 }
131 }
132 }
133 }
134
135
136
137 U_CAPI int64_t U_EXPORT2
utext_getPreviousNativeIndex(UText * ut)138 utext_getPreviousNativeIndex(UText *ut) {
139 //
140 // Fast-path the common case.
141 // Common means current position is not at the beginning of a chunk
142 // and the preceding character is not supplementary.
143 //
144 int32_t i = ut->chunkOffset - 1;
145 int64_t result;
146 if (i >= 0) {
147 UChar c = ut->chunkContents[i];
148 if (U16_IS_TRAIL(c) == FALSE) {
149 if (i <= ut->nativeIndexingLimit) {
150 result = ut->chunkNativeStart + i;
151 } else {
152 ut->chunkOffset = i;
153 result = ut->pFuncs->mapOffsetToNative(ut);
154 ut->chunkOffset++;
155 }
156 return result;
157 }
158 }
159
160 // If at the start of text, simply return 0.
161 if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
162 return 0;
163 }
164
165 // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
166 // Keep it simple, use other functions to handle the edges.
167 //
168 utext_previous32(ut);
169 result = UTEXT_GETNATIVEINDEX(ut);
170 utext_next32(ut);
171 return result;
172 }
173
174
175 //
176 // utext_current32. Get the UChar32 at the current position.
177 // UText iteration position is always on a code point boundary,
178 // never on the trail half of a surrogate pair.
179 //
180 U_CAPI UChar32 U_EXPORT2
utext_current32(UText * ut)181 utext_current32(UText *ut) {
182 UChar32 c;
183 if (ut->chunkOffset==ut->chunkLength) {
184 // Current position is just off the end of the chunk.
185 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
186 // Off the end of the text.
187 return U_SENTINEL;
188 }
189 }
190
191 c = ut->chunkContents[ut->chunkOffset];
192 if (U16_IS_LEAD(c) == FALSE) {
193 // Normal, non-supplementary case.
194 return c;
195 }
196
197 //
198 // Possible supplementary char.
199 //
200 UChar32 trail = 0;
201 UChar32 supplementaryC = c;
202 if ((ut->chunkOffset+1) < ut->chunkLength) {
203 // The trail surrogate is in the same chunk.
204 trail = ut->chunkContents[ut->chunkOffset+1];
205 } else {
206 // The trail surrogate is in a different chunk.
207 // Because we must maintain the iteration position, we need to switch forward
208 // into the new chunk, get the trail surrogate, then revert the chunk back to the
209 // original one.
210 // An edge case to be careful of: the entire text may end with an unpaired
211 // leading surrogate. The attempt to access the trail will fail, but
212 // the original position before the unpaired lead still needs to be restored.
213 int64_t nativePosition = ut->chunkNativeLimit;
214 int32_t originalOffset = ut->chunkOffset;
215 if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
216 trail = ut->chunkContents[ut->chunkOffset];
217 }
218 UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk
219 U_ASSERT(r==TRUE);
220 ut->chunkOffset = originalOffset;
221 if(!r) {
222 return U_SENTINEL;
223 }
224 }
225
226 if (U16_IS_TRAIL(trail)) {
227 supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
228 }
229 return supplementaryC;
230
231 }
232
233
234 U_CAPI UChar32 U_EXPORT2
utext_char32At(UText * ut,int64_t nativeIndex)235 utext_char32At(UText *ut, int64_t nativeIndex) {
236 UChar32 c = U_SENTINEL;
237
238 // Fast path the common case.
239 if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
240 ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
241 c = ut->chunkContents[ut->chunkOffset];
242 if (U16_IS_SURROGATE(c) == FALSE) {
243 return c;
244 }
245 }
246
247
248 utext_setNativeIndex(ut, nativeIndex);
249 if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
250 c = ut->chunkContents[ut->chunkOffset];
251 if (U16_IS_SURROGATE(c)) {
252 // For surrogates, let current32() deal with the complications
253 // of supplementaries that may span chunk boundaries.
254 c = utext_current32(ut);
255 }
256 }
257 return c;
258 }
259
260
261 U_CAPI UChar32 U_EXPORT2
utext_next32(UText * ut)262 utext_next32(UText *ut) {
263 UChar32 c;
264
265 if (ut->chunkOffset >= ut->chunkLength) {
266 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
267 return U_SENTINEL;
268 }
269 }
270
271 c = ut->chunkContents[ut->chunkOffset++];
272 if (U16_IS_LEAD(c) == FALSE) {
273 // Normal case, not supplementary.
274 // (A trail surrogate seen here is just returned as is, as a surrogate value.
275 // It cannot be part of a pair.)
276 return c;
277 }
278
279 if (ut->chunkOffset >= ut->chunkLength) {
280 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
281 // c is an unpaired lead surrogate at the end of the text.
282 // return it as it is.
283 return c;
284 }
285 }
286 UChar32 trail = ut->chunkContents[ut->chunkOffset];
287 if (U16_IS_TRAIL(trail) == FALSE) {
288 // c was an unpaired lead surrogate, not at the end of the text.
289 // return it as it is (unpaired). Iteration position is on the
290 // following character, possibly in the next chunk, where the
291 // trail surrogate would have been if it had existed.
292 return c;
293 }
294
295 UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
296 ut->chunkOffset++; // move iteration position over the trail surrogate.
297 return supplementary;
298 }
299
300
301 U_CAPI UChar32 U_EXPORT2
utext_previous32(UText * ut)302 utext_previous32(UText *ut) {
303 UChar32 c;
304
305 if (ut->chunkOffset <= 0) {
306 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
307 return U_SENTINEL;
308 }
309 }
310 ut->chunkOffset--;
311 c = ut->chunkContents[ut->chunkOffset];
312 if (U16_IS_TRAIL(c) == FALSE) {
313 // Normal case, not supplementary.
314 // (A lead surrogate seen here is just returned as is, as a surrogate value.
315 // It cannot be part of a pair.)
316 return c;
317 }
318
319 if (ut->chunkOffset <= 0) {
320 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
321 // c is an unpaired trail surrogate at the start of the text.
322 // return it as it is.
323 return c;
324 }
325 }
326
327 UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
328 if (U16_IS_LEAD(lead) == FALSE) {
329 // c was an unpaired trail surrogate, not at the end of the text.
330 // return it as it is (unpaired). Iteration position is at c
331 return c;
332 }
333
334 UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
335 ut->chunkOffset--; // move iteration position over the lead surrogate.
336 return supplementary;
337 }
338
339
340
341 U_CAPI UChar32 U_EXPORT2
utext_next32From(UText * ut,int64_t index)342 utext_next32From(UText *ut, int64_t index) {
343 UChar32 c = U_SENTINEL;
344
345 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
346 // Desired position is outside of the current chunk.
347 if(!ut->pFuncs->access(ut, index, TRUE)) {
348 // no chunk available here
349 return U_SENTINEL;
350 }
351 } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
352 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
353 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
354 } else {
355 // Desired position is in chunk, with non-UTF16 indexing.
356 ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
357 }
358
359 c = ut->chunkContents[ut->chunkOffset++];
360 if (U16_IS_SURROGATE(c)) {
361 // Surrogates. Many edge cases. Use other functions that already
362 // deal with the problems.
363 utext_setNativeIndex(ut, index);
364 c = utext_next32(ut);
365 }
366 return c;
367 }
368
369
370 U_CAPI UChar32 U_EXPORT2
utext_previous32From(UText * ut,int64_t index)371 utext_previous32From(UText *ut, int64_t index) {
372 //
373 // Return the character preceding the specified index.
374 // Leave the iteration position at the start of the character that was returned.
375 //
376 UChar32 cPrev; // The character preceding cCurr, which is what we will return.
377
378 // Address the chunk containg the position preceding the incoming index
379 // A tricky edge case:
380 // We try to test the requested native index against the chunkNativeStart to determine
381 // whether the character preceding the one at the index is in the current chunk.
382 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
383 // requested index is on something other than the first position of the first char.
384 //
385 if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
386 // Requested native index is outside of the current chunk.
387 if(!ut->pFuncs->access(ut, index, FALSE)) {
388 // no chunk available here
389 return U_SENTINEL;
390 }
391 } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
392 // Direct UTF-16 indexing.
393 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
394 } else {
395 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
396 if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
397 // no chunk available here
398 return U_SENTINEL;
399 }
400 }
401
402 //
403 // Simple case with no surrogates.
404 //
405 ut->chunkOffset--;
406 cPrev = ut->chunkContents[ut->chunkOffset];
407
408 if (U16_IS_SURROGATE(cPrev)) {
409 // Possible supplementary. Many edge cases.
410 // Let other functions do the heavy lifting.
411 utext_setNativeIndex(ut, index);
412 cPrev = utext_previous32(ut);
413 }
414 return cPrev;
415 }
416
417
418 U_CAPI int32_t U_EXPORT2
utext_extract(UText * ut,int64_t start,int64_t limit,UChar * dest,int32_t destCapacity,UErrorCode * status)419 utext_extract(UText *ut,
420 int64_t start, int64_t limit,
421 UChar *dest, int32_t destCapacity,
422 UErrorCode *status) {
423 return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
424 }
425
426
427
428 U_CAPI UBool U_EXPORT2
utext_equals(const UText * a,const UText * b)429 utext_equals(const UText *a, const UText *b) {
430 if (a==NULL || b==NULL ||
431 a->magic != UTEXT_MAGIC ||
432 b->magic != UTEXT_MAGIC) {
433 // Null or invalid arguments don't compare equal to anything.
434 return FALSE;
435 }
436
437 if (a->pFuncs != b->pFuncs) {
438 // Different types of text providers.
439 return FALSE;
440 }
441
442 if (a->context != b->context) {
443 // Different sources (different strings)
444 return FALSE;
445 }
446 if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
447 // Different current position in the string.
448 return FALSE;
449 }
450
451 return TRUE;
452 }
453
454 U_CAPI int32_t U_EXPORT2
utext_compare(UText * s1,int32_t length1,UText * s2,int32_t length2)455 utext_compare(UText *s1, int32_t length1,
456 UText *s2, int32_t length2) {
457 UChar32 c1 = 0, c2 = 0;
458
459 if(length1<0 && length2<0) {
460 /* strcmp style, go until end of string */
461 for(;;) {
462 c1 = UTEXT_NEXT32(s1);
463 c2 = UTEXT_NEXT32(s2);
464 if(c1 != c2) {
465 break;
466 } else if(c1 == U_SENTINEL) {
467 return 0;
468 }
469 }
470 } else {
471 if(length1 < 0) {
472 length1 = INT32_MIN;
473 } else if (length2 < 0) {
474 length2 = INT32_MIN;
475 }
476
477 /* memcmp/UnicodeString style, both length-specified */
478 while((length1 > 0 || length1 == INT32_MIN) && (length2 > 0 || length2 == INT32_MIN)) {
479 c1 = UTEXT_NEXT32(s1);
480 c2 = UTEXT_NEXT32(s2);
481
482 if(c1 != c2) {
483 break;
484 } else if(c1 == U_SENTINEL) {
485 return 0;
486 }
487
488 if (length1 != INT32_MIN) {
489 length1 -= 1;
490 }
491 if (length2 != INT32_MIN) {
492 length2 -= 1;
493 }
494 }
495
496 if(length1 <= 0 && length1 != INT32_MIN) {
497 if(length2 <= 0) {
498 return 0;
499 } else {
500 return -1;
501 }
502 } else if(length2 <= 0 && length2 != INT32_MIN) {
503 if (length1 <= 0) {
504 return 0;
505 } else {
506 return 1;
507 }
508 }
509 }
510
511 return (int32_t)c1-(int32_t)c2;
512 }
513
514 U_CAPI int32_t U_EXPORT2
utext_compareNativeLimit(UText * s1,int64_t limit1,UText * s2,int64_t limit2)515 utext_compareNativeLimit(UText *s1, int64_t limit1,
516 UText *s2, int64_t limit2) {
517 UChar32 c1, c2;
518
519 if(limit1<0 && limit2<0) {
520 /* strcmp style, go until end of string */
521 for(;;) {
522 c1 = UTEXT_NEXT32(s1);
523 c2 = UTEXT_NEXT32(s2);
524 if(c1 != c2) {
525 return (int32_t)c1-(int32_t)c2;
526 } else if(c1 == U_SENTINEL) {
527 return 0;
528 }
529 }
530 } else {
531 /* memcmp/UnicodeString style, both length-specified */
532 int64_t index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
533 int64_t index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
534
535 while((limit1 < 0 || index1 < limit1) && (limit2 < 0 || index2 < limit2)) {
536 c1 = UTEXT_NEXT32(s1);
537 c2 = UTEXT_NEXT32(s2);
538
539 if(c1 != c2) {
540 return (int32_t)c1-(int32_t)c2;
541 } else if(c1 == U_SENTINEL) {
542 return 0;
543 }
544
545 if (limit1 >= 0) {
546 index1 = UTEXT_GETNATIVEINDEX(s1);
547 }
548 if (limit2 >= 0) {
549 index2 = UTEXT_GETNATIVEINDEX(s2);
550 }
551 }
552
553 if(limit1 >= 0 && index1 >= limit1) {
554 if(index2 >= limit2) {
555 return 0;
556 } else {
557 return -1;
558 }
559 } else {
560 if(index1 >= limit1) {
561 return 0;
562 } else {
563 return 1;
564 }
565 }
566 }
567 }
568
569 U_CAPI int32_t U_EXPORT2
utext_caseCompare(UText * s1,int32_t length1,UText * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)570 utext_caseCompare(UText *s1, int32_t length1,
571 UText *s2, int32_t length2,
572 uint32_t options, UErrorCode *pErrorCode) {
573 const UCaseProps *csp;
574
575 /* case folding variables */
576 const UChar *p;
577 int32_t length;
578
579 /* case folding buffers, only use current-level start/limit */
580 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
581 int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
582
583 /* current code points */
584 UChar32 c1, c2;
585 uint8_t cLength1, cLength2;
586
587 /* argument checking */
588 if(U_FAILURE(*pErrorCode)) {
589 return 0;
590 }
591 if(s1==NULL || s2==NULL) {
592 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
593 return 0;
594 }
595
596 csp=ucase_getSingleton();
597
598 /* for variable-length strings */
599 if(length1 < 0) {
600 length1 = INT32_MIN;
601 }
602 if (length2 < 0) {
603 length2 = INT32_MIN;
604 }
605
606 /* initialize */
607 foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
608
609 /* comparison loop */
610 while((foldOffset1 < foldLength1 || length1 > 0 || length1 == INT32_MIN) &&
611 (foldOffset2 < foldLength2 || length2 > 0 || length2 == INT32_MIN)) {
612 if(foldOffset1 < foldLength1) {
613 U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
614 cLength1 = 0;
615 } else {
616 c1 = UTEXT_NEXT32(s1);
617 if (c1 != U_SENTINEL) {
618 cLength1 = U16_LENGTH(c1);
619
620 length = ucase_toFullFolding(csp, c1, &p, options);
621 if(length >= 0) {
622 if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
623 u_memcpy(fold1, p, length);
624 foldOffset1 = 0;
625 foldLength1 = length;
626 U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
627 } else {
628 c1 = length;
629 }
630 }
631 }
632
633 if(length1 != INT32_MIN) {
634 length1 -= 1;
635 }
636 }
637
638 if(foldOffset2 < foldLength2) {
639 U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
640 cLength2 = 0;
641 } else {
642 c2 = UTEXT_NEXT32(s2);
643 if (c2 != U_SENTINEL) {
644 cLength2 = U16_LENGTH(c2);
645
646 length = ucase_toFullFolding(csp, c2, &p, options);
647 if(length >= 0) {
648 if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
649 u_memcpy(fold2, p, length);
650 foldOffset2 = 0;
651 foldLength2 = length;
652 U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
653 } else {
654 c2 = length;
655 }
656 }
657 } else if(c1 == U_SENTINEL) {
658 return 0; // end of both strings at once
659 }
660
661 if(length2 != INT32_MIN) {
662 length2 -= 1;
663 }
664 }
665
666 if(c1 != c2) {
667 return (int32_t)c1-(int32_t)c2;
668 }
669 }
670
671 /* By now at least one of the strings is out of characters */
672 length1 += foldLength1 - foldOffset1;
673 length2 += foldLength2 - foldOffset2;
674
675 if(length1 <= 0 && length1 != INT32_MIN) {
676 if(length2 <= 0) {
677 return 0;
678 } else {
679 return -1;
680 }
681 } else {
682 if (length1 <= 0) {
683 return 0;
684 } else {
685 return 1;
686 }
687 }
688 }
689
690 U_CAPI int32_t U_EXPORT2
utext_caseCompareNativeLimit(UText * s1,int64_t limit1,UText * s2,int64_t limit2,uint32_t options,UErrorCode * pErrorCode)691 utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
692 UText *s2, int64_t limit2,
693 uint32_t options, UErrorCode *pErrorCode) {
694 const UCaseProps *csp;
695
696 /* case folding variables */
697 const UChar *p;
698 int32_t length;
699
700 /* case folding buffers, only use current-level start/limit */
701 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
702 int32_t foldOffset1, foldOffset2, foldLength1, foldLength2;
703
704 /* current code points */
705 UChar32 c1, c2;
706
707 /* native indexes into s1 and s2 */
708 int64_t index1, index2;
709
710 /* argument checking */
711 if(U_FAILURE(*pErrorCode)) {
712 return 0;
713 }
714 if(s1==NULL || s2==NULL) {
715 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
716 return 0;
717 }
718
719 csp=ucase_getSingleton();
720
721 /* initialize */
722 index1 = (limit1 >= 0 ? UTEXT_GETNATIVEINDEX(s1) : 0);
723 index2 = (limit2 >= 0 ? UTEXT_GETNATIVEINDEX(s2) : 0);
724
725 foldOffset1 = foldOffset2 = foldLength1 = foldLength2 = 0;
726
727 /* comparison loop */
728 while((foldOffset1 < foldLength1 || limit1 < 0 || index1 < limit1) &&
729 (foldOffset2 < foldLength2 || limit2 < 0 || index2 < limit2)) {
730 if(foldOffset1 < foldLength1) {
731 U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
732 } else {
733 c1 = UTEXT_NEXT32(s1);
734 if (c1 != U_SENTINEL) {
735 length = ucase_toFullFolding(csp, c1, &p, options);
736 if(length >= 0) {
737 if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
738 u_memcpy(fold1, p, length);
739 foldOffset1 = 0;
740 foldLength1 = length;
741 U16_NEXT_UNSAFE(fold1, foldOffset1, c1);
742 } else {
743 c1 = length;
744 }
745 }
746 }
747
748 if (limit1 >= 0) {
749 index1 = UTEXT_GETNATIVEINDEX(s1);
750 }
751 }
752
753 if(foldOffset2 < foldLength2) {
754 U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
755 } else {
756 c2 = UTEXT_NEXT32(s2);
757 if (c2 != U_SENTINEL) {
758 length = ucase_toFullFolding(csp, c2, &p, options);
759 if(length >= 0) {
760 if(length <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle 0-length folded-case strings
761 u_memcpy(fold2, p, length);
762 foldOffset2 = 0;
763 foldLength2 = length;
764 U16_NEXT_UNSAFE(fold2, foldOffset2, c2);
765 } else {
766 c2 = length;
767 }
768 }
769 } else if(c1 == U_SENTINEL) {
770 return 0;
771 }
772
773 if (limit2 >= 0) {
774 index2 = UTEXT_GETNATIVEINDEX(s2);
775 }
776 }
777
778 if(c1 != c2) {
779 return (int32_t)c1-(int32_t)c2;
780 }
781 }
782
783 /* By now at least one of the strings is out of characters */
784 index1 -= foldLength1 - foldOffset1;
785 index2 -= foldLength2 - foldOffset2;
786
787 if(limit1 >= 0 && index1 >= limit1) {
788 if(index2 >= limit2) {
789 return 0;
790 } else {
791 return -1;
792 }
793 } else {
794 if(index1 >= limit1) {
795 return 0;
796 } else {
797 return 1;
798 }
799 }
800 }
801
802
803 U_CAPI UBool U_EXPORT2
utext_isWritable(const UText * ut)804 utext_isWritable(const UText *ut)
805 {
806 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
807 return b;
808 }
809
810
811 U_CAPI void U_EXPORT2
utext_freeze(UText * ut)812 utext_freeze(UText *ut) {
813 // Zero out the WRITABLE flag.
814 ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
815 }
816
817
818 U_CAPI UBool U_EXPORT2
utext_hasMetaData(const UText * ut)819 utext_hasMetaData(const UText *ut)
820 {
821 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
822 return b;
823 }
824
825
826
827 U_CAPI int32_t U_EXPORT2
utext_replace(UText * ut,int64_t nativeStart,int64_t nativeLimit,const UChar * replacementText,int32_t replacementLength,UErrorCode * status)828 utext_replace(UText *ut,
829 int64_t nativeStart, int64_t nativeLimit,
830 const UChar *replacementText, int32_t replacementLength,
831 UErrorCode *status)
832 {
833 if (U_FAILURE(*status)) {
834 return 0;
835 }
836 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
837 *status = U_NO_WRITE_PERMISSION;
838 return 0;
839 }
840 int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
841 return i;
842 }
843
844 U_CAPI void U_EXPORT2
utext_copy(UText * ut,int64_t nativeStart,int64_t nativeLimit,int64_t destIndex,UBool move,UErrorCode * status)845 utext_copy(UText *ut,
846 int64_t nativeStart, int64_t nativeLimit,
847 int64_t destIndex,
848 UBool move,
849 UErrorCode *status)
850 {
851 if (U_FAILURE(*status)) {
852 return;
853 }
854 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
855 *status = U_NO_WRITE_PERMISSION;
856 return;
857 }
858 ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
859 }
860
861
862
863 U_CAPI UText * U_EXPORT2
utext_clone(UText * dest,const UText * src,UBool deep,UBool readOnly,UErrorCode * status)864 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
865 UText *result;
866 result = src->pFuncs->clone(dest, src, deep, status);
867 if (readOnly) {
868 utext_freeze(result);
869 }
870 return result;
871 }
872
873
874
875 //------------------------------------------------------------------------------
876 //
877 // UText common functions implementation
878 //
879 //------------------------------------------------------------------------------
880
//
//  Bit masks for the UText.flags field.
//  Each bit records one fact about how the UText is currently set up.
//
enum {
    // Set:   ICU has allocated this UText struct on the heap.
    // Clear: the caller provided the storage for the UText.
    UTEXT_HEAP_ALLOCATED        = 1 << 0,

    // Set:   ICU has allocated the extra storage as a separate heap block.
    // Clear: there is no separate allocation.  Either no extra storage was
    //        requested, or it is appended to the end of the main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED  = 1 << 1,

    // Set:   this UText is currently open.
    // Clear: this UText is not open.
    UTEXT_OPEN                  = 1 << 2
};
897
898
899 //
900 // Extended form of a UText. The purpose is to aid in computing the total size required
901 // when a provider asks for a UText to be allocated with extra storage.
902
903 struct ExtendedUText {
904 UText ut;
905 UAlignedMemory extension;
906 };
907
908 static const UText emptyText = UTEXT_INITIALIZER;
909
910 U_CAPI UText * U_EXPORT2
utext_setup(UText * ut,int32_t extraSpace,UErrorCode * status)911 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
912 if (U_FAILURE(*status)) {
913 return ut;
914 }
915
916 if (ut == NULL) {
917 // We need to heap-allocate storage for the new UText
918 int32_t spaceRequired = sizeof(UText);
919 if (extraSpace > 0) {
920 spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
921 }
922 ut = (UText *)uprv_malloc(spaceRequired);
923 if (ut == NULL) {
924 *status = U_MEMORY_ALLOCATION_ERROR;
925 return NULL;
926 } else {
927 *ut = emptyText;
928 ut->flags |= UTEXT_HEAP_ALLOCATED;
929 if (spaceRequired>0) {
930 ut->extraSize = extraSpace;
931 ut->pExtra = &((ExtendedUText *)ut)->extension;
932 }
933 }
934 } else {
935 // We have been supplied with an already existing UText.
936 // Verify that it really appears to be a UText.
937 if (ut->magic != UTEXT_MAGIC) {
938 *status = U_ILLEGAL_ARGUMENT_ERROR;
939 return ut;
940 }
941 // If the ut is already open and there's a provider supplied close
942 // function, call it.
943 if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) {
944 ut->pFuncs->close(ut);
945 }
946 ut->flags &= ~UTEXT_OPEN;
947
948 // If extra space was requested by our caller, check whether
949 // sufficient already exists, and allocate new if needed.
950 if (extraSpace > ut->extraSize) {
951 // Need more space. If there is existing separately allocated space,
952 // delete it first, then allocate new space.
953 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
954 uprv_free(ut->pExtra);
955 ut->extraSize = 0;
956 }
957 ut->pExtra = uprv_malloc(extraSpace);
958 if (ut->pExtra == NULL) {
959 *status = U_MEMORY_ALLOCATION_ERROR;
960 } else {
961 ut->extraSize = extraSpace;
962 ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
963 }
964 }
965 }
966 if (U_SUCCESS(*status)) {
967 ut->flags |= UTEXT_OPEN;
968
969 // Initialize all remaining fields of the UText.
970 //
971 ut->context = NULL;
972 ut->chunkContents = NULL;
973 ut->p = NULL;
974 ut->q = NULL;
975 ut->r = NULL;
976 ut->a = 0;
977 ut->b = 0;
978 ut->c = 0;
979 ut->chunkOffset = 0;
980 ut->chunkLength = 0;
981 ut->chunkNativeStart = 0;
982 ut->chunkNativeLimit = 0;
983 ut->nativeIndexingLimit = 0;
984 ut->providerProperties = 0;
985 ut->privA = 0;
986 ut->privB = 0;
987 ut->privC = 0;
988 ut->privP = NULL;
989 if (ut->pExtra!=NULL && ut->extraSize>0)
990 uprv_memset(ut->pExtra, 0, ut->extraSize);
991
992 }
993 return ut;
994 }
995
996
997 U_CAPI UText * U_EXPORT2
utext_close(UText * ut)998 utext_close(UText *ut) {
999 if (ut==NULL ||
1000 ut->magic != UTEXT_MAGIC ||
1001 (ut->flags & UTEXT_OPEN) == 0)
1002 {
1003 // The supplied ut is not an open UText.
1004 // Do nothing.
1005 return ut;
1006 }
1007
1008 // If the provider gave us a close function, call it now.
1009 // This will clean up anything allocated specifically by the provider.
1010 if (ut->pFuncs->close != NULL) {
1011 ut->pFuncs->close(ut);
1012 }
1013 ut->flags &= ~UTEXT_OPEN;
1014
1015 // If we (the framework) allocated the UText or subsidiary storage,
1016 // delete it.
1017 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
1018 uprv_free(ut->pExtra);
1019 ut->pExtra = NULL;
1020 ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
1021 ut->extraSize = 0;
1022 }
1023
1024 // Zero out function table of the closed UText. This is a defensive move,
1025 // inteded to cause applications that inadvertantly use a closed
1026 // utext to crash with null pointer errors.
1027 ut->pFuncs = NULL;
1028
1029 if (ut->flags & UTEXT_HEAP_ALLOCATED) {
1030 // This UText was allocated by UText setup. We need to free it.
1031 // Clear magic, so we can detect if the user messes up and immediately
1032 // tries to reopen another UText using the deleted storage.
1033 ut->magic = 0;
1034 uprv_free(ut);
1035 ut = NULL;
1036 }
1037 return ut;
1038 }
1039
1040
1041
1042
1043 //
1044 // invalidateChunk Reset a chunk to have no contents, so that the next call
1045 // to access will cause new data to load.
1046 // This is needed when copy/move/replace operate directly on the
1047 // backing text, potentially putting it out of sync with the
1048 // contents in the chunk.
1049 //
1050 static void
invalidateChunk(UText * ut)1051 invalidateChunk(UText *ut) {
1052 ut->chunkLength = 0;
1053 ut->chunkNativeLimit = 0;
1054 ut->chunkNativeStart = 0;
1055 ut->chunkOffset = 0;
1056 ut->nativeIndexingLimit = 0;
1057 }
1058
//
//  pinIndex   Do range pinning on a native index parameter.
//
//    index   the index to pin, modified in place: a negative value becomes 0,
//            otherwise a value greater than limit becomes limit.
//    limit   the upper bound for index.
//
//    Returns the pinned index truncated to 32 bits, as a convenience for
//    use in providers that don't need 64 bits.
//
static int32_t
pinIndex(int64_t &index, int64_t limit) {
    int64_t pinned = index;
    if (pinned < 0) {
        pinned = 0;
    } else if (pinned > limit) {
        pinned = limit;
    }
    index = pinned;
    return (int32_t)pinned;
}
1073
1074
1075 U_CDECL_BEGIN
1076
1077 //
1078 // Pointer relocation function,
1079 // a utility used by shallow clone.
1080 // Adjust a pointer that refers to something within one UText (the source)
1081 // to refer to the same relative offset within a another UText (the target)
1082 //
adjustPointer(UText * dest,const void ** destPtr,const UText * src)1083 static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
1084 // convert all pointers to (char *) so that byte address arithmetic will work.
1085 char *dptr = (char *)*destPtr;
1086 char *dUText = (char *)dest;
1087 char *sUText = (char *)src;
1088
1089 if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
1090 // target ptr was to something within the src UText's pExtra storage.
1091 // relocate it into the target UText's pExtra region.
1092 *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
1093 } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
1094 // target ptr was pointing to somewhere within the source UText itself.
1095 // Move it to the same offset within the target UText.
1096 *destPtr = dUText + (dptr-sUText);
1097 }
1098 }
1099
1100
1101 //
1102 // Clone. This is a generic copy-the-utext-by-value clone function that can be
1103 // used as-is with some utext types, and as a helper by other clones.
1104 //
1105 static UText * U_CALLCONV
shallowTextClone(UText * dest,const UText * src,UErrorCode * status)1106 shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
1107 if (U_FAILURE(*status)) {
1108 return NULL;
1109 }
1110 int32_t srcExtraSize = src->extraSize;
1111
1112 //
1113 // Use the generic text_setup to allocate storage if required.
1114 //
1115 dest = utext_setup(dest, srcExtraSize, status);
1116 if (U_FAILURE(*status)) {
1117 return dest;
1118 }
1119
1120 //
1121 // flags (how the UText was allocated) and the pointer to the
1122 // extra storage must retain the values in the cloned utext that
1123 // were set up by utext_setup. Save them separately before
1124 // copying the whole struct.
1125 //
1126 void *destExtra = dest->pExtra;
1127 int32_t flags = dest->flags;
1128
1129
1130 //
1131 // Copy the whole UText struct by value.
1132 // Any "Extra" storage is copied also.
1133 //
1134 int sizeToCopy = src->sizeOfStruct;
1135 if (sizeToCopy > dest->sizeOfStruct) {
1136 sizeToCopy = dest->sizeOfStruct;
1137 }
1138 uprv_memcpy(dest, src, sizeToCopy);
1139 dest->pExtra = destExtra;
1140 dest->flags = flags;
1141 if (srcExtraSize > 0) {
1142 uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
1143 }
1144
1145 //
1146 // Relocate any pointers in the target that refer to the UText itself
1147 // to point to the cloned copy rather than the original source.
1148 //
1149 adjustPointer(dest, &dest->context, src);
1150 adjustPointer(dest, &dest->p, src);
1151 adjustPointer(dest, &dest->q, src);
1152 adjustPointer(dest, &dest->r, src);
1153 adjustPointer(dest, (const void **)&dest->chunkContents, src);
1154
1155 return dest;
1156 }
1157
1158
1159 U_CDECL_END
1160
1161
1162
1163 //------------------------------------------------------------------------------
1164 //
1165 // UText implementation for UTF-8 char * strings (read-only)
//         Limitation:  string length must be <= 0x7fffffff.
//                      (the length must fit in an int32_t variable)
1168 //
1169 // Use of UText data members:
1170 // context pointer to UTF-8 string
1171 // utext.b is the input string length (bytes).
1172 // utext.c Length scanned so far in string
1173 // (for optimizing finding length of zero terminated strings.)
1174 // utext.p pointer to the current buffer
1175 // utext.q pointer to the other buffer.
1176 //
1177 //------------------------------------------------------------------------------
1178
// Chunk size.
//     Must be less than 85, because of byte mapping from UChar indexes to native indexes:
//     the map arrays in UTF8Buf hold their entries in uint8_t.
//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
//     to two UChars.)
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
1185
1186 //
1187 // UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
1188 // Each contains the UChar chunk buffer, the to and from native maps, and
1189 // header info.
1190 //
1191 // because backwards iteration fills the buffers starting at the end and
1192 // working towards the front, the filled part of the buffers may not begin
1193 // at the start of the available storage for the buffers.
1194 //
1195 // Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
1196 // the last character added being a supplementary, and thus requiring a surrogate
1197 // pair. Doing this is simpler than checking for the edge case.
1198 //
1199
// One chunk buffer of the UTF-8 provider: the UTF-16 text of the chunk, the
// maps between UTF-16 buffer positions and native (byte) offsets, and the
// header fields describing which part of the buffer is filled.
struct UTF8Buf {
    int32_t   bufNativeStart;                        // Native index of first char in UChar buf
    int32_t   bufNativeLimit;                        // Native index following last char in buf.
    int32_t   bufStartIdx;                           // First filled position in buf.
    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
    int32_t   bufNILimit;                            // Limit of native indexing part of buf,
                                                     //   relative to bufStartIdx.
    int32_t   toUCharsMapStart;                      // Native index corresponding to
                                                     //   mapToUChars[0].
                                                     //   Set to bufNativeStart when filling forwards.
                                                     //   Set to computed value when filling backwards.

    UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Requires extra positions beyond
                                                     //   the chunk size, to allow for a surrogate at the end.
                                                     //   Length must be identical to mapToNative array, below,
                                                     //   because of the way indexing works when the array is
                                                     //   filled backwards during a reverse iteration.  Thus,
                                                     //   the additional extra size.
    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // map UChar index in buf to
                                                     //  native offset from bufNativeStart.
                                                     //  Requires two extra slots,
                                                     //    one for a supplementary starting in the last normal position,
                                                     //    and one for an entry for the buffer limit position.
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from toUCharsMapStart to
                                                     //   corresponding index in buf.
    int32_t   align;                                 // Not referenced by the code in this section;
                                                     //   presumably padding/alignment - TODO confirm.
};
1226
1227 U_CDECL_BEGIN
1228
1229 //
1230 // utf8TextLength
1231 //
1232 // Get the length of the string. If we don't already know it,
1233 // we'll need to scan for the trailing nul.
1234 //
1235 static int64_t U_CALLCONV
utf8TextLength(UText * ut)1236 utf8TextLength(UText *ut) {
1237 if (ut->b < 0) {
1238 // Zero terminated string, and we haven't scanned to the end yet.
1239 // Scan it now.
1240 const char *r = (const char *)ut->context + ut->c;
1241 while (*r != 0) {
1242 r++;
1243 }
1244 if ((r - (const char *)ut->context) < 0x7fffffff) {
1245 ut->b = (int32_t)(r - (const char *)ut->context);
1246 } else {
1247 // Actual string was bigger (more than 2 gig) than we
1248 // can handle. Clip it to 2 GB.
1249 ut->b = 0x7fffffff;
1250 }
1251 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1252 }
1253 return ut->b;
1254 }
1255
1256
1257
1258
1259
1260
//
//  utf8TextAccess    Access function for the UTF-8 provider.
//
//     Positions the UText's chunk at the requested native (byte) index.
//     Two UTF8Buf buffers are kept: the current one (ut->p) and an alternate
//     (ut->q).  Depending on where the index falls, this function either
//     repositions within the current buffer, swaps to the alternate buffer,
//     or refills the alternate buffer (forwards or backwards) and swaps to it.
//
//     ut        the UText; its chunk fields and buffers are updated.
//     index     requested native index; pinned to [0, length].
//     forward   TRUE if iteration will continue forwards from index,
//               FALSE if it will continue backwards.
//     Returns TRUE if there is text at the position in the requested
//     direction, FALSE if the request was for the start/end of the string
//     in the out-of-bounds direction.
//
static UBool U_CALLCONV
utf8TextAccess(UText *ut, int64_t index, UBool forward) {
    //
    //  Apologies to those who are allergic to goto statements.
    //    Consider each goto to a labelled block to be the equivalent of
    //         call the named block as if it were a function();
    //         return;
    //
    const uint8_t *s8=(const uint8_t *)ut->context;
    UTF8Buf *u8b = NULL;
    int32_t  length = ut->b;         // Length of original utf-8; negative if not yet known.
    int32_t  ix= (int32_t)index;     // Requested index, trimmed to 32 bits.
    int32_t  mapIndex = 0;
    if (index<0) {
        ix=0;
    } else if (index > 0x7fffffff) {
        // Strings with 64 bit lengths not supported by this UTF-8 provider.
        ix = 0x7fffffff;
    }

    // Pin requested index to the string length.
    if (ix>length) {
        if (length>=0) {
            ix=length;
        } else if (ix>=ut->c) {
            // Zero terminated string, and requested index is beyond
            //   the region that has already been scanned.
            //   Scan up to either the end of the string or to the
            //   requested position, whichever comes first.
            while (ut->c<ix && s8[ut->c]!=0) {
                ut->c++;
            }
            //  TODO:  support for null terminated string length > 32 bits.
            if (s8[ut->c] == 0) {
                // We just found the actual length of the string.
                //  Trim the requested index back to that.
                ix     = ut->c;
                ut->b  = ut->c;
                length = ut->c;
                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
            }
        }
    }

    //
    // Dispatch to the appropriate action for a forward iteration request.
    //
    if (forward) {
        if (ix==ut->chunkNativeLimit) {
            // Check for normal sequential iteration cases first.
            if (ix==length) {
                // Just reached end of string
                // Don't swap buffers, but do set the
                //   current buffer position.
                ut->chunkOffset = ut->chunkLength;
                return FALSE;
            } else {
                // End of current buffer.
                //   check whether other buffer already has what we need.
                UTF8Buf *altB = (UTF8Buf *)ut->q;
                if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
                    goto swapBuffers;
                }
            }
        }

        // A random access.  Desired index could be in either or neither buf.
        // For optimizing the order of testing, first check for the index
        //    being in the other buffer.  This will be the case for uses that
        //    move back and forth over a fairly limited range
        {
            u8b = (UTF8Buf *)ut->q;   // the alternate buffer
            if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
                // Requested index is in the other buffer.
                goto swapBuffers;
            }
            if (ix == length) {
                // Requested index is end-of-string.
                //   (this is the case of randomly seeking to the end.
                //    The case of iterating off the end is handled earlier.)
                if (ix == ut->chunkNativeLimit) {
                    // Current buffer extends up to the end of the string.
                    //   Leave it as the current buffer.
                    ut->chunkOffset = ut->chunkLength;
                    return FALSE;
                }
                if (ix == u8b->bufNativeLimit) {
                    // Alternate buffer extends to the end of string.
                    //   Swap it in as the current buffer.
                    goto swapBuffersAndFail;
                }

                // Neither existing buffer extends to the end of the string.
                goto makeStubBuffer;
            }

            if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
                // Requested index is in neither buffer.
                goto fillForward;
            }

            // Requested index is in this buffer.
            //   Translate the native index to a chunk offset through the map.
            u8b = (UTF8Buf *)ut->p;   // the current buffer
            mapIndex = ix - u8b->toUCharsMapStart;
            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
            return TRUE;

        }
    }


    //
    // Dispatch to the appropriate action for a
    //   Backwards Direction iteration request.
    //
    if (ix==ut->chunkNativeStart) {
        // Check for normal sequential iteration cases first.
        if (ix==0) {
            // Just reached the start of string
            // Don't swap buffers, but do set the
            //   current buffer position.
            ut->chunkOffset = 0;
            return FALSE;
        } else {
            // Start of current buffer.
            //   check whether other buffer already has what we need.
            UTF8Buf *altB = (UTF8Buf *)ut->q;
            if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
                goto swapBuffers;
            }
        }
    }

    // A random access.  Desired index could be in either or neither buf.
    // For optimizing the order of testing,
    //    Most likely case:  in the other buffer.
    //    Second most likely: in neither buffer.
    //    Unlikely, but must work:  in the current buffer.
    u8b = (UTF8Buf *)ut->q;   // the alternate buffer
    if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
        // Requested index is in the other buffer.
        goto swapBuffers;
    }
    // Requested index is start-of-string.
    //   (this is the case of randomly seeking to the start.
    //    The case of iterating off the start is handled earlier.)
    if (ix==0) {
        if (u8b->bufNativeStart==0) {
            // Alternate buffer contains the data for the start string.
            // Make it be the current buffer.
            goto swapBuffersAndFail;
        } else {
            // Request for data before the start of string,
            //   neither buffer is usable.
            //   set up a zero-length buffer.
            goto makeStubBuffer;
        }
    }

    if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
        // Requested index is in neither buffer.
        goto fillReverse;
    }

    // Requested index is in this buffer.
    //   Set the utf16 buffer index.
    u8b = (UTF8Buf *)ut->p;
    mapIndex = ix - u8b->toUCharsMapStart;
    ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
    if (ut->chunkOffset==0) {
        // This occurs when the first character in the text is
        //   a multi-byte UTF-8 char, and the requested index is to
        //   one of the trailing bytes.  Because there is no preceding
        //   character, this access fails.  We can't pick up on the
        //   situation sooner because the requested index is not zero.
        return FALSE;
    } else {
        return TRUE;
    }



swapBuffers:
    //  The alternate buffer (ut->q) has the string data that was requested.
    //  Swap the primary and alternate buffers, and set the
    //   chunk index into the new primary buffer.
    {
        u8b   = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;
        ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;

        // Index into the (now current) chunk
        // Use the map to set the chunk index.  It's more trouble than it's worth
        //    to check whether native indexing can be used.
        U_ASSERT(ix>=u8b->bufNativeStart);
        U_ASSERT(ix<=u8b->bufNativeLimit);
        mapIndex = ix - u8b->toUCharsMapStart;
        U_ASSERT(mapIndex>=0);
        U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
        ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;

        return TRUE;
    }


 swapBuffersAndFail:
    // We got a request for either the start or end of the string,
    //  with iteration continuing in the out-of-bounds direction.
    // The alternate buffer already contains the data up to the
    //  start/end.
    // Swap the buffers, then return failure, indicating that we couldn't
    //  make things correct for continuing the iteration in the requested
    //  direction.  The position & buffer are correct should the
    //  user decide to iterate in the opposite direction.
    u8b   = (UTF8Buf *)ut->q;
    ut->q = ut->p;
    ut->p = u8b;
    ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
    ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
    ut->chunkNativeStart    = u8b->bufNativeStart;
    ut->chunkNativeLimit    = u8b->bufNativeLimit;
    ut->nativeIndexingLimit = u8b->bufNILimit;

    // Index into the (now current) chunk
    //  For this function  (swapBuffersAndFail), the requested index
    //    will always be at either the start or end of the chunk.
    if (ix==u8b->bufNativeLimit) {
        ut->chunkOffset = ut->chunkLength;
    } else  {
        ut->chunkOffset = 0;
        U_ASSERT(ix == u8b->bufNativeStart);
    }
    return FALSE;

makeStubBuffer:
    //   The user has done a seek/access past the start or end
    //   of the string.  Rather than loading data that is likely
    //   to never be used, just set up a zero-length buffer at
    //   the position.
    //   The stub is built in the alternate buffer, which
    //   swapBuffersAndFail then makes current.
    u8b = (UTF8Buf *)ut->q;
    u8b->bufNativeStart   = ix;
    u8b->bufNativeLimit   = ix;
    u8b->bufStartIdx      = 0;
    u8b->bufLimitIdx      = 0;
    u8b->bufNILimit       = 0;
    u8b->toUCharsMapStart = ix;
    u8b->mapToNative[0]   = 0;
    u8b->mapToUChars[0]   = 0;
    goto swapBuffersAndFail;



fillForward:
    {
        // Move the incoming index to a code point boundary.
        U8_SET_CP_START(s8, 0, ix);

        // Swap the UText buffers.
        //  We want to fill what was previously the alternate buffer,
        //  and make what was the current buffer be the new alternate.
        //  (This local u8b intentionally hides the function-level one.)
        UTF8Buf *u8b = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;

        // For a NUL terminated string of unknown length, use the largest
        //   possible length as the bound and rely on finding the NUL.
        int32_t strLen = ut->b;
        UBool   nulTerminated = FALSE;
        if (strLen < 0) {
            strLen = 0x7fffffff;
            nulTerminated = TRUE;
        }

        UChar   *buf = u8b->buf;
        uint8_t *mapToNative  = u8b->mapToNative;
        uint8_t *mapToUChars  = u8b->mapToUChars;
        int32_t  destIx       = 0;
        int32_t  srcIx        = ix;
        UBool    seenNonAscii = FALSE;
        UChar32  c = 0;

        // Fill the chunk buffer and mapping arrays.
        while (destIx<UTF8_TEXT_CHUNK_SIZE) {
            c = s8[srcIx];
            if (c>0 && c<0x80) {
                // Special case ASCII range for speed.
                //   zero is excluded to simplify bounds checking.
                buf[destIx] = (UChar)c;
                mapToNative[destIx]    = (uint8_t)(srcIx - ix);
                mapToUChars[srcIx-ix]  = (uint8_t)destIx;
                srcIx++;
                destIx++;
            } else {
                // General case, handle everything.
                //   The first character taking this path ends the part of
                //   the buffer that can use native indexing.
                if (seenNonAscii == FALSE) {
                    seenNonAscii = TRUE;
                    u8b->bufNILimit = destIx;
                }

                int32_t  cIx      = srcIx;
                int32_t  dIx      = destIx;
                int32_t  dIxSaved = destIx;
                U8_NEXT(s8, srcIx, strLen, c);
                if (c==0 && nulTerminated) {
                    // Hit the terminating NUL.  Back up so that srcIx is
                    //   the index of the NUL, which is the end of the text.
                    srcIx--;
                    break;
                }
                if (c<0) {
                    // Illegal UTF-8.  Replace with sub character.
                    c = 0x0fffd;
                }

                U16_APPEND_UNSAFE(buf, destIx, c);
                // Every UChar produced for this character maps back to the
                //   native index of the character's first byte.
                do {
                    mapToNative[dIx++] = (uint8_t)(cIx - ix);
                } while (dIx < destIx);

                // Every byte of this character maps to the buffer position
                //   of the character's first UChar.
                do {
                    mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
                } while (cIx < srcIx);
            }
            if (srcIx>=strLen) {
                break;
            }

        }

        //  store Native <--> Chunk Map entries for the end of the buffer.
        //    There is no actual character here, but the index position is valid.
        mapToNative[destIx]     = (uint8_t)(srcIx - ix);
        mapToUChars[srcIx - ix] = (uint8_t)destIx;

        //  fill in Buffer descriptor
        u8b->bufNativeStart     = ix;
        u8b->bufNativeLimit     = srcIx;
        u8b->bufStartIdx        = 0;
        u8b->bufLimitIdx        = destIx;
        if (seenNonAscii == FALSE) {
            u8b->bufNILimit     = destIx;
        }
        u8b->toUCharsMapStart   = u8b->bufNativeStart;

        // Set UText chunk to refer to this buffer.
        ut->chunkContents       = buf;
        ut->chunkOffset         = 0;
        ut->chunkLength         = u8b->bufLimitIdx;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;

        // For zero terminated strings, keep track of the maximum point
        //   scanned so far.
        if (nulTerminated && srcIx>ut->c) {
            ut->c = srcIx;
            if (c==0) {
                // We scanned to the end.
                //   Remember the actual length.
                ut->b = srcIx;
                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
            }
        }
        return TRUE;
    }


fillReverse:
    {
        // Move the incoming index to a code point boundary.
        // Can only do this if the incoming index is somewhere in the interior of the string.
        //   If index is at the end, there is no character there to look at.
        if (ix != ut->b) {
            U8_SET_CP_START(s8, 0, ix);
        }

        // Swap the UText buffers.
        //  We want to fill what was previously the alternate buffer,
        //  and make what was the current buffer be the new alternate.
        //  (This local u8b intentionally hides the function-level one.)
        UTF8Buf *u8b = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;

        UChar   *buf = u8b->buf;
        uint8_t *mapToNative = u8b->mapToNative;
        uint8_t *mapToUChars = u8b->mapToUChars;
        // Native index that mapToUChars[0] will correspond to; chosen so that
        //   the entry for ix lands near the end of the map array.
        int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
                                                    //   at end of buffer to leave room
                                                    //   for a surrogate pair at the
                                                    //   buffer start.
        int32_t  srcIx  = ix;
        int32_t  bufNILimit = destIx;
        UChar32   c;

        // Map to/from Native Indexes, fill in for the position at the end of
        //   the buffer.
        //
        mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
        mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;

        // Fill the chunk buffer
        // Work backwards, filling from the end of the buffer towards the front.
        //   Stop when the UChar buffer or the native map is nearly used up,
        //   or when the start of the string has been reached.
        //
        while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
            srcIx--;
            destIx--;

            // Get last byte of the UTF-8 character
            c = s8[srcIx];
            if (c<0x80) {
                // Special case ASCII range for speed.
                buf[destIx] = (UChar)c;
                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
            } else {
                // General case, handle everything non-ASCII.

                int32_t  sIx      = srcIx;  // ix of last byte of multi-byte u8 char

                // Get the full character from the UTF8 string.
                //   use code derived from the macros in utf8.h
                //   Leaves srcIx pointing at the first byte of the UTF-8 char.
                //
                if (c<=0xbf) {
                    c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1);
                    // leaves srcIx at first byte of the multi-byte char.
                } else {
                    // Byte is not a trail byte, so no complete character
                    //   ends here.  Substitute the replacement character.
                    c=0x0fffd;
                }

                // Store the character in UTF-16 buffer.
                if (c<0x10000) {
                    buf[destIx] = (UChar)c;
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                } else {
                    buf[destIx]         = U16_TRAIL(c);
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                    buf[--destIx]       = U16_LEAD(c);
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                }

                // Fill in the map from native indexes to UChars buf index.
                do {
                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
                } while (sIx >= srcIx);

                // Set native indexing limit to be the current position.
                //   We are processing a non-ascii, non-native-indexing char now;
                //     the limit will be here if the rest of the chars to be
                //     added to this buffer are ascii.
                bufNILimit = destIx;
            }
        }
        // Fill in the buffer descriptor.  The filled range starts wherever
        //   the backwards fill stopped.
        u8b->bufNativeStart     = srcIx;
        u8b->bufNativeLimit     = ix;
        u8b->bufStartIdx        = destIx;
        u8b->bufLimitIdx        = UTF8_TEXT_CHUNK_SIZE+2;
        u8b->bufNILimit         = bufNILimit - u8b->bufStartIdx;
        u8b->toUCharsMapStart   = toUCharsMapStart;

        // Set UText chunk to refer to this buffer, positioned at its end.
        ut->chunkContents       = &buf[u8b->bufStartIdx];
        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
        ut->chunkOffset         = ut->chunkLength;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;
        return TRUE;
    }

}
1733
1734
1735
1736 //
1737 // This is a slightly modified copy of u_strFromUTF8,
1738 // Inserts a Replacement Char rather than failing on invalid UTF-8
1739 // Removes unnecessary features.
1740 //
1741 static UChar*
utext_strFromUTF8(UChar * dest,int32_t destCapacity,int32_t * pDestLength,const char * src,int32_t srcLength,UErrorCode * pErrorCode)1742 utext_strFromUTF8(UChar *dest,
1743 int32_t destCapacity,
1744 int32_t *pDestLength,
1745 const char* src,
1746 int32_t srcLength, // required. NUL terminated not supported.
1747 UErrorCode *pErrorCode
1748 )
1749 {
1750
1751 UChar *pDest = dest;
1752 UChar *pDestLimit = dest+destCapacity;
1753 UChar32 ch=0;
1754 int32_t index = 0;
1755 int32_t reqLength = 0;
1756 uint8_t* pSrc = (uint8_t*) src;
1757
1758
1759 while((index < srcLength)&&(pDest<pDestLimit)){
1760 ch = pSrc[index++];
1761 if(ch <=0x7f){
1762 *pDest++=(UChar)ch;
1763 }else{
1764 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1765 if(ch<0){
1766 ch = 0xfffd;
1767 }
1768 if(U_IS_BMP(ch)){
1769 *(pDest++)=(UChar)ch;
1770 }else{
1771 *(pDest++)=UTF16_LEAD(ch);
1772 if(pDest<pDestLimit){
1773 *(pDest++)=UTF16_TRAIL(ch);
1774 }else{
1775 reqLength++;
1776 break;
1777 }
1778 }
1779 }
1780 }
1781 /* donot fill the dest buffer just count the UChars needed */
1782 while(index < srcLength){
1783 ch = pSrc[index++];
1784 if(ch <= 0x7f){
1785 reqLength++;
1786 }else{
1787 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1788 if(ch<0){
1789 ch = 0xfffd;
1790 }
1791 reqLength+=U16_LENGTH(ch);
1792 }
1793 }
1794
1795 reqLength+=(int32_t)(pDest - dest);
1796
1797 if(pDestLength){
1798 *pDestLength = reqLength;
1799 }
1800
1801 /* Terminate the buffer */
1802 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
1803
1804 return dest;
1805 }
1806
1807
1808
1809 static int32_t U_CALLCONV
utf8TextExtract(UText * ut,int64_t start,int64_t limit,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)1810 utf8TextExtract(UText *ut,
1811 int64_t start, int64_t limit,
1812 UChar *dest, int32_t destCapacity,
1813 UErrorCode *pErrorCode) {
1814 if(U_FAILURE(*pErrorCode)) {
1815 return 0;
1816 }
1817 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1818 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1819 return 0;
1820 }
1821 int32_t length = ut->b;
1822 int32_t start32 = pinIndex(start, length);
1823 int32_t limit32 = pinIndex(limit, length);
1824
1825 if(start32>limit32) {
1826 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1827 return 0;
1828 }
1829
1830
1831 // adjust the incoming indexes to land on code point boundaries if needed.
1832 // adjust by no more than three, because that is the largest number of trail bytes
1833 // in a well formed UTF8 character.
1834 const uint8_t *buf = (const uint8_t *)ut->context;
1835 int i;
1836 if (start32 < ut->chunkNativeLimit) {
1837 for (i=0; i<3; i++) {
1838 if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
1839 break;
1840 }
1841 start32--;
1842 }
1843 }
1844
1845 if (limit32 < ut->chunkNativeLimit) {
1846 for (i=0; i<3; i++) {
1847 if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
1848 break;
1849 }
1850 limit32--;
1851 }
1852 }
1853
1854 // Do the actual extract.
1855 int32_t destLength=0;
1856 utext_strFromUTF8(dest, destCapacity, &destLength,
1857 (const char *)ut->context+start32, limit32-start32,
1858 pErrorCode);
1859 utf8TextAccess(ut, limit32, TRUE);
1860 return destLength;
1861 }
1862
1863 //
1864 // utf8TextMapOffsetToNative
1865 //
1866 // Map a chunk (UTF-16) offset to a native index.
1867 static int64_t U_CALLCONV
utf8TextMapOffsetToNative(const UText * ut)1868 utf8TextMapOffsetToNative(const UText *ut) {
1869 //
1870 UTF8Buf *u8b = (UTF8Buf *)ut->p;
1871 U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
1872 int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
1873 U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
1874 return nativeOffset;
1875 }
1876
1877 //
1878 // Map a native index to the corrsponding chunk offset
1879 //
1880 static int32_t U_CALLCONV
utf8TextMapIndexToUTF16(const UText * ut,int64_t index64)1881 utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
1882 U_ASSERT(index64 <= 0x7fffffff);
1883 int32_t index = (int32_t)index64;
1884 UTF8Buf *u8b = (UTF8Buf *)ut->p;
1885 U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1886 U_ASSERT(index<=ut->chunkNativeLimit);
1887 int32_t mapIndex = index - u8b->toUCharsMapStart;
1888 int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1889 U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1890 return offset;
1891 }
1892
1893 static UText * U_CALLCONV
utf8TextClone(UText * dest,const UText * src,UBool deep,UErrorCode * status)1894 utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
1895 {
1896 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1897 dest = shallowTextClone(dest, src, status);
1898
1899 // For deep clones, make a copy of the string.
1900 // The copied storage is owned by the newly created clone.
1901 //
1902 // TODO: There is an isssue with using utext_nativeLength().
1903 // That function is non-const in cases where the input was NUL terminated
1904 // and the length has not yet been determined.
1905 // This function (clone()) is const.
1906 // There potentially a thread safety issue lurking here.
1907 //
1908 if (deep && U_SUCCESS(*status)) {
1909 int32_t len = (int32_t)utext_nativeLength((UText *)src);
1910 char *copyStr = (char *)uprv_malloc(len+1);
1911 if (copyStr == NULL) {
1912 *status = U_MEMORY_ALLOCATION_ERROR;
1913 } else {
1914 uprv_memcpy(copyStr, src->context, len+1);
1915 dest->context = copyStr;
1916 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1917 }
1918 }
1919 return dest;
1920 }
1921
1922
1923 static void U_CALLCONV
utf8TextClose(UText * ut)1924 utf8TextClose(UText *ut) {
1925 // Most of the work of close is done by the generic UText framework close.
1926 // All that needs to be done here is to delete the UTF8 string if the UText
1927 // owns it. This occurs if the UText was created by cloning.
1928 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1929 char *s = (char *)ut->context;
1930 uprv_free(s);
1931 ut->context = NULL;
1932 }
1933 }
1934
1935 U_CDECL_END
1936
1937
// Function table for the UTF-8 UText provider.
//   Installed into UText.pFuncs by utext_openUTF8().
//   The provider is read-only, so the replace and copy entries are NULL.
static const struct UTextFuncs utf8Funcs =
{
    sizeof(UTextFuncs),
    0, 0, 0,             // Reserved alignment padding
    utf8TextClone,
    utf8TextLength,
    utf8TextAccess,
    utf8TextExtract,
    NULL,                /* replace*/
    NULL,                /* copy   */
    utf8TextMapOffsetToNative,
    utf8TextMapIndexToUTF16,
    utf8TextClose,
    NULL,                // spare 1
    NULL,                // spare 2
    NULL                 // spare 3
};
1955
1956
// Empty, NUL terminated string; stands in for the text when utext_openUTF8()
//   is called with a NULL string and a length of zero.
static const char gEmptyString[] = {0};
1958
1959 U_CAPI UText * U_EXPORT2
utext_openUTF8(UText * ut,const char * s,int64_t length,UErrorCode * status)1960 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
1961 if(U_FAILURE(*status)) {
1962 return NULL;
1963 }
1964 if(s==NULL && length==0) {
1965 s = gEmptyString;
1966 }
1967
1968 if(s==NULL || length<-1 || length>INT32_MAX) {
1969 *status=U_ILLEGAL_ARGUMENT_ERROR;
1970 return NULL;
1971 }
1972
1973 ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
1974 if (U_FAILURE(*status)) {
1975 return ut;
1976 }
1977
1978 ut->pFuncs = &utf8Funcs;
1979 ut->context = s;
1980 ut->b = (int32_t)length;
1981 ut->c = (int32_t)length;
1982 if (ut->c < 0) {
1983 ut->c = 0;
1984 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1985 }
1986 ut->p = ut->pExtra;
1987 ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
1988 return ut;
1989
1990 }
1991
1992
1993
1994
1995
1996
1997
1998
1999 //------------------------------------------------------------------------------
2000 //
2001 // UText implementation wrapper for Replaceable (read/write)
2002 //
2003 // Use of UText data members:
2004 // context pointer to Replaceable.
2005 // p pointer to Replaceable if it is owned by the UText.
2006 //
2007 //------------------------------------------------------------------------------
2008
2009
2010
// Chunk size, in UChars, used by the Replaceable provider.
// minimum chunk size for this implementation: 3
// to allow for possible trimming for code point boundaries
enum { REP_TEXT_CHUNK_SIZE=10 };
2014
// Storage for the chunk of a Replaceable-backed UText.
//   NOTE(review): presumably placed in the UText's extra storage area; the
//   code that sets this up is outside this section - confirm.
struct ReplExtra {
    /*
     * Chunk UChars.
     * +1 to simplify filling with surrogate pair at the end.
     */
    UChar s[REP_TEXT_CHUNK_SIZE+1];
};
2022
2023
2024 U_CDECL_BEGIN
2025
2026 static UText * U_CALLCONV
repTextClone(UText * dest,const UText * src,UBool deep,UErrorCode * status)2027 repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
2028 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2029 dest = shallowTextClone(dest, src, status);
2030
2031 // For deep clones, make a copy of the Replaceable.
2032 // The copied Replaceable storage is owned by the newly created UText clone.
2033 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2034 // it.
2035 //
2036 if (deep && U_SUCCESS(*status)) {
2037 const Replaceable *replSrc = (const Replaceable *)src->context;
2038 dest->context = replSrc->clone();
2039 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2040
2041 // with deep clone, the copy is writable, even when the source is not.
2042 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2043 }
2044 return dest;
2045 }
2046
2047
2048 static void U_CALLCONV
repTextClose(UText * ut)2049 repTextClose(UText *ut) {
2050 // Most of the work of close is done by the generic UText framework close.
2051 // All that needs to be done here is delete the Replaceable if the UText
2052 // owns it. This occurs if the UText was created by cloning.
2053 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2054 Replaceable *rep = (Replaceable *)ut->context;
2055 delete rep;
2056 ut->context = NULL;
2057 }
2058 }
2059
2060
2061 static int64_t U_CALLCONV
repTextLength(UText * ut)2062 repTextLength(UText *ut) {
2063 const Replaceable *replSrc = (const Replaceable *)ut->context;
2064 int32_t len = replSrc->length();
2065 return len;
2066 }
2067
2068
2069 static UBool U_CALLCONV
repTextAccess(UText * ut,int64_t index,UBool forward)2070 repTextAccess(UText *ut, int64_t index, UBool forward) {
2071 const Replaceable *rep=(const Replaceable *)ut->context;
2072 int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
2073
2074 // clip the requested index to the limits of the text.
2075 int32_t index32 = pinIndex(index, length);
2076 U_ASSERT(index<=INT32_MAX);
2077
2078
2079 /*
2080 * Compute start/limit boundaries around index, for a segment of text
2081 * to be extracted.
2082 * To allow for the possibility that our user gave an index to the trailing
2083 * half of a surrogate pair, we must request one extra preceding UChar when
2084 * going in the forward direction. This will ensure that the buffer has the
2085 * entire code point at the specified index.
2086 */
2087 if(forward) {
2088
2089 if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
2090 // Buffer already contains the requested position.
2091 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
2092 return TRUE;
2093 }
2094 if (index32>=length && ut->chunkNativeLimit==length) {
2095 // Request for end of string, and buffer already extends up to it.
2096 // Can't get the data, but don't change the buffer.
2097 ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
2098 return FALSE;
2099 }
2100
2101 ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
2102 // Going forward, so we want to have the buffer with stuff at and beyond
2103 // the requested index. The -1 gets us one code point before the
2104 // requested index also, to handle the case of the index being on
2105 // a trail surrogate of a surrogate pair.
2106 if(ut->chunkNativeLimit > length) {
2107 ut->chunkNativeLimit = length;
2108 }
2109 // unless buffer ran off end, start is index-1.
2110 ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
2111 if(ut->chunkNativeStart < 0) {
2112 ut->chunkNativeStart = 0;
2113 }
2114 } else {
2115 // Reverse iteration. Fill buffer with data preceding the requested index.
2116 if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
2117 // Requested position already in buffer.
2118 ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
2119 return TRUE;
2120 }
2121 if (index32==0 && ut->chunkNativeStart==0) {
2122 // Request for start, buffer already begins at start.
2123 // No data, but keep the buffer as is.
2124 ut->chunkOffset = 0;
2125 return FALSE;
2126 }
2127
2128 // Figure out the bounds of the chunk to extract for reverse iteration.
2129 // Need to worry about chunk not splitting surrogate pairs, and while still
2130 // containing the data we need.
2131 // Fix by requesting a chunk that includes an extra UChar at the end.
2132 // If this turns out to be a lead surrogate, we can lop it off and still have
2133 // the data we wanted.
2134 ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
2135 if (ut->chunkNativeStart < 0) {
2136 ut->chunkNativeStart = 0;
2137 }
2138
2139 ut->chunkNativeLimit = index32 + 1;
2140 if (ut->chunkNativeLimit > length) {
2141 ut->chunkNativeLimit = length;
2142 }
2143 }
2144
2145 // Extract the new chunk of text from the Replaceable source.
2146 ReplExtra *ex = (ReplExtra *)ut->pExtra;
2147 // UnicodeString with its buffer a writable alias to the chunk buffer
2148 UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
2149 rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
2150
2151 ut->chunkContents = ex->s;
2152 ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
2153 ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart);
2154
2155 // Surrogate pairs from the input text must not span chunk boundaries.
2156 // If end of chunk could be the start of a surrogate, trim it off.
2157 if (ut->chunkNativeLimit < length &&
2158 U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
2159 ut->chunkLength--;
2160 ut->chunkNativeLimit--;
2161 if (ut->chunkOffset > ut->chunkLength) {
2162 ut->chunkOffset = ut->chunkLength;
2163 }
2164 }
2165
2166 // if the first UChar in the chunk could be the trailing half of a surrogate pair,
2167 // trim it off.
2168 if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
2169 ++(ut->chunkContents);
2170 ++(ut->chunkNativeStart);
2171 --(ut->chunkLength);
2172 --(ut->chunkOffset);
2173 }
2174
2175 // adjust the index/chunkOffset to a code point boundary
2176 U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
2177
2178 // Use fast indexing for get/setNativeIndex()
2179 ut->nativeIndexingLimit = ut->chunkLength;
2180
2181 return TRUE;
2182 }
2183
2184
2185
2186 static int32_t U_CALLCONV
repTextExtract(UText * ut,int64_t start,int64_t limit,UChar * dest,int32_t destCapacity,UErrorCode * status)2187 repTextExtract(UText *ut,
2188 int64_t start, int64_t limit,
2189 UChar *dest, int32_t destCapacity,
2190 UErrorCode *status) {
2191 const Replaceable *rep=(const Replaceable *)ut->context;
2192 int32_t length=rep->length();
2193
2194 if(U_FAILURE(*status)) {
2195 return 0;
2196 }
2197 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
2198 *status=U_ILLEGAL_ARGUMENT_ERROR;
2199 }
2200 if(start>limit) {
2201 *status=U_INDEX_OUTOFBOUNDS_ERROR;
2202 return 0;
2203 }
2204
2205 int32_t start32 = pinIndex(start, length);
2206 int32_t limit32 = pinIndex(limit, length);
2207
2208 // adjust start, limit if they point to trail half of surrogates
2209 if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
2210 U_IS_SUPPLEMENTARY(rep->char32At(start32))){
2211 start32--;
2212 }
2213 if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
2214 U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
2215 limit32--;
2216 }
2217
2218 length=limit32-start32;
2219 if(length>destCapacity) {
2220 limit32 = start32 + destCapacity;
2221 }
2222 UnicodeString buffer(dest, 0, destCapacity); // writable alias
2223 rep->extractBetween(start32, limit32, buffer);
2224 repTextAccess(ut, limit32, TRUE);
2225
2226 return u_terminateUChars(dest, destCapacity, length, status);
2227 }
2228
2229 static int32_t U_CALLCONV
repTextReplace(UText * ut,int64_t start,int64_t limit,const UChar * src,int32_t length,UErrorCode * status)2230 repTextReplace(UText *ut,
2231 int64_t start, int64_t limit,
2232 const UChar *src, int32_t length,
2233 UErrorCode *status) {
2234 Replaceable *rep=(Replaceable *)ut->context;
2235 int32_t oldLength;
2236
2237 if(U_FAILURE(*status)) {
2238 return 0;
2239 }
2240 if(src==NULL && length!=0) {
2241 *status=U_ILLEGAL_ARGUMENT_ERROR;
2242 return 0;
2243 }
2244 oldLength=rep->length(); // will subtract from new length
2245 if(start>limit ) {
2246 *status=U_INDEX_OUTOFBOUNDS_ERROR;
2247 return 0;
2248 }
2249
2250 int32_t start32 = pinIndex(start, oldLength);
2251 int32_t limit32 = pinIndex(limit, oldLength);
2252
2253 // Snap start & limit to code point boundaries.
2254 if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
2255 start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
2256 {
2257 start32--;
2258 }
2259 if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
2260 U16_IS_TRAIL(rep->charAt(limit32)))
2261 {
2262 limit32++;
2263 }
2264
2265 // Do the actual replace operation using methods of the Replaceable class
2266 UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
2267 rep->handleReplaceBetween(start32, limit32, replStr);
2268 int32_t newLength = rep->length();
2269 int32_t lengthDelta = newLength - oldLength;
2270
2271 // Is the UText chunk buffer OK?
2272 if (ut->chunkNativeLimit > start32) {
2273 // this replace operation may have impacted the current chunk.
2274 // invalidate it, which will force a reload on the next access.
2275 invalidateChunk(ut);
2276 }
2277
2278 // set the iteration position to the end of the newly inserted replacement text.
2279 int32_t newIndexPos = limit32 + lengthDelta;
2280 repTextAccess(ut, newIndexPos, TRUE);
2281
2282 return lengthDelta;
2283 }
2284
2285
2286 static void U_CALLCONV
repTextCopy(UText * ut,int64_t start,int64_t limit,int64_t destIndex,UBool move,UErrorCode * status)2287 repTextCopy(UText *ut,
2288 int64_t start, int64_t limit,
2289 int64_t destIndex,
2290 UBool move,
2291 UErrorCode *status)
2292 {
2293 Replaceable *rep=(Replaceable *)ut->context;
2294 int32_t length=rep->length();
2295
2296 if(U_FAILURE(*status)) {
2297 return;
2298 }
2299 if (start>limit || (start<destIndex && destIndex<limit))
2300 {
2301 *status=U_INDEX_OUTOFBOUNDS_ERROR;
2302 return;
2303 }
2304
2305 int32_t start32 = pinIndex(start, length);
2306 int32_t limit32 = pinIndex(limit, length);
2307 int32_t destIndex32 = pinIndex(destIndex, length);
2308
2309 // TODO: snap input parameters to code point boundaries.
2310
2311 if(move) {
2312 // move: copy to destIndex, then replace original with nothing
2313 int32_t segLength=limit32-start32;
2314 rep->copy(start32, limit32, destIndex32);
2315 if(destIndex32<start32) {
2316 start32+=segLength;
2317 limit32+=segLength;
2318 }
2319 rep->handleReplaceBetween(start32, limit32, UnicodeString());
2320 } else {
2321 // copy
2322 rep->copy(start32, limit32, destIndex32);
2323 }
2324
2325 // If the change to the text touched the region in the chunk buffer,
2326 // invalidate the buffer.
2327 int32_t firstAffectedIndex = destIndex32;
2328 if (move && start32<firstAffectedIndex) {
2329 firstAffectedIndex = start32;
2330 }
2331 if (firstAffectedIndex < ut->chunkNativeLimit) {
2332 // changes may have affected range covered by the chunk
2333 invalidateChunk(ut);
2334 }
2335
2336 // Put iteration position at the newly inserted (moved) block,
2337 int32_t nativeIterIndex = destIndex32 + limit32 - start32;
2338 if (move && destIndex32>start32) {
2339 // moved a block of text towards the end of the string.
2340 nativeIterIndex = destIndex32;
2341 }
2342
2343 // Set position, reload chunk if needed.
2344 repTextAccess(ut, nativeIterIndex, TRUE);
2345 }
2346
// Table of provider functions for UText instances that wrap a Replaceable.
// utext_openReplaceable() stores a pointer to this table in ut->pFuncs.
// Entries that are NULL are not provided by this implementation.
static const struct UTextFuncs repFuncs =
{
    sizeof(UTextFuncs),
    0, 0, 0,           // Reserved alignment padding
    repTextClone,
    repTextLength,
    repTextAccess,
    repTextExtract,
    repTextReplace,
    repTextCopy,
    NULL,              // MapOffsetToNative,
    NULL,              // MapIndexToUTF16,
    repTextClose,
    NULL,              // spare 1
    NULL,              // spare 2
    NULL               // spare 3
};
2364
2365
2366 U_CAPI UText * U_EXPORT2
utext_openReplaceable(UText * ut,Replaceable * rep,UErrorCode * status)2367 utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
2368 {
2369 if(U_FAILURE(*status)) {
2370 return NULL;
2371 }
2372 if(rep==NULL) {
2373 *status=U_ILLEGAL_ARGUMENT_ERROR;
2374 return NULL;
2375 }
2376 ut = utext_setup(ut, sizeof(ReplExtra), status);
2377
2378 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2379 if(rep->hasMetaData()) {
2380 ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
2381 }
2382
2383 ut->pFuncs = &repFuncs;
2384 ut->context = rep;
2385 return ut;
2386 }
2387
2388 U_CDECL_END
2389
2390
2391
2392
2393
2394
2395
2396
2397 //------------------------------------------------------------------------------
2398 //
2399 // UText implementation for UnicodeString (read/write) and
2400 // for const UnicodeString (read only)
2401 // (same implementation, only the flags are different)
2402 //
2403 // Use of UText data members:
//      context    pointer to UnicodeString.
//                 When the UTEXT_PROVIDER_OWNS_TEXT provider property is set,
//                 this UText owns the string and deletes it on close().
2407 //
2408 //------------------------------------------------------------------------------
2409
2410 U_CDECL_BEGIN
2411
2412
2413 static UText * U_CALLCONV
unistrTextClone(UText * dest,const UText * src,UBool deep,UErrorCode * status)2414 unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
2415 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2416 dest = shallowTextClone(dest, src, status);
2417
2418 // For deep clones, make a copy of the UnicodeSring.
2419 // The copied UnicodeString storage is owned by the newly created UText clone.
2420 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2421 // the UText.
2422 //
2423 if (deep && U_SUCCESS(*status)) {
2424 const UnicodeString *srcString = (const UnicodeString *)src->context;
2425 dest->context = new UnicodeString(*srcString);
2426 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2427
2428 // with deep clone, the copy is writable, even when the source is not.
2429 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2430 }
2431 return dest;
2432 }
2433
2434 static void U_CALLCONV
unistrTextClose(UText * ut)2435 unistrTextClose(UText *ut) {
2436 // Most of the work of close is done by the generic UText framework close.
2437 // All that needs to be done here is delete the UnicodeString if the UText
2438 // owns it. This occurs if the UText was created by cloning.
2439 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2440 UnicodeString *str = (UnicodeString *)ut->context;
2441 delete str;
2442 ut->context = NULL;
2443 }
2444 }
2445
2446
2447 static int64_t U_CALLCONV
unistrTextLength(UText * t)2448 unistrTextLength(UText *t) {
2449 return ((const UnicodeString *)t->context)->length();
2450 }
2451
2452
2453 static UBool U_CALLCONV
unistrTextAccess(UText * ut,int64_t index,UBool forward)2454 unistrTextAccess(UText *ut, int64_t index, UBool forward) {
2455 int32_t length = ut->chunkLength;
2456 ut->chunkOffset = pinIndex(index, length);
2457
2458 // Check whether request is at the start or end
2459 UBool retVal = (forward && index<length) || (!forward && index>0);
2460 return retVal;
2461 }
2462
2463
2464
2465 static int32_t U_CALLCONV
unistrTextExtract(UText * t,int64_t start,int64_t limit,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)2466 unistrTextExtract(UText *t,
2467 int64_t start, int64_t limit,
2468 UChar *dest, int32_t destCapacity,
2469 UErrorCode *pErrorCode) {
2470 const UnicodeString *us=(const UnicodeString *)t->context;
2471 int32_t length=us->length();
2472
2473 if(U_FAILURE(*pErrorCode)) {
2474 return 0;
2475 }
2476 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
2477 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2478 }
2479 if(start<0 || start>limit) {
2480 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2481 return 0;
2482 }
2483
2484 int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
2485 int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
2486
2487 length=limit32-start32;
2488 if (destCapacity>0 && dest!=NULL) {
2489 int32_t trimmedLength = length;
2490 if(trimmedLength>destCapacity) {
2491 trimmedLength=destCapacity;
2492 }
2493 us->extract(start32, trimmedLength, dest);
2494 t->chunkOffset = start32+trimmedLength;
2495 } else {
2496 t->chunkOffset = start32;
2497 }
2498 u_terminateUChars(dest, destCapacity, length, pErrorCode);
2499 return length;
2500 }
2501
2502 static int32_t U_CALLCONV
unistrTextReplace(UText * ut,int64_t start,int64_t limit,const UChar * src,int32_t length,UErrorCode * pErrorCode)2503 unistrTextReplace(UText *ut,
2504 int64_t start, int64_t limit,
2505 const UChar *src, int32_t length,
2506 UErrorCode *pErrorCode) {
2507 UnicodeString *us=(UnicodeString *)ut->context;
2508 int32_t oldLength;
2509
2510 if(U_FAILURE(*pErrorCode)) {
2511 return 0;
2512 }
2513 if(src==NULL && length!=0) {
2514 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2515 }
2516 if(start>limit) {
2517 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2518 return 0;
2519 }
2520 oldLength=us->length();
2521 int32_t start32 = pinIndex(start, oldLength);
2522 int32_t limit32 = pinIndex(limit, oldLength);
2523 if (start32 < oldLength) {
2524 start32 = us->getChar32Start(start32);
2525 }
2526 if (limit32 < oldLength) {
2527 limit32 = us->getChar32Start(limit32);
2528 }
2529
2530 // replace
2531 us->replace(start32, limit32-start32, src, length);
2532 int32_t newLength = us->length();
2533
2534 // Update the chunk description.
2535 ut->chunkContents = us->getBuffer();
2536 ut->chunkLength = newLength;
2537 ut->chunkNativeLimit = newLength;
2538 ut->nativeIndexingLimit = newLength;
2539
2540 // Set iteration position to the point just following the newly inserted text.
2541 int32_t lengthDelta = newLength - oldLength;
2542 ut->chunkOffset = limit32 + lengthDelta;
2543
2544 return lengthDelta;
2545 }
2546
2547 static void U_CALLCONV
unistrTextCopy(UText * ut,int64_t start,int64_t limit,int64_t destIndex,UBool move,UErrorCode * pErrorCode)2548 unistrTextCopy(UText *ut,
2549 int64_t start, int64_t limit,
2550 int64_t destIndex,
2551 UBool move,
2552 UErrorCode *pErrorCode) {
2553 UnicodeString *us=(UnicodeString *)ut->context;
2554 int32_t length=us->length();
2555
2556 if(U_FAILURE(*pErrorCode)) {
2557 return;
2558 }
2559 int32_t start32 = pinIndex(start, length);
2560 int32_t limit32 = pinIndex(limit, length);
2561 int32_t destIndex32 = pinIndex(destIndex, length);
2562
2563 if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
2564 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2565 return;
2566 }
2567
2568 if(move) {
2569 // move: copy to destIndex, then replace original with nothing
2570 int32_t segLength=limit32-start32;
2571 us->copy(start32, limit32, destIndex32);
2572 if(destIndex32<start32) {
2573 start32+=segLength;
2574 }
2575 us->replace(start32, segLength, NULL, 0);
2576 } else {
2577 // copy
2578 us->copy(start32, limit32, destIndex32);
2579 }
2580
2581 // update chunk description, set iteration position.
2582 ut->chunkContents = us->getBuffer();
2583 if (move==FALSE) {
2584 // copy operation, string length grows
2585 ut->chunkLength += limit32-start32;
2586 ut->chunkNativeLimit = ut->chunkLength;
2587 ut->nativeIndexingLimit = ut->chunkLength;
2588 }
2589
2590 // Iteration position to end of the newly inserted text.
2591 ut->chunkOffset = destIndex32+limit32-start32;
2592 if (move && destIndex32>start32) {
2593 ut->chunkOffset = destIndex32;
2594 }
2595
2596 }
2597
// Table of provider functions for UText instances that wrap a UnicodeString.
// Both utext_openUnicodeString() and utext_openConstUnicodeString() store a
// pointer to this table in ut->pFuncs.
// Entries that are NULL are not provided by this implementation.
static const struct UTextFuncs unistrFuncs =
{
    sizeof(UTextFuncs),
    0, 0, 0,             // Reserved alignment padding
    unistrTextClone,
    unistrTextLength,
    unistrTextAccess,
    unistrTextExtract,
    unistrTextReplace,
    unistrTextCopy,
    NULL,                // MapOffsetToNative,
    NULL,                // MapIndexToUTF16,
    unistrTextClose,
    NULL,                // spare 1
    NULL,                // spare 2
    NULL                 // spare 3
};
2615
2616
2617
2618 U_CDECL_END
2619
2620
2621 U_CAPI UText * U_EXPORT2
utext_openUnicodeString(UText * ut,UnicodeString * s,UErrorCode * status)2622 utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
2623 // TODO: use openConstUnicodeString, then add in the differences.
2624 //
2625 ut = utext_setup(ut, 0, status);
2626 if (U_SUCCESS(*status)) {
2627 ut->pFuncs = &unistrFuncs;
2628 ut->context = s;
2629 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
2630 I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2631
2632 ut->chunkContents = s->getBuffer();
2633 ut->chunkLength = s->length();
2634 ut->chunkNativeStart = 0;
2635 ut->chunkNativeLimit = ut->chunkLength;
2636 ut->nativeIndexingLimit = ut->chunkLength;
2637 }
2638 return ut;
2639 }
2640
2641
2642
2643 U_CAPI UText * U_EXPORT2
utext_openConstUnicodeString(UText * ut,const UnicodeString * s,UErrorCode * status)2644 utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
2645 ut = utext_setup(ut, 0, status);
2646 // note: use the standard (writable) function table for UnicodeString.
2647 // The flag settings disable writing, so having the functions in
2648 // the table is harmless.
2649 if (U_SUCCESS(*status)) {
2650 ut->pFuncs = &unistrFuncs;
2651 ut->context = s;
2652 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2653 ut->chunkContents = s->getBuffer();
2654 ut->chunkLength = s->length();
2655 ut->chunkNativeStart = 0;
2656 ut->chunkNativeLimit = ut->chunkLength;
2657 ut->nativeIndexingLimit = ut->chunkLength;
2658 }
2659 return ut;
2660 }
2661
2662 //------------------------------------------------------------------------------
2663 //
2664 // UText implementation for const UChar * strings
2665 //
2666 // Use of UText data members:
//      context    pointer to the UChar string
2668 // a length. -1 if not yet known.
2669 //
2670 // TODO: support 64 bit lengths.
2671 //
2672 //------------------------------------------------------------------------------
2673
2674 U_CDECL_BEGIN
2675
2676
2677 static UText * U_CALLCONV
ucstrTextClone(UText * dest,const UText * src,UBool deep,UErrorCode * status)2678 ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
2679 // First do a generic shallow clone.
2680 dest = shallowTextClone(dest, src, status);
2681
2682 // For deep clones, make a copy of the string.
2683 // The copied storage is owned by the newly created clone.
2684 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2685 // it.
2686 //
2687 if (deep && U_SUCCESS(*status)) {
2688 U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
2689 int32_t len = (int32_t)utext_nativeLength(dest);
2690
2691 // The cloned string IS going to be NUL terminated, whether or not the original was.
2692 const UChar *srcStr = (const UChar *)src->context;
2693 UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
2694 if (copyStr == NULL) {
2695 *status = U_MEMORY_ALLOCATION_ERROR;
2696 } else {
2697 int64_t i;
2698 for (i=0; i<len; i++) {
2699 copyStr[i] = srcStr[i];
2700 }
2701 copyStr[len] = 0;
2702 dest->context = copyStr;
2703 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2704 }
2705 }
2706 return dest;
2707 }
2708
2709
2710 static void U_CALLCONV
ucstrTextClose(UText * ut)2711 ucstrTextClose(UText *ut) {
2712 // Most of the work of close is done by the generic UText framework close.
2713 // All that needs to be done here is delete the string if the UText
2714 // owns it. This occurs if the UText was created by cloning.
2715 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2716 UChar *s = (UChar *)ut->context;
2717 uprv_free(s);
2718 ut->context = NULL;
2719 }
2720 }
2721
2722
2723
2724 static int64_t U_CALLCONV
ucstrTextLength(UText * ut)2725 ucstrTextLength(UText *ut) {
2726 if (ut->a < 0) {
2727 // null terminated, we don't yet know the length. Scan for it.
2728 // Access is not convenient for doing this
2729 // because the current interation postion can't be changed.
2730 const UChar *str = (const UChar *)ut->context;
2731 for (;;) {
2732 if (str[ut->chunkNativeLimit] == 0) {
2733 break;
2734 }
2735 ut->chunkNativeLimit++;
2736 }
2737 ut->a = ut->chunkNativeLimit;
2738 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2739 ut->nativeIndexingLimit = ut->chunkLength;
2740 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2741 }
2742 return ut->a;
2743 }
2744
2745
2746 static UBool U_CALLCONV
ucstrTextAccess(UText * ut,int64_t index,UBool forward)2747 ucstrTextAccess(UText *ut, int64_t index, UBool forward) {
2748 const UChar *str = (const UChar *)ut->context;
2749
2750 // pin the requested index to the bounds of the string,
2751 // and set current iteration position.
2752 if (index<0) {
2753 index = 0;
2754 } else if (index < ut->chunkNativeLimit) {
2755 // The request data is within the chunk as it is known so far.
2756 // Put index on a code point boundary.
2757 U16_SET_CP_START(str, 0, index);
2758 } else if (ut->a >= 0) {
2759 // We know the length of this string, and the user is requesting something
2760 // at or beyond the length. Pin the requested index to the length.
2761 index = ut->a;
2762 } else {
2763 // Null terminated string, length not yet known, and the requested index
2764 // is beyond where we have scanned so far.
2765 // Scan to 32 UChars beyond the requested index. The strategy here is
2766 // to avoid fully scanning a long string when the caller only wants to
2767 // see a few characters at its beginning.
2768 int32_t scanLimit = (int32_t)index + 32;
2769 if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression
2770 scanLimit = INT32_MAX;
2771 }
2772
2773 int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
2774 for (; chunkLimit<scanLimit; chunkLimit++) {
2775 if (str[chunkLimit] == 0) {
2776 // We found the end of the string. Remember it, pin the requested index to it,
2777 // and bail out of here.
2778 ut->a = chunkLimit;
2779 ut->chunkLength = chunkLimit;
2780 ut->nativeIndexingLimit = chunkLimit;
2781 if (index >= chunkLimit) {
2782 index = chunkLimit;
2783 } else {
2784 U16_SET_CP_START(str, 0, index);
2785 }
2786
2787 ut->chunkNativeLimit = chunkLimit;
2788 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2789 goto breakout;
2790 }
2791 }
2792 // We scanned through the next batch of UChars without finding the end.
2793 U16_SET_CP_START(str, 0, index);
2794 if (chunkLimit == INT32_MAX) {
2795 // Scanned to the limit of a 32 bit length.
2796 // Forceably trim the overlength string back so length fits in int32
2797 // TODO: add support for 64 bit strings.
2798 ut->a = chunkLimit;
2799 ut->chunkLength = chunkLimit;
2800 ut->nativeIndexingLimit = chunkLimit;
2801 if (index > chunkLimit) {
2802 index = chunkLimit;
2803 }
2804 ut->chunkNativeLimit = chunkLimit;
2805 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2806 } else {
2807 // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2808 // If the current end is on a lead surrogate, back the end up by one.
2809 // It doesn't matter if the end char happens to be an unpaired surrogate,
2810 // and it's simpler not to worry about it.
2811 if (U16_IS_LEAD(str[chunkLimit-1])) {
2812 --chunkLimit;
2813 }
2814 // Null-terminated chunk with end still unknown.
2815 // Update the chunk length to reflect what has been scanned thus far.
2816 // That the full length is still unknown is (still) flagged by
2817 // ut->a being < 0.
2818 ut->chunkNativeLimit = chunkLimit;
2819 ut->nativeIndexingLimit = chunkLimit;
2820 ut->chunkLength = chunkLimit;
2821 }
2822
2823 }
2824 breakout:
2825 U_ASSERT(index<=INT32_MAX);
2826 ut->chunkOffset = (int32_t)index;
2827
2828 // Check whether request is at the start or end
2829 UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
2830 return retVal;
2831 }
2832
2833
2834
2835 static int32_t U_CALLCONV
ucstrTextExtract(UText * ut,int64_t start,int64_t limit,UChar * dest,int32_t destCapacity,UErrorCode * pErrorCode)2836 ucstrTextExtract(UText *ut,
2837 int64_t start, int64_t limit,
2838 UChar *dest, int32_t destCapacity,
2839 UErrorCode *pErrorCode)
2840 {
2841 if(U_FAILURE(*pErrorCode)) {
2842 return 0;
2843 }
2844 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2845 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2846 return 0;
2847 }
2848
2849 int32_t si, di;
2850
2851 int32_t start32;
2852 int32_t limit32;
2853
2854 // Access the start. Does two things we need:
2855 // Pins 'start' to the length of the string, if it came in out-of-bounds.
2856 // Snaps 'start' to the beginning of a code point.
2857 ucstrTextAccess(ut, start, TRUE);
2858 const UChar *s=ut->chunkContents;
2859 start32 = ut->chunkOffset;
2860
2861 int32_t strLength=(int32_t)ut->a;
2862 if (strLength >= 0) {
2863 limit32 = pinIndex(limit, strLength);
2864 } else {
2865 limit32 = pinIndex(limit, INT32_MAX);
2866 }
2867
2868 di = 0;
2869 for (si=start32; si<limit32; si++) {
2870 if (strLength<0 && s[si]==0) {
2871 // Just hit the end of a null-terminated string.
2872 ut->a = si; // set string length for this UText
2873 ut->chunkNativeLimit = si;
2874 ut->chunkLength = si;
2875 ut->nativeIndexingLimit = si;
2876 strLength = si;
2877 break;
2878 }
2879 if (di<destCapacity) {
2880 // only store if there is space.
2881 dest[di] = s[si];
2882 } else {
2883 if (strLength>=0) {
2884 // We have filled the destination buffer, and the string length is known.
2885 // Cut the loop short. There is no need to scan string termination.
2886 di = limit32 - start32;
2887 si = limit32;
2888 break;
2889 }
2890 }
2891 di++;
2892 }
2893
2894 // If the limit index points to a lead surrogate of a pair,
2895 // add the corresponding trail surrogate to the destination.
2896 if (si>0 && U16_IS_LEAD(s[si-1]) &&
2897 ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
2898 {
2899 if (di<destCapacity) {
2900 // store only if there is space in the output buffer.
2901 dest[di++] = s[si++];
2902 }
2903 }
2904
2905 // Put iteration position at the point just following the extracted text
2906 ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
2907
2908 // Add a terminating NUL if space in the buffer permits,
2909 // and set the error status as required.
2910 u_terminateUChars(dest, destCapacity, di, pErrorCode);
2911 return di;
2912 }
2913
// Table of provider functions for UText instances that wrap a const UChar *
// string.  utext_openUChars() stores a pointer to this table in ut->pFuncs.
// Entries that are NULL are not provided by this implementation; in
// particular there are no replace and copy functions.
static const struct UTextFuncs ucstrFuncs =
{
    sizeof(UTextFuncs),
    0, 0, 0,           // Reserved alignment padding
    ucstrTextClone,
    ucstrTextLength,
    ucstrTextAccess,
    ucstrTextExtract,
    NULL,              // Replace
    NULL,              // Copy
    NULL,              // MapOffsetToNative,
    NULL,              // MapIndexToUTF16,
    ucstrTextClose,
    NULL,              // spare 1
    NULL,              // spare 2
    NULL,              // spare 3
};
2931
2932 U_CDECL_END
2933
// Empty, NUL-terminated string that utext_openUChars() substitutes when it is
// called with s==NULL and length==0.
static const UChar gEmptyUString[] = {0};
2935
2936 U_CAPI UText * U_EXPORT2
utext_openUChars(UText * ut,const UChar * s,int64_t length,UErrorCode * status)2937 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
2938 if (U_FAILURE(*status)) {
2939 return NULL;
2940 }
2941 if(s==NULL && length==0) {
2942 s = gEmptyUString;
2943 }
2944 if (s==NULL || length < -1 || length>INT32_MAX) {
2945 *status = U_ILLEGAL_ARGUMENT_ERROR;
2946 return NULL;
2947 }
2948 ut = utext_setup(ut, 0, status);
2949 if (U_SUCCESS(*status)) {
2950 ut->pFuncs = &ucstrFuncs;
2951 ut->context = s;
2952 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2953 if (length==-1) {
2954 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2955 }
2956 ut->a = length;
2957 ut->chunkContents = s;
2958 ut->chunkNativeStart = 0;
2959 ut->chunkNativeLimit = length>=0? length : 0;
2960 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2961 ut->chunkOffset = 0;
2962 ut->nativeIndexingLimit = ut->chunkLength;
2963 }
2964 return ut;
2965 }
2966
2967
2968 //------------------------------------------------------------------------------
2969 //
2970 // UText implementation for text from ICU CharacterIterators
2971 //
2972 // Use of UText data members:
2973 // context pointer to the CharacterIterator
2974 // a length of the full text.
2975 // p pointer to buffer 1
2976 // b start index of local buffer 1 contents
2977 // q pointer to buffer 2
2978 // c start index of local buffer 2 contents
2979 // r pointer to the character iterator if the UText owns it.
2980 // Null otherwise.
2981 //
2982 //------------------------------------------------------------------------------
// Number of UChars loaded from the CharacterIterator into a local buffer
// (buffer p or q) at a time; chunks start at native indexes that are
// multiples of this size.
#define CIBufSize 16
2984
2985 U_CDECL_BEGIN
2986 static void U_CALLCONV
charIterTextClose(UText * ut)2987 charIterTextClose(UText *ut) {
2988 // Most of the work of close is done by the generic UText framework close.
2989 // All that needs to be done here is delete the CharacterIterator if the UText
2990 // owns it. This occurs if the UText was created by cloning.
2991 CharacterIterator *ci = (CharacterIterator *)ut->r;
2992 delete ci;
2993 ut->r = NULL;
2994 }
2995
2996 static int64_t U_CALLCONV
charIterTextLength(UText * ut)2997 charIterTextLength(UText *ut) {
2998 return (int32_t)ut->a;
2999 }
3000
3001 static UBool U_CALLCONV
charIterTextAccess(UText * ut,int64_t index,UBool forward)3002 charIterTextAccess(UText *ut, int64_t index, UBool forward) {
3003 CharacterIterator *ci = (CharacterIterator *)ut->context;
3004
3005 int32_t clippedIndex = (int32_t)index;
3006 if (clippedIndex<0) {
3007 clippedIndex=0;
3008 } else if (clippedIndex>=ut->a) {
3009 clippedIndex=(int32_t)ut->a;
3010 }
3011 int32_t neededIndex = clippedIndex;
3012 if (!forward && neededIndex>0) {
3013 // reverse iteration, want the position just before what was asked for.
3014 neededIndex--;
3015 } else if (forward && neededIndex==ut->a && neededIndex>0) {
3016 // Forward iteration, don't ask for something past the end of the text.
3017 neededIndex--;
3018 }
3019
3020 // Find the native index of the start of the buffer containing what we want.
3021 neededIndex -= neededIndex % CIBufSize;
3022
3023 UChar *buf = NULL;
3024 UBool needChunkSetup = TRUE;
3025 int i;
3026 if (ut->chunkNativeStart == neededIndex) {
3027 // The buffer we want is already the current chunk.
3028 needChunkSetup = FALSE;
3029 } else if (ut->b == neededIndex) {
3030 // The first buffer (buffer p) has what we need.
3031 buf = (UChar *)ut->p;
3032 } else if (ut->c == neededIndex) {
3033 // The second buffer (buffer q) has what we need.
3034 buf = (UChar *)ut->q;
3035 } else {
3036 // Neither buffer already has what we need.
3037 // Load new data from the character iterator.
3038 // Use the buf that is not the current buffer.
3039 buf = (UChar *)ut->p;
3040 if (ut->p == ut->chunkContents) {
3041 buf = (UChar *)ut->q;
3042 }
3043 ci->setIndex(neededIndex);
3044 for (i=0; i<CIBufSize; i++) {
3045 buf[i] = ci->nextPostInc();
3046 if (i+neededIndex > ut->a) {
3047 break;
3048 }
3049 }
3050 }
3051
3052 // We have a buffer with the data we need.
3053 // Set it up as the current chunk, if it wasn't already.
3054 if (needChunkSetup) {
3055 ut->chunkContents = buf;
3056 ut->chunkLength = CIBufSize;
3057 ut->chunkNativeStart = neededIndex;
3058 ut->chunkNativeLimit = neededIndex + CIBufSize;
3059 if (ut->chunkNativeLimit > ut->a) {
3060 ut->chunkNativeLimit = ut->a;
3061 ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
3062 }
3063 ut->nativeIndexingLimit = ut->chunkLength;
3064 U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
3065 }
3066 ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
3067 UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
3068 return success;
3069 }
3070
3071 static UText * U_CALLCONV
charIterTextClone(UText * dest,const UText * src,UBool deep,UErrorCode * status)3072 charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
3073 if (U_FAILURE(*status)) {
3074 return NULL;
3075 }
3076
3077 if (deep) {
3078 // There is no CharacterIterator API for cloning the underlying text storage.
3079 *status = U_UNSUPPORTED_ERROR;
3080 return NULL;
3081 } else {
3082 CharacterIterator *srcCI =(CharacterIterator *)src->context;
3083 srcCI = srcCI->clone();
3084 dest = utext_openCharacterIterator(dest, srcCI, status);
3085 // cast off const on getNativeIndex.
3086 // For CharacterIterator based UTexts, this is safe, the operation is const.
3087 int64_t ix = utext_getNativeIndex((UText *)src);
3088 utext_setNativeIndex(dest, ix);
3089 dest->r = srcCI; // flags that this UText owns the CharacterIterator
3090 }
3091 return dest;
3092 }
3093
3094 static int32_t U_CALLCONV
charIterTextExtract(UText * ut,int64_t start,int64_t limit,UChar * dest,int32_t destCapacity,UErrorCode * status)3095 charIterTextExtract(UText *ut,
3096 int64_t start, int64_t limit,
3097 UChar *dest, int32_t destCapacity,
3098 UErrorCode *status)
3099 {
3100 if(U_FAILURE(*status)) {
3101 return 0;
3102 }
3103 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
3104 *status=U_ILLEGAL_ARGUMENT_ERROR;
3105 return 0;
3106 }
3107 int32_t length = (int32_t)ut->a;
3108 int32_t start32 = pinIndex(start, length);
3109 int32_t limit32 = pinIndex(limit, length);
3110 int32_t desti = 0;
3111 int32_t srci;
3112 int32_t copyLimit;
3113
3114 CharacterIterator *ci = (CharacterIterator *)ut->context;
3115 ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
3116 srci = ci->getIndex();
3117 copyLimit = srci;
3118 while (srci<limit32) {
3119 UChar32 c = ci->next32PostInc();
3120 int32_t len = U16_LENGTH(c);
3121 if (desti+len <= destCapacity) {
3122 U16_APPEND_UNSAFE(dest, desti, c);
3123 copyLimit = srci+len;
3124 } else {
3125 desti += len;
3126 *status = U_BUFFER_OVERFLOW_ERROR;
3127 }
3128 srci += len;
3129 }
3130
3131 charIterTextAccess(ut, copyLimit, TRUE);
3132
3133 u_terminateUChars(dest, destCapacity, desti, status);
3134 return desti;
3135 }
3136
3137 static const struct UTextFuncs charIterFuncs =
3138 {
3139 sizeof(UTextFuncs),
3140 0, 0, 0, // Reserved alignment padding
3141 charIterTextClone,
3142 charIterTextLength,
3143 charIterTextAccess,
3144 charIterTextExtract,
3145 NULL, // Replace
3146 NULL, // Copy
3147 NULL, // MapOffsetToNative,
3148 NULL, // MapIndexToUTF16,
3149 charIterTextClose,
3150 NULL, // spare 1
3151 NULL, // spare 2
3152 NULL // spare 3
3153 };
3154 U_CDECL_END
3155
3156
3157 U_CAPI UText * U_EXPORT2
utext_openCharacterIterator(UText * ut,CharacterIterator * ci,UErrorCode * status)3158 utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
3159 if (U_FAILURE(*status)) {
3160 return NULL;
3161 }
3162
3163 if (ci->startIndex() > 0) {
3164 // No support for CharacterIterators that do not start indexing from zero.
3165 *status = U_UNSUPPORTED_ERROR;
3166 return NULL;
3167 }
3168
3169 // Extra space in UText for 2 buffers of CIBufSize UChars each.
3170 int32_t extraSpace = 2 * CIBufSize * sizeof(UChar);
3171 ut = utext_setup(ut, extraSpace, status);
3172 if (U_SUCCESS(*status)) {
3173 ut->pFuncs = &charIterFuncs;
3174 ut->context = ci;
3175 ut->providerProperties = 0;
3176 ut->a = ci->endIndex(); // Length of text
3177 ut->p = ut->pExtra; // First buffer
3178 ut->b = -1; // Native index of first buffer contents
3179 ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer
3180 ut->c = -1; // Native index of second buffer contents
3181
3182 // Initialize current chunk contents to be empty.
3183 // First access will fault something in.
3184 // Note: The initial nativeStart and chunkOffset must sum to zero
3185 // so that getNativeIndex() will correctly compute to zero
3186 // if no call to Access() has ever been made. They can't be both
3187 // zero without Access() thinking that the chunk is valid.
3188 ut->chunkContents = (UChar *)ut->p;
3189 ut->chunkNativeStart = -1;
3190 ut->chunkOffset = 1;
3191 ut->chunkNativeLimit = 0;
3192 ut->chunkLength = 0;
3193 ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing
3194 }
3195 return ut;
3196 }
3197
3198
3199
3200