• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2005-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Tests for the UText and UTextIterator text abstraction classes
10 *
11 ************************************************************************/
12 
13 #include <string.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include "unicode/utypes.h"
17 #include "unicode/utext.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uchriter.h"
22 #include "cmemory.h"
23 #include "cstr.h"
24 #include "utxttest.h"
25 
26 static UBool  gFailed = false;
27 static int    gTestNum = 0;
28 
29 // Forward decl
30 UText *openFragmentedUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status);
31 
//
//  TEST_ASSERT(x)   Report a test failure when the condition x is false.
//                   The failure message (test number, file, line) goes through errln(),
//                   and gFailed is set. Execution continues after the macro; callers
//                   that want to stop early must check gFailed themselves.
//
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if ((x)==false) { \
        errln("Test #%d failure in file %s at line %d\n", gTestNum, __FILE__, __LINE__); \
        gFailed = true; \
    } \
} UPRV_BLOCK_MACRO_END
38 
39 
//
//  TEST_SUCCESS(status)   Report a test failure when status holds an ICU error code
//                         (U_FAILURE(status) is true). Warnings do not count as failures.
//                         The message includes the error name from u_errorName().
//                         gFailed is set; execution continues after the macro.
//
#define TEST_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("Test #%d failure in file %s at line %d. Error = \"%s\"\n", \
              gTestNum, __FILE__, __LINE__, u_errorName(status)); \
        gFailed = true; \
    } \
} UPRV_BLOCK_MACRO_END
47 
UTextTest()48 UTextTest::UTextTest() {
49 }
50 
~UTextTest()51 UTextTest::~UTextTest() {
52 }
53 
54 
55 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)56 UTextTest::runIndexedTest(int32_t index, UBool exec,
57                           const char* &name, char* /*par*/) {
58     TESTCASE_AUTO_BEGIN;
59     TESTCASE_AUTO(TextTest);
60     TESTCASE_AUTO(ErrorTest);
61     TESTCASE_AUTO(FreezeTest);
62     TESTCASE_AUTO(Ticket5560);
63     TESTCASE_AUTO(Ticket6847);
64     TESTCASE_AUTO(Ticket10562);
65     TESTCASE_AUTO(Ticket10983);
66     TESTCASE_AUTO(Ticket12130);
67     TESTCASE_AUTO(Ticket13344);
68     TESTCASE_AUTO(AccessChangesChunkSize);
69     TESTCASE_AUTO_END;
70 }
71 
//
// Quick and dirty random number generator.
//   (Don't use the library generator, so that results are portable.)
//   Each call advances m_seed with a linear congruential step and returns
//   a value in the range 0..32767. The sequence is fully determined by the
//   initial seed of 1, so every run of the tests sees the same values.
//
static uint32_t m_seed = 1;
static uint32_t m_rand()
{
    // Unsigned 32-bit arithmetic: wrap-around on overflow is well defined and intended.
    m_seed = m_seed * 1103515245 + 12345;
    return static_cast<uint32_t>(m_seed/65536) % 32768;
}
81 
82 
83 //
84 //   TextTest()
85 //
86 //       Top Level function for UText testing.
87 //       Specifies the strings to be tested, with the actual testing itself
88 //       being carried out in another function, TestString().
89 //
TextTest()90 void  UTextTest::TextTest() {
91     int32_t i, j;
92 
93     TestString("abcd\\U00010001xyz");
94     TestString("");
95 
96     // Supplementary chars at start or end
97     TestString("\\U00010001");
98     TestString("abc\\U00010001");
99     TestString("\\U00010001abc");
100 
101     // Test simple strings of lengths 1 to 60, looking for glitches at buffer boundaries
102     UnicodeString s;
103     for (i=1; i<60; i++) {
104         s.truncate(0);
105         for (j=0; j<i; j++) {
106             if (j+0x30 == 0x5c) {
107                 // backslash.  Needs to be escaped
108                 s.append((char16_t)0x5c);
109             }
110             s.append(char16_t(j+0x30));
111         }
112         TestString(s);
113     }
114 
115    // Test strings with odd-aligned supplementary chars,
116    //    looking for glitches at buffer boundaries
117     for (i=1; i<60; i++) {
118         s.truncate(0);
119         s.append((char16_t)0x41);
120         for (j=0; j<i; j++) {
121             s.append(UChar32(j+0x11000));
122         }
123         TestString(s);
124     }
125 
126     // String of chars of randomly varying size in utf-8 representation.
127     //   Exercise the mapping, and the varying sized buffer.
128     //
129     s.truncate(0);
130     UChar32  c1 = 0;
131     UChar32  c2 = 0x100;
132     UChar32  c3 = 0xa000;
133     UChar32  c4 = 0x11000;
134     for (i=0; i<1000; i++) {
135         int len8 = m_rand()%4 + 1;
136         switch (len8) {
137             case 1:
138                 c1 = (c1+1)%0x80;
139                 // don't put 0 into string (0 terminated strings for some tests)
140                 // don't put '\', will cause unescape() to fail.
141                 if (c1==0x5c || c1==0) {
142                     c1++;
143                 }
144                 s.append(c1);
145                 break;
146             case 2:
147                 s.append(c2++);
148                 break;
149             case 3:
150                 s.append(c3++);
151                 break;
152             case 4:
153                 s.append(c4++);
154                 break;
155         }
156     }
157     TestString(s);
158 }
159 
160 
161 //
162 //  TestString()     Run a suite of UText tests on a string.
163 //                   The test string is unescaped before use.
164 //
TestString(const UnicodeString & s)165 void UTextTest::TestString(const UnicodeString &s) {
166     int32_t       i;
167     int32_t       j;
168     UChar32       c;
169     int32_t       cpCount = 0;
170     UErrorCode    status  = U_ZERO_ERROR;
171     UText        *ut      = nullptr;
172     int32_t       saLen;
173 
174     UnicodeString sa = s.unescape();
175     saLen = sa.length();
176 
177     //
178     // Build up a mapping between code points and UTF-16 code unit indexes.
179     //
180     m *cpMap = new m[sa.length() + 1];
181     j = 0;
182     for (i=0; i<sa.length(); i=sa.moveIndex32(i, 1)) {
183         c = sa.char32At(i);
184         cpMap[j].nativeIdx = i;
185         cpMap[j].cp = c;
186         j++;
187         cpCount++;
188     }
189     cpMap[j].nativeIdx = i;   // position following the last char in utf-16 string.
190 
191 
192     // char16_t * test, null terminated
193     status = U_ZERO_ERROR;
194     char16_t *buf = new char16_t[saLen+1];
195     sa.extract(buf, saLen+1, status);
196     TEST_SUCCESS(status);
197     ut = utext_openUChars(nullptr, buf, -1, &status);
198     TEST_SUCCESS(status);
199     TestAccess(sa, ut, cpCount, cpMap);
200     utext_close(ut);
201     delete [] buf;
202 
203     // char16_t * test, with length
204     status = U_ZERO_ERROR;
205     buf = new char16_t[saLen+1];
206     sa.extract(buf, saLen+1, status);
207     TEST_SUCCESS(status);
208     ut = utext_openUChars(nullptr, buf, saLen, &status);
209     TEST_SUCCESS(status);
210     TestAccess(sa, ut, cpCount, cpMap);
211     utext_close(ut);
212     delete [] buf;
213 
214 
215     // UnicodeString test
216     status = U_ZERO_ERROR;
217     ut = utext_openUnicodeString(nullptr, &sa, &status);
218     TEST_SUCCESS(status);
219     TestAccess(sa, ut, cpCount, cpMap);
220     TestCMR(sa, ut, cpCount, cpMap, cpMap);
221     utext_close(ut);
222 
223 
224     // Const UnicodeString test
225     status = U_ZERO_ERROR;
226     ut = utext_openConstUnicodeString(nullptr, &sa, &status);
227     TEST_SUCCESS(status);
228     TestAccess(sa, ut, cpCount, cpMap);
229     utext_close(ut);
230 
231 
232     // Replaceable test.  (UnicodeString inherits Replaceable)
233     status = U_ZERO_ERROR;
234     ut = utext_openReplaceable(nullptr, &sa, &status);
235     TEST_SUCCESS(status);
236     TestAccess(sa, ut, cpCount, cpMap);
237     TestCMR(sa, ut, cpCount, cpMap, cpMap);
238     utext_close(ut);
239 
240     // Character Iterator Tests
241     status = U_ZERO_ERROR;
242     const char16_t *cbuf = sa.getBuffer();
243     CharacterIterator *ci = new UCharCharacterIterator(cbuf, saLen, status);
244     TEST_SUCCESS(status);
245     ut = utext_openCharacterIterator(nullptr, ci, &status);
246     TEST_SUCCESS(status);
247     TestAccess(sa, ut, cpCount, cpMap);
248     utext_close(ut);
249     delete ci;
250 
251 
252     // Fragmented UnicodeString  (Chunk size of one)
253     //
254     status = U_ZERO_ERROR;
255     ut = openFragmentedUnicodeString(nullptr, &sa, &status);
256     TEST_SUCCESS(status);
257     TestAccess(sa, ut, cpCount, cpMap);
258     utext_close(ut);
259 
260     //
261     // UTF-8 test
262     //
263 
264     // Convert the test string from UnicodeString to (char *) in utf-8 format
265     int32_t u8Len = sa.extract(0, sa.length(), nullptr, 0, "utf-8");
266     char *u8String = new char[u8Len + 1];
267     sa.extract(0, sa.length(), u8String, u8Len+1, "utf-8");
268 
269     // Build up the map of code point indices in the utf-8 string
270     m * u8Map = new m[sa.length() + 1];
271     i = 0;   // native utf-8 index
272     for (j=0; j<cpCount ; j++) {  // code point number
273         u8Map[j].nativeIdx = i;
274         U8_NEXT(u8String, i, u8Len, c);
275         u8Map[j].cp = c;
276     }
277     u8Map[cpCount].nativeIdx = u8Len;   // position following the last char in utf-8 string.
278 
279     // Do the test itself
280     status = U_ZERO_ERROR;
281     ut = utext_openUTF8(nullptr, u8String, -1, &status);
282     TEST_SUCCESS(status);
283     TestAccess(sa, ut, cpCount, u8Map);
284     utext_close(ut);
285 
286 
287 
288     delete []cpMap;
289     delete []u8Map;
290     delete []u8String;
291 }
292 
293 //  TestCMR   test Copy, Move and Replace operations.
294 //              us         UnicodeString containing the test text.
295 //              ut         UText containing the same test text.
296 //              cpCount    number of code points in the test text.
297 //              nativeMap  Mapping from code points to native indexes for the UText.
298 //              u16Map     Mapping from code points to UTF-16 indexes, for use with the UnicodeString.
299 //
300 //     This function runs a whole series of operations on each incoming UText.
301 //     The UText is deep-cloned prior to each operation, so that the original UText remains unchanged.
302 //
TestCMR(const UnicodeString & us,UText * ut,int cpCount,m * nativeMap,m * u16Map)303 void UTextTest::TestCMR(const UnicodeString &us, UText *ut, int cpCount, m *nativeMap, m *u16Map) {
304     TEST_ASSERT(utext_isWritable(ut) == true);
305 
306     int  srcLengthType;       // Loop variables for selecting the position and length
307     int  srcPosType;          //   of the block to operate on within the source text.
308     int  destPosType;
309 
310     int  srcIndex  = 0;       // Code Point indexes of the block to operate on for
311     int  srcLength = 0;       //   a specific test.
312 
313     int  destIndex = 0;       // Code point index of the destination for a copy/move test.
314 
315     int32_t  nativeStart = 0; // Native unit indexes for a test.
316     int32_t  nativeLimit = 0;
317     int32_t  nativeDest  = 0;
318 
319     int32_t  u16Start    = 0; // UTF-16 indexes for a test.
320     int32_t  u16Limit    = 0; //   used when performing the same operation in a Unicode String
321     int32_t  u16Dest     = 0;
322 
323     // Iterate over a whole series of source index, length and a target indexes.
324     // This is done with code point indexes; these will be later translated to native
325     //   indexes using the cpMap.
326     for (srcLengthType=1; srcLengthType<=3; srcLengthType++) {
327         switch (srcLengthType) {
328             case 1: srcLength = 1; break;
329             case 2: srcLength = 5; break;
330             case 3: srcLength = cpCount / 3;
331         }
332         for (srcPosType=1; srcPosType<=5; srcPosType++) {
333             switch (srcPosType) {
334                 case 1: srcIndex = 0; break;
335                 case 2: srcIndex = 1; break;
336                 case 3: srcIndex = cpCount - srcLength; break;
337                 case 4: srcIndex = cpCount - srcLength - 1; break;
338                 case 5: srcIndex = cpCount / 2; break;
339             }
340             if (srcIndex < 0 || srcIndex + srcLength > cpCount) {
341                 // filter out bogus test cases -
342                 //   those with a source range that falls of an edge of the string.
343                 continue;
344             }
345 
346             //
347             // Copy and move tests.
348             //   iterate over a variety of destination positions.
349             //
350             for (destPosType=1; destPosType<=4; destPosType++) {
351                 switch (destPosType) {
352                     case 1: destIndex = 0; break;
353                     case 2: destIndex = 1; break;
354                     case 3: destIndex = srcIndex - 1; break;
355                     case 4: destIndex = srcIndex + srcLength + 1; break;
356                     case 5: destIndex = cpCount-1; break;
357                     case 6: destIndex = cpCount; break;
358                 }
359                 if (destIndex<0 || destIndex>cpCount) {
360                     // filter out bogus test cases.
361                     continue;
362                 }
363 
364                 nativeStart = nativeMap[srcIndex].nativeIdx;
365                 nativeLimit = nativeMap[srcIndex+srcLength].nativeIdx;
366                 nativeDest  = nativeMap[destIndex].nativeIdx;
367 
368                 u16Start    = u16Map[srcIndex].nativeIdx;
369                 u16Limit    = u16Map[srcIndex+srcLength].nativeIdx;
370                 u16Dest     = u16Map[destIndex].nativeIdx;
371 
372                 gFailed = false;
373                 TestCopyMove(us, ut, false,
374                     nativeStart, nativeLimit, nativeDest,
375                     u16Start, u16Limit, u16Dest);
376 
377                 TestCopyMove(us, ut, true,
378                     nativeStart, nativeLimit, nativeDest,
379                     u16Start, u16Limit, u16Dest);
380 
381                 if (gFailed) {
382                     return;
383                 }
384             }
385 
386             //
387             //  Replace tests.
388             //
389             UnicodeString fullRepString("This is an arbitrary string that will be used as replacement text");
390             for (int32_t replStrLen=0; replStrLen<20; replStrLen++) {
391                 UnicodeString repStr(fullRepString, 0, replStrLen);
392                 TestReplace(us, ut,
393                     nativeStart, nativeLimit,
394                     u16Start, u16Limit,
395                     repStr);
396                 if (gFailed) {
397                     return;
398                 }
399             }
400 
401         }
402     }
403 
404 }
405 
406 //
407 //   TestCopyMove    run a single test case for utext_copy.
408 //                   Test cases are created in TestCMR and dispatched here for execution.
409 //
TestCopyMove(const UnicodeString & us,UText * ut,UBool move,int32_t nativeStart,int32_t nativeLimit,int32_t nativeDest,int32_t u16Start,int32_t u16Limit,int32_t u16Dest)410 void UTextTest::TestCopyMove(const UnicodeString &us, UText *ut, UBool move,
411                     int32_t nativeStart, int32_t nativeLimit, int32_t nativeDest,
412                     int32_t u16Start, int32_t u16Limit, int32_t u16Dest)
413 {
414     UErrorCode      status   = U_ZERO_ERROR;
415     UText          *targetUT = nullptr;
416     gTestNum++;
417     gFailed = false;
418 
419     //
420     //  clone the UText.  The test will be run in the cloned copy
421     //  so that we don't alter the original.
422     //
423     targetUT = utext_clone(nullptr, ut, true, false, &status);
424     TEST_SUCCESS(status);
425     UnicodeString targetUS(us);    // And copy the reference string.
426 
427     // do the test operation first in the reference
428     targetUS.copy(u16Start, u16Limit, u16Dest);
429     if (move) {
430         // delete out the source range.
431         if (u16Limit < u16Dest) {
432             targetUS.removeBetween(u16Start, u16Limit);
433         } else {
434             int32_t amtCopied = u16Limit - u16Start;
435             targetUS.removeBetween(u16Start+amtCopied, u16Limit+amtCopied);
436         }
437     }
438 
439     // Do the same operation in the UText under test
440     utext_copy(targetUT, nativeStart, nativeLimit, nativeDest, move, &status);
441     if (nativeDest > nativeStart && nativeDest < nativeLimit) {
442         TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
443     } else {
444         TEST_SUCCESS(status);
445 
446         // Compare the results of the two parallel tests
447         int32_t  usi = 0;    // UnicodeString position, utf-16 index.
448         int64_t  uti = 0;    // UText position, native index.
449         UChar32  usc;        // code point from Unicode String
450         UChar32  utc;        // code point from UText
451         utext_setNativeIndex(targetUT, 0);
452         for (;;) {
453             usc = targetUS.char32At(usi);
454             utc = utext_next32(targetUT);
455             if (utc < 0) {
456                 break;
457             }
458             TEST_ASSERT(uti == usi);
459             TEST_ASSERT(utc == usc);
460             usi = targetUS.moveIndex32(usi, 1);
461             uti = utext_getNativeIndex(targetUT);
462             if (gFailed) {
463                 goto cleanupAndReturn;
464             }
465         }
466         int64_t expectedNativeLength = utext_nativeLength(ut);
467         if (move == false) {
468             expectedNativeLength += nativeLimit - nativeStart;
469         }
470         uti = utext_getNativeIndex(targetUT);
471         TEST_ASSERT(uti == expectedNativeLength);
472     }
473 
474 cleanupAndReturn:
475     utext_close(targetUT);
476 }
477 
478 
479 //
480 //  TestReplace   Test a single Replace operation.
481 //
TestReplace(const UnicodeString & us,UText * ut,int32_t nativeStart,int32_t nativeLimit,int32_t u16Start,int32_t u16Limit,const UnicodeString & repStr)482 void UTextTest::TestReplace(
483             const UnicodeString &us,     // reference UnicodeString in which to do the replace
484             UText         *ut,                // UnicodeText object under test.
485             int32_t       nativeStart,        // Range to be replaced, in UText native units.
486             int32_t       nativeLimit,
487             int32_t       u16Start,           // Range to be replaced, in UTF-16 units
488             int32_t       u16Limit,           //    for use in the reference UnicodeString.
489             const UnicodeString &repStr)      // The replacement string
490 {
491     UErrorCode      status   = U_ZERO_ERROR;
492     UText          *targetUT = nullptr;
493     gTestNum++;
494     gFailed = false;
495 
496     //
497     //  clone the target UText.  The test will be run in the cloned copy
498     //  so that we don't alter the original.
499     //
500     targetUT = utext_clone(nullptr, ut, true, false, &status);
501     TEST_SUCCESS(status);
502     UnicodeString targetUS(us);    // And copy the reference string.
503 
504     //
505     // Do the replace operation in the Unicode String, to
506     //   produce a reference result.
507     //
508     targetUS.replace(u16Start, u16Limit-u16Start, repStr);
509 
510     //
511     // Do the replace on the UText under test
512     //
513     const char16_t *rs = repStr.getBuffer();
514     int32_t  rsLen = repStr.length();
515     int32_t actualDelta = utext_replace(targetUT, nativeStart, nativeLimit, rs, rsLen, &status);
516     int32_t expectedDelta = repStr.length() - (nativeLimit - nativeStart);
517     TEST_ASSERT(actualDelta == expectedDelta);
518 
519     //
520     // Compare the results
521     //
522     int32_t  usi = 0;    // UnicodeString position, utf-16 index.
523     int64_t  uti = 0;    // UText position, native index.
524     UChar32  usc;        // code point from Unicode String
525     UChar32  utc;        // code point from UText
526     int64_t  expectedNativeLength = 0;
527     utext_setNativeIndex(targetUT, 0);
528     for (;;) {
529         usc = targetUS.char32At(usi);
530         utc = utext_next32(targetUT);
531         if (utc < 0) {
532             break;
533         }
534         TEST_ASSERT(uti == usi);
535         TEST_ASSERT(utc == usc);
536         usi = targetUS.moveIndex32(usi, 1);
537         uti = utext_getNativeIndex(targetUT);
538         if (gFailed) {
539             goto cleanupAndReturn;
540         }
541     }
542     expectedNativeLength = utext_nativeLength(ut) + expectedDelta;
543     uti = utext_getNativeIndex(targetUT);
544     TEST_ASSERT(uti == expectedNativeLength);
545 
546 cleanupAndReturn:
547     utext_close(targetUT);
548 }
549 
550 //
551 //  TestAccess      Test the read only access functions on a UText, including cloning.
552 //                  The text is accessed in a variety of ways, and compared with
553 //                  the reference UnicodeString.
554 //
TestAccess(const UnicodeString & us,UText * ut,int cpCount,m * cpMap)555 void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
556     // Run the standard tests on the caller-supplied UText.
557     TestAccessNoClone(us, ut, cpCount, cpMap);
558 
559     // Re-run tests on a shallow clone.
560     utext_setNativeIndex(ut, 0);
561     UErrorCode status = U_ZERO_ERROR;
562     UText *shallowClone = utext_clone(nullptr, ut, false /*deep*/, false /*readOnly*/, &status);
563     TEST_SUCCESS(status);
564     TestAccessNoClone(us, shallowClone, cpCount, cpMap);
565 
566     //
567     // Rerun again on a deep clone.
568     // Note that text providers are not required to provide deep cloning,
569     //   so unsupported errors are ignored.
570     //
571     status = U_ZERO_ERROR;
572     utext_setNativeIndex(shallowClone, 0);
573     UText *deepClone = utext_clone(nullptr, shallowClone, true, false, &status);
574     utext_close(shallowClone);
575     if (status != U_UNSUPPORTED_ERROR) {
576         TEST_SUCCESS(status);
577         TestAccessNoClone(us, deepClone, cpCount, cpMap);
578     }
579     utext_close(deepClone);
580 }
581 
582 
583 //
584 //  TestAccessNoClone()    Test the read only access functions on a UText.
585 //                         The text is accessed in a variety of ways, and compared with
586 //                         the reference UnicodeString.
587 //
TestAccessNoClone(const UnicodeString & us,UText * ut,int cpCount,m * cpMap)588 void UTextTest::TestAccessNoClone(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
589     UErrorCode  status = U_ZERO_ERROR;
590     gTestNum++;
591 
592     //
593     //  Check the length from the UText
594     //
595     int64_t expectedLen = cpMap[cpCount].nativeIdx;
596     int64_t utlen = utext_nativeLength(ut);
597     TEST_ASSERT(expectedLen == utlen);
598 
599     //
600     //  Iterate forwards, verify that we get the correct code points
601     //   at the correct native offsets.
602     //
603     int         i = 0;
604     int64_t     index;
605     int64_t     expectedIndex = 0;
606     int64_t     foundIndex = 0;
607     UChar32     expectedC;
608     UChar32     foundC;
609     int64_t     len;
610 
611     for (i=0; i<cpCount; i++) {
612         expectedIndex = cpMap[i].nativeIdx;
613         foundIndex    = utext_getNativeIndex(ut);
614         TEST_ASSERT(expectedIndex == foundIndex);
615         expectedC     = cpMap[i].cp;
616         foundC        = utext_next32(ut);
617         TEST_ASSERT(expectedC == foundC);
618         foundIndex    = utext_getPreviousNativeIndex(ut);
619         TEST_ASSERT(expectedIndex == foundIndex);
620         if (gFailed) {
621             return;
622         }
623     }
624     foundC = utext_next32(ut);
625     TEST_ASSERT(foundC == U_SENTINEL);
626 
627     // Repeat above, using macros
628     utext_setNativeIndex(ut, 0);
629     for (i=0; i<cpCount; i++) {
630         expectedIndex = cpMap[i].nativeIdx;
631         foundIndex    = UTEXT_GETNATIVEINDEX(ut);
632         TEST_ASSERT(expectedIndex == foundIndex);
633         expectedC     = cpMap[i].cp;
634         foundC        = UTEXT_NEXT32(ut);
635         TEST_ASSERT(expectedC == foundC);
636         if (gFailed) {
637             return;
638         }
639     }
640     foundC = UTEXT_NEXT32(ut);
641     TEST_ASSERT(foundC == U_SENTINEL);
642 
643     //
644     //  Forward iteration (above) should have left index at the
645     //   end of the input, which should == length().
646     //
647     len = utext_nativeLength(ut);
648     foundIndex  = utext_getNativeIndex(ut);
649     TEST_ASSERT(len == foundIndex);
650 
651     //
652     // Iterate backwards over entire test string
653     //
654     len = utext_getNativeIndex(ut);
655     utext_setNativeIndex(ut, len);
656     for (i=cpCount-1; i>=0; i--) {
657         expectedC     = cpMap[i].cp;
658         expectedIndex = cpMap[i].nativeIdx;
659         int64_t prevIndex = utext_getPreviousNativeIndex(ut);
660         foundC        = utext_previous32(ut);
661         foundIndex    = utext_getNativeIndex(ut);
662         TEST_ASSERT(expectedIndex == foundIndex);
663         TEST_ASSERT(expectedC == foundC);
664         TEST_ASSERT(prevIndex == foundIndex);
665         if (gFailed) {
666             return;
667         }
668     }
669 
670     //
671     //  Backwards iteration, above, should have left our iterator
672     //   position at zero, and continued backwards iterationshould fail.
673     //
674     foundIndex = utext_getNativeIndex(ut);
675     TEST_ASSERT(foundIndex == 0);
676     foundIndex = utext_getPreviousNativeIndex(ut);
677     TEST_ASSERT(foundIndex == 0);
678 
679 
680     foundC = utext_previous32(ut);
681     TEST_ASSERT(foundC == U_SENTINEL);
682     foundIndex = utext_getNativeIndex(ut);
683     TEST_ASSERT(foundIndex == 0);
684     foundIndex = utext_getPreviousNativeIndex(ut);
685     TEST_ASSERT(foundIndex == 0);
686 
687 
688     // And again, with the macros
689     utext_setNativeIndex(ut, len);
690     for (i=cpCount-1; i>=0; i--) {
691         expectedC     = cpMap[i].cp;
692         expectedIndex = cpMap[i].nativeIdx;
693         foundC        = UTEXT_PREVIOUS32(ut);
694         foundIndex    = UTEXT_GETNATIVEINDEX(ut);
695         TEST_ASSERT(expectedIndex == foundIndex);
696         TEST_ASSERT(expectedC == foundC);
697         if (gFailed) {
698             return;
699         }
700     }
701 
702     //
703     //  Backwards iteration, above, should have left our iterator
704     //   position at zero, and continued backwards iterationshould fail.
705     //
706     foundIndex = UTEXT_GETNATIVEINDEX(ut);
707     TEST_ASSERT(foundIndex == 0);
708 
709     foundC = UTEXT_PREVIOUS32(ut);
710     TEST_ASSERT(foundC == U_SENTINEL);
711     foundIndex = UTEXT_GETNATIVEINDEX(ut);
712     TEST_ASSERT(foundIndex == 0);
713     if (gFailed) {
714         return;
715     }
716 
717     //
718     //  next32From(), previous32From(), Iterate in a somewhat random order.
719     //
720     int  cpIndex = 0;
721     for (i=0; i<cpCount; i++) {
722         cpIndex = (cpIndex + 9973) % cpCount;
723         index         = cpMap[cpIndex].nativeIdx;
724         expectedC     = cpMap[cpIndex].cp;
725         foundC        = utext_next32From(ut, index);
726         TEST_ASSERT(expectedC == foundC);
727         if (gFailed) {
728             return;
729         }
730     }
731 
732     cpIndex = 0;
733     for (i=0; i<cpCount; i++) {
734         cpIndex = (cpIndex + 9973) % cpCount;
735         index         = cpMap[cpIndex+1].nativeIdx;
736         expectedC     = cpMap[cpIndex].cp;
737         foundC        = utext_previous32From(ut, index);
738         TEST_ASSERT(expectedC == foundC);
739         if (gFailed) {
740             return;
741         }
742     }
743 
744 
745     //
746     // moveIndex(int32_t delta);
747     //
748 
749     // Walk through frontwards, incrementing by one
750     utext_setNativeIndex(ut, 0);
751     for (i=1; i<=cpCount; i++) {
752         utext_moveIndex32(ut, 1);
753         index = utext_getNativeIndex(ut);
754         expectedIndex = cpMap[i].nativeIdx;
755         TEST_ASSERT(expectedIndex == index);
756         index = UTEXT_GETNATIVEINDEX(ut);
757         TEST_ASSERT(expectedIndex == index);
758     }
759 
760     // Walk through frontwards, incrementing by two
761     utext_setNativeIndex(ut, 0);
762     for (i=2; i<cpCount; i+=2) {
763         utext_moveIndex32(ut, 2);
764         index = utext_getNativeIndex(ut);
765         expectedIndex = cpMap[i].nativeIdx;
766         TEST_ASSERT(expectedIndex == index);
767         index = UTEXT_GETNATIVEINDEX(ut);
768         TEST_ASSERT(expectedIndex == index);
769     }
770 
771     // walk through the string backwards, decrementing by one.
772     i = cpMap[cpCount].nativeIdx;
773     utext_setNativeIndex(ut, i);
774     for (i=cpCount; i>=0; i--) {
775         expectedIndex = cpMap[i].nativeIdx;
776         index = utext_getNativeIndex(ut);
777         TEST_ASSERT(expectedIndex == index);
778         index = UTEXT_GETNATIVEINDEX(ut);
779         TEST_ASSERT(expectedIndex == index);
780         utext_moveIndex32(ut, -1);
781     }
782 
783 
784     // walk through backwards, decrementing by three
785     i = cpMap[cpCount].nativeIdx;
786     utext_setNativeIndex(ut, i);
787     for (i=cpCount; i>=0; i-=3) {
788         expectedIndex = cpMap[i].nativeIdx;
789         index = utext_getNativeIndex(ut);
790         TEST_ASSERT(expectedIndex == index);
791         index = UTEXT_GETNATIVEINDEX(ut);
792         TEST_ASSERT(expectedIndex == index);
793         utext_moveIndex32(ut, -3);
794     }
795 
796 
797     //
798     // Extract
799     //
800     int bufSize = us.length() + 10;
801     char16_t *buf = new char16_t[bufSize];
802     status = U_ZERO_ERROR;
803     expectedLen = us.length();
804     len = utext_extract(ut, 0, utlen, buf, bufSize, &status);
805     TEST_SUCCESS(status);
806     TEST_ASSERT(len == expectedLen);
807     int compareResult = us.compare(buf, -1);
808     TEST_ASSERT(compareResult == 0);
809 
810     status = U_ZERO_ERROR;
811     len = utext_extract(ut, 0, utlen, nullptr, 0, &status);
812     if (utlen == 0) {
813         TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
814     } else {
815         TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
816     }
817     TEST_ASSERT(len == expectedLen);
818 
819     status = U_ZERO_ERROR;
820     u_memset(buf, 0x5555, bufSize);
821     len = utext_extract(ut, 0, utlen, buf, 1, &status);
822     if (us.length() == 0) {
823         TEST_SUCCESS(status);
824         TEST_ASSERT(buf[0] == 0);
825     } else {
826         // Buf len == 1, extracting a single 16 bit value.
827         // If the data char is supplementary, it doesn't matter whether the buffer remains unchanged,
828         //   or whether the lead surrogate of the pair is extracted.
829         //   It's a buffer overflow error in either case.
830         TEST_ASSERT(buf[0] == us.charAt(0) ||
831                     (buf[0] == 0x5555 && U_IS_SUPPLEMENTARY(us.char32At(0))));
832         TEST_ASSERT(buf[1] == 0x5555);
833         if (us.length() == 1) {
834             TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
835         } else {
836             TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
837         }
838     }
839 
840     delete []buf;
841 }
842 
843 //
844 //  ErrorTest()    Check various error and edge cases.
845 //
ErrorTest()846 void UTextTest::ErrorTest()
847 {
848     // Close of an uninitialized UText.  Shouldn't blow up.
849     {
850         UText  ut;
851         memset(&ut, 0, sizeof(UText));
852         utext_close(&ut);
853         utext_close(nullptr);
854     }
855 
856     // Double-close of a UText.  Shouldn't blow up.  UText should still be usable.
857     {
858         UErrorCode status = U_ZERO_ERROR;
859         UText ut = UTEXT_INITIALIZER;
860         UnicodeString s("Hello, World");
861         UText *ut2 = utext_openUnicodeString(&ut, &s, &status);
862         TEST_SUCCESS(status);
863         TEST_ASSERT(ut2 == &ut);
864 
865         UText *ut3 = utext_close(&ut);
866         TEST_ASSERT(ut3 == &ut);
867 
868         UText *ut4 = utext_close(&ut);
869         TEST_ASSERT(ut4 == &ut);
870 
871         utext_openUnicodeString(&ut, &s, &status);
872         TEST_SUCCESS(status);
873         utext_close(&ut);
874     }
875 
876     // Re-use of a UText, chaining through each of the types of UText
877     //   (If it doesn't blow up, and doesn't leak, it's probably working fine)
878     {
879         UErrorCode status = U_ZERO_ERROR;
880         UText ut = UTEXT_INITIALIZER;
881         UText  *utp;
882         UnicodeString s1("Hello, World");
883         char16_t s2[] = {(char16_t)0x41, (char16_t)0x42, (char16_t)0};
884         const char  *s3 = "\x66\x67\x68";
885 
886         utp = utext_openUnicodeString(&ut, &s1, &status);
887         TEST_SUCCESS(status);
888         TEST_ASSERT(utp == &ut);
889 
890         utp = utext_openConstUnicodeString(&ut, &s1, &status);
891         TEST_SUCCESS(status);
892         TEST_ASSERT(utp == &ut);
893 
894         utp = utext_openUTF8(&ut, s3, -1, &status);
895         TEST_SUCCESS(status);
896         TEST_ASSERT(utp == &ut);
897 
898         utp = utext_openUChars(&ut, s2, -1, &status);
899         TEST_SUCCESS(status);
900         TEST_ASSERT(utp == &ut);
901 
902         utp = utext_close(&ut);
903         TEST_ASSERT(utp == &ut);
904 
905         utp = utext_openUnicodeString(&ut, &s1, &status);
906         TEST_SUCCESS(status);
907         TEST_ASSERT(utp == &ut);
908     }
909 
910     // Invalid parameters on open
911     //
912     {
913         UErrorCode status = U_ZERO_ERROR;
914         UText ut = UTEXT_INITIALIZER;
915 
916         utext_openUChars(&ut, nullptr, 5, &status);
917         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
918 
919         status = U_ZERO_ERROR;
920         utext_openUChars(&ut, nullptr, -1, &status);
921         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
922 
923         status = U_ZERO_ERROR;
924         utext_openUTF8(&ut, nullptr, 4, &status);
925         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
926 
927         status = U_ZERO_ERROR;
928         utext_openUTF8(&ut, nullptr, -1, &status);
929         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
930     }
931 
932     //
933     //  UTF-8 with malformed sequences.
934     //    These should come through as the Unicode replacement char, \ufffd
935     //
936     {
937         UErrorCode status = U_ZERO_ERROR;
938         UText *ut = nullptr;
939         const char *badUTF8 = "\x41\x81\x42\xf0\x81\x81\x43";
940         UChar32  c;
941 
942         ut = utext_openUTF8(nullptr, badUTF8, -1, &status);
943         TEST_SUCCESS(status);
944         c = utext_char32At(ut, 1);
945         TEST_ASSERT(c == 0xfffd);
946         c = utext_char32At(ut, 3);
947         TEST_ASSERT(c == 0xfffd);
948         c = utext_char32At(ut, 5);
949         TEST_ASSERT(c == 0xfffd);
950         c = utext_char32At(ut, 6);
951         TEST_ASSERT(c == 0x43);
952 
953         char16_t buf[10];
954         int n = utext_extract(ut, 0, 9, buf, 10, &status);
955         TEST_SUCCESS(status);
956         TEST_ASSERT(n==7);
957         TEST_ASSERT(buf[0] == 0x41);
958         TEST_ASSERT(buf[1] == 0xfffd);
959         TEST_ASSERT(buf[2] == 0x42);
960         TEST_ASSERT(buf[3] == 0xfffd);
961         TEST_ASSERT(buf[4] == 0xfffd);
962         TEST_ASSERT(buf[5] == 0xfffd);
963         TEST_ASSERT(buf[6] == 0x43);
964         utext_close(ut);
965     }
966 
967 
968     //
969     //  isLengthExpensive - does it make the expected transitions after
970     //                      getting the length of a nul terminated string?
971     //
972     {
973         UErrorCode status = U_ZERO_ERROR;
974         UnicodeString sa("Hello, this is a string");
975         UBool  isExpensive;
976 
977         char16_t sb[100];
978         memset(sb, 0x20, sizeof(sb));
979         sb[99] = 0;
980 
981         UText *uta = utext_openUnicodeString(nullptr, &sa, &status);
982         TEST_SUCCESS(status);
983         isExpensive = utext_isLengthExpensive(uta);
984         TEST_ASSERT(isExpensive == false);
985         utext_close(uta);
986 
987         UText *utb = utext_openUChars(nullptr, sb, -1, &status);
988         TEST_SUCCESS(status);
989         isExpensive = utext_isLengthExpensive(utb);
990         TEST_ASSERT(isExpensive == true);
991         int64_t  len = utext_nativeLength(utb);
992         TEST_ASSERT(len == 99);
993         isExpensive = utext_isLengthExpensive(utb);
994         TEST_ASSERT(isExpensive == false);
995         utext_close(utb);
996     }
997 
998     //
999     // Index to positions not on code point boundaries.
1000     //
1001     {
1002         const char *u8str =         "\xc8\x81\xe1\x82\x83\xf1\x84\x85\x86";
1003         int32_t startMap[] =        {   0,  0,  2,  2,  2,  5,  5,  5,  5,  9,  9};
1004         int32_t nextMap[]  =        {   2,  2,  5,  5,  5,  9,  9,  9,  9,  9,  9};
1005         int32_t prevMap[]  =        {   0,  0,  0,  0,  0,  2,  2,  2,  2,  5,  5};
1006         UChar32  c32Map[] =    {0x201, 0x201, 0x1083, 0x1083, 0x1083, 0x044146, 0x044146, 0x044146, 0x044146, -1, -1};
1007         UChar32  pr32Map[] =   {    -1,   -1,  0x201,  0x201,  0x201,   0x1083,   0x1083,   0x1083,   0x1083, 0x044146, 0x044146};
1008 
1009         // extractLen is the size, in UChars, of what will be extracted between index and index+1.
1010         //  is zero when both index positions lie within the same code point.
1011         int32_t  exLen[] =          {   0,  1,   0,  0,  1,  0,  0,  0,  2,  0,  0};
1012 
1013 
1014         UErrorCode status = U_ZERO_ERROR;
1015         UText *ut = utext_openUTF8(nullptr, u8str, -1, &status);
1016         TEST_SUCCESS(status);
1017 
1018         // Check setIndex
1019         int32_t i;
1020         int32_t startMapLimit = UPRV_LENGTHOF(startMap);
1021         for (i=0; i<startMapLimit; i++) {
1022             utext_setNativeIndex(ut, i);
1023             int64_t cpIndex = utext_getNativeIndex(ut);
1024             TEST_ASSERT(cpIndex == startMap[i]);
1025             cpIndex = UTEXT_GETNATIVEINDEX(ut);
1026             TEST_ASSERT(cpIndex == startMap[i]);
1027         }
1028 
1029         // Check char32At
1030         for (i=0; i<startMapLimit; i++) {
1031             UChar32 c32 = utext_char32At(ut, i);
1032             TEST_ASSERT(c32 == c32Map[i]);
1033             int64_t cpIndex = utext_getNativeIndex(ut);
1034             TEST_ASSERT(cpIndex == startMap[i]);
1035         }
1036 
1037         // Check utext_next32From
1038         for (i=0; i<startMapLimit; i++) {
1039             UChar32 c32 = utext_next32From(ut, i);
1040             TEST_ASSERT(c32 == c32Map[i]);
1041             int64_t cpIndex = utext_getNativeIndex(ut);
1042             TEST_ASSERT(cpIndex == nextMap[i]);
1043         }
1044 
1045         // check utext_previous32From
1046         for (i=0; i<startMapLimit; i++) {
1047             gTestNum++;
1048             UChar32 c32 = utext_previous32From(ut, i);
1049             TEST_ASSERT(c32 == pr32Map[i]);
1050             int64_t cpIndex = utext_getNativeIndex(ut);
1051             TEST_ASSERT(cpIndex == prevMap[i]);
1052         }
1053 
1054         // check Extract
1055         //   Extract from i to i+1, which may be zero or one code points,
1056         //     depending on whether the indices straddle a cp boundary.
1057         for (i=0; i<startMapLimit; i++) {
1058             char16_t buf[3];
1059             status = U_ZERO_ERROR;
1060             int32_t  extractedLen = utext_extract(ut, i, i+1, buf, 3, &status);
1061             TEST_SUCCESS(status);
1062             TEST_ASSERT(extractedLen == exLen[i]);
1063             if (extractedLen > 0) {
1064                 UChar32  c32;
1065                 /* extractedLen-extractedLen == 0 is used to get around a compiler warning. */
1066                 U16_GET(buf, 0, extractedLen-extractedLen, extractedLen, c32);
1067                 TEST_ASSERT(c32 == c32Map[i]);
1068             }
1069         }
1070 
1071         utext_close(ut);
1072     }
1073 
1074 
1075     {    //  Similar test, with utf16 instead of utf8
1076          //  TODO:  merge the common parts of these tests.
1077 
1078         UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000", -1, US_INV);
1079         int32_t startMap[]  ={ 0,     1,   1,    3,     4,  4,     6,  6};
1080         int32_t nextMap[]  = { 1,     3,   3,    4,     6,  6,     6,  6};
1081         int32_t prevMap[]  = { 0,     0,   0,    1,     3,  3,     4,  4};
1082         UChar32  c32Map[] =  {0x1000, 0x11000, 0x11000, 0x2000,  0x22000, 0x22000, -1, -1};
1083         UChar32  pr32Map[] = {    -1, 0x1000,  0x1000,  0x11000, 0x2000,  0x2000,   0x22000,   0x22000};
1084         int32_t  exLen[] =   {   1,  0,   2,  1,  0,  2,  0,  0,};
1085 
1086         u16str = u16str.unescape();
1087         UErrorCode status = U_ZERO_ERROR;
1088         UText *ut = utext_openUnicodeString(nullptr, &u16str, &status);
1089         TEST_SUCCESS(status);
1090 
1091         int32_t startMapLimit = UPRV_LENGTHOF(startMap);
1092         int i;
1093         for (i=0; i<startMapLimit; i++) {
1094             utext_setNativeIndex(ut, i);
1095             int64_t cpIndex = utext_getNativeIndex(ut);
1096             TEST_ASSERT(cpIndex == startMap[i]);
1097         }
1098 
1099         // Check char32At
1100         for (i=0; i<startMapLimit; i++) {
1101             UChar32 c32 = utext_char32At(ut, i);
1102             TEST_ASSERT(c32 == c32Map[i]);
1103             int64_t cpIndex = utext_getNativeIndex(ut);
1104             TEST_ASSERT(cpIndex == startMap[i]);
1105         }
1106 
1107         // Check utext_next32From
1108         for (i=0; i<startMapLimit; i++) {
1109             UChar32 c32 = utext_next32From(ut, i);
1110             TEST_ASSERT(c32 == c32Map[i]);
1111             int64_t cpIndex = utext_getNativeIndex(ut);
1112             TEST_ASSERT(cpIndex == nextMap[i]);
1113         }
1114 
1115         // check utext_previous32From
1116         for (i=0; i<startMapLimit; i++) {
1117             UChar32 c32 = utext_previous32From(ut, i);
1118             TEST_ASSERT(c32 == pr32Map[i]);
1119             int64_t cpIndex = utext_getNativeIndex(ut);
1120             TEST_ASSERT(cpIndex == prevMap[i]);
1121         }
1122 
1123         // check Extract
1124         //   Extract from i to i+1, which may be zero or one code points,
1125         //     depending on whether the indices straddle a cp boundary.
1126         for (i=0; i<startMapLimit; i++) {
1127             char16_t buf[3];
1128             status = U_ZERO_ERROR;
1129             int32_t  extractedLen = utext_extract(ut, i, i+1, buf, 3, &status);
1130             TEST_SUCCESS(status);
1131             TEST_ASSERT(extractedLen == exLen[i]);
1132             if (extractedLen > 0) {
1133                 UChar32  c32;
1134                 /* extractedLen-extractedLen == 0 is used to get around a compiler warning. */
1135                 U16_GET(buf, 0, extractedLen-extractedLen, extractedLen, c32);
1136                 TEST_ASSERT(c32 == c32Map[i]);
1137             }
1138         }
1139 
1140         utext_close(ut);
1141     }
1142 
1143     {    //  Similar test, with UText over Replaceable
1144          //  TODO:  merge the common parts of these tests.
1145 
1146         UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000", -1, US_INV);
1147         int32_t startMap[]  ={ 0,     1,   1,    3,     4,  4,     6,  6};
1148         int32_t nextMap[]  = { 1,     3,   3,    4,     6,  6,     6,  6};
1149         int32_t prevMap[]  = { 0,     0,   0,    1,     3,  3,     4,  4};
1150         UChar32  c32Map[] =  {0x1000, 0x11000, 0x11000, 0x2000,  0x22000, 0x22000, -1, -1};
1151         UChar32  pr32Map[] = {    -1, 0x1000,  0x1000,  0x11000, 0x2000,  0x2000,   0x22000,   0x22000};
1152         int32_t  exLen[] =   {   1,  0,   2,  1,  0,  2,  0,  0,};
1153 
1154         u16str = u16str.unescape();
1155         UErrorCode status = U_ZERO_ERROR;
1156         UText *ut = utext_openReplaceable(nullptr, &u16str, &status);
1157         TEST_SUCCESS(status);
1158 
1159         int32_t startMapLimit = UPRV_LENGTHOF(startMap);
1160         int i;
1161         for (i=0; i<startMapLimit; i++) {
1162             utext_setNativeIndex(ut, i);
1163             int64_t cpIndex = utext_getNativeIndex(ut);
1164             TEST_ASSERT(cpIndex == startMap[i]);
1165         }
1166 
1167         // Check char32At
1168         for (i=0; i<startMapLimit; i++) {
1169             UChar32 c32 = utext_char32At(ut, i);
1170             TEST_ASSERT(c32 == c32Map[i]);
1171             int64_t cpIndex = utext_getNativeIndex(ut);
1172             TEST_ASSERT(cpIndex == startMap[i]);
1173         }
1174 
1175         // Check utext_next32From
1176         for (i=0; i<startMapLimit; i++) {
1177             UChar32 c32 = utext_next32From(ut, i);
1178             TEST_ASSERT(c32 == c32Map[i]);
1179             int64_t cpIndex = utext_getNativeIndex(ut);
1180             TEST_ASSERT(cpIndex == nextMap[i]);
1181         }
1182 
1183         // check utext_previous32From
1184         for (i=0; i<startMapLimit; i++) {
1185             UChar32 c32 = utext_previous32From(ut, i);
1186             TEST_ASSERT(c32 == pr32Map[i]);
1187             int64_t cpIndex = utext_getNativeIndex(ut);
1188             TEST_ASSERT(cpIndex == prevMap[i]);
1189         }
1190 
1191         // check Extract
1192         //   Extract from i to i+1, which may be zero or one code points,
1193         //     depending on whether the indices straddle a cp boundary.
1194         for (i=0; i<startMapLimit; i++) {
1195             char16_t buf[3];
1196             status = U_ZERO_ERROR;
1197             int32_t  extractedLen = utext_extract(ut, i, i+1, buf, 3, &status);
1198             TEST_SUCCESS(status);
1199             TEST_ASSERT(extractedLen == exLen[i]);
1200             if (extractedLen > 0) {
1201                 UChar32  c32;
1202                 /* extractedLen-extractedLen == 0 is used to get around a compiler warning. */
1203                 U16_GET(buf, 0, extractedLen-extractedLen, extractedLen, c32);
1204                 TEST_ASSERT(c32 == c32Map[i]);
1205             }
1206         }
1207 
1208         utext_close(ut);
1209     }
1210 }
1211 
1212 
FreezeTest()1213 void UTextTest::FreezeTest() {
1214     // Check isWritable() and freeze() behavior.
1215     //
1216 
1217     UnicodeString  ustr("Hello, World.");
1218     const char u8str[] = {char(0x31), (char)0x32, (char)0x33, 0};
1219     const char16_t u16str[] = {(char16_t)0x31, (char16_t)0x32, (char16_t)0x44, 0};
1220 
1221     UErrorCode status = U_ZERO_ERROR;
1222     UText  *ut        = nullptr;
1223     UText  *ut2       = nullptr;
1224 
1225     ut = utext_openUTF8(ut, u8str, -1, &status);
1226     TEST_SUCCESS(status);
1227     UBool writable = utext_isWritable(ut);
1228     TEST_ASSERT(writable == false);
1229     utext_copy(ut, 1, 2, 0, true, &status);
1230     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1231 
1232     status = U_ZERO_ERROR;
1233     ut = utext_openUChars(ut, u16str, -1, &status);
1234     TEST_SUCCESS(status);
1235     writable = utext_isWritable(ut);
1236     TEST_ASSERT(writable == false);
1237     utext_copy(ut, 1, 2, 0, true, &status);
1238     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1239 
1240     status = U_ZERO_ERROR;
1241     ut = utext_openUnicodeString(ut, &ustr, &status);
1242     TEST_SUCCESS(status);
1243     writable = utext_isWritable(ut);
1244     TEST_ASSERT(writable == true);
1245     utext_freeze(ut);
1246     writable = utext_isWritable(ut);
1247     TEST_ASSERT(writable == false);
1248     utext_copy(ut, 1, 2, 0, true, &status);
1249     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1250 
1251     status = U_ZERO_ERROR;
1252     ut = utext_openUnicodeString(ut, &ustr, &status);
1253     TEST_SUCCESS(status);
1254     ut2 = utext_clone(ut2, ut, false, false, &status);  // clone with readonly = false
1255     TEST_SUCCESS(status);
1256     writable = utext_isWritable(ut2);
1257     TEST_ASSERT(writable == true);
1258     ut2 = utext_clone(ut2, ut, false, true, &status);  // clone with readonly = true
1259     TEST_SUCCESS(status);
1260     writable = utext_isWritable(ut2);
1261     TEST_ASSERT(writable == false);
1262     utext_copy(ut2, 1, 2, 0, true, &status);
1263     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1264 
1265     status = U_ZERO_ERROR;
1266     ut = utext_openConstUnicodeString(ut, &ustr, &status);
1267     TEST_SUCCESS(status);
1268     writable = utext_isWritable(ut);
1269     TEST_ASSERT(writable == false);
1270     utext_copy(ut, 1, 2, 0, true, &status);
1271     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1272 
1273     // Deep Clone of a frozen UText should re-enable writing in the copy.
1274     status = U_ZERO_ERROR;
1275     ut = utext_openUnicodeString(ut, &ustr, &status);
1276     TEST_SUCCESS(status);
1277     utext_freeze(ut);
1278     ut2 = utext_clone(ut2, ut, true, false, &status);   // deep clone
1279     TEST_SUCCESS(status);
1280     writable = utext_isWritable(ut2);
1281     TEST_ASSERT(writable == true);
1282 
1283 
1284     // Deep clone of a frozen UText, where the base type is intrinsically non-writable,
1285     //  should NOT enable writing in the copy.
1286     status = U_ZERO_ERROR;
1287     ut = utext_openUChars(ut, u16str, -1, &status);
1288     TEST_SUCCESS(status);
1289     utext_freeze(ut);
1290     ut2 = utext_clone(ut2, ut, true, false, &status);   // deep clone
1291     TEST_SUCCESS(status);
1292     writable = utext_isWritable(ut2);
1293     TEST_ASSERT(writable == false);
1294 
1295     // cleanup
1296     utext_close(ut);
1297     utext_close(ut2);
1298 }
1299 
1300 
//
//  Fragmented UText
//      A UText type that works with a chunk size of 1.
//      Intended to test for edge cases.
//      Input comes from a UnicodeString.
//
//       ut.b    holds the single code unit of the current chunk,
//               stored in both 16-bit halves of the 32-bit field.
//
1309 
1310 U_CDECL_BEGIN
1311 static UBool U_CALLCONV
fragTextAccess(UText * ut,int64_t index,UBool forward)1312 fragTextAccess(UText *ut, int64_t index, UBool forward) {
1313     const UnicodeString *us = static_cast<const UnicodeString *>(ut->context);
1314     char16_t c;
1315     int32_t length = us->length();
1316     if (forward && index>=0 && index<length) {
1317         c = us->charAt((int32_t)index);
1318         ut->b = c | c<<16;
1319         ut->chunkOffset = 0;
1320         ut->chunkLength = 1;
1321         ut->chunkNativeStart = index;
1322         ut->chunkNativeLimit = index+1;
1323         return true;
1324     }
1325     if (!forward && index>0 && index <=length) {
1326         c = us->charAt((int32_t)index-1);
1327         ut->b = c | c<<16;
1328         ut->chunkOffset = 1;
1329         ut->chunkLength = 1;
1330         ut->chunkNativeStart = index-1;
1331         ut->chunkNativeLimit = index;
1332         return true;
1333     }
1334     ut->b = 0;
1335     ut->chunkOffset = 0;
1336     ut->chunkLength = 0;
1337     if (index <= 0) {
1338         ut->chunkNativeStart = 0;
1339         ut->chunkNativeLimit = 0;
1340     } else {
1341         ut->chunkNativeStart = length;
1342         ut->chunkNativeLimit = length;
1343     }
1344     return false;
1345 }
1346 
1347 // Function table to be used with this fragmented text provider.
1348 //   Initialized in the open function.
1349 static UTextFuncs  fragmentFuncs;
1350 
1351 // Clone function for fragmented text provider.
1352 //   Didn't really want to provide this, but it's easier to provide it than to keep it
1353 //   out of the tests.
1354 //
1355 UText *
cloneFragmentedUnicodeString(UText * dest,const UText * src,UBool deep,UErrorCode * status)1356 cloneFragmentedUnicodeString(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
1357     if (U_FAILURE(*status)) {
1358         return nullptr;
1359     }
1360     if (deep) {
1361         *status = U_UNSUPPORTED_ERROR;
1362         return nullptr;
1363     }
1364     dest = utext_openUnicodeString(dest, static_cast<UnicodeString *>(const_cast<void*>(src->context)), status);
1365     utext_setNativeIndex(dest, utext_getNativeIndex(src));
1366     return dest;
1367 }
1368 
1369 U_CDECL_END
1370 
1371 // Open function for the fragmented text provider.
1372 UText *
openFragmentedUnicodeString(UText * ut,UnicodeString * s,UErrorCode * status)1373 openFragmentedUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
1374     ut = utext_openUnicodeString(ut, s, status);
1375     if (U_FAILURE(*status)) {
1376         return ut;
1377     }
1378 
1379     // Copy of the function table from the stock UnicodeString UText,
1380     //   and replace the entry for the access function.
1381     memcpy(&fragmentFuncs, ut->pFuncs, sizeof(fragmentFuncs));
1382     fragmentFuncs.access = fragTextAccess;
1383     fragmentFuncs.clone  = cloneFragmentedUnicodeString;
1384     ut->pFuncs = &fragmentFuncs;
1385 
1386     ut->chunkContents = (char16_t *)&ut->b;
1387     ut->pFuncs->access(ut, 0, true);
1388     return ut;
1389 }
1390 
1391 // Regression test for Ticket 5560
1392 //   Clone fails to update chunkContentPointer in the cloned copy.
1393 //   This is only an issue for UText types that work in a local buffer,
1394 //      (UTF-8 wrapper, for example)
1395 //
1396 //   The test:
1397 //     1.  Create an initial UText
1398 //     2.  Deep clone it.  Contents should match original.
1399 //     3.  Reset original to something different.
1400 //     4.  Check that clone contents did not change.
1401 //
Ticket5560()1402 void UTextTest::Ticket5560() {
1403     /* The following two strings are in UTF-8 even on EBCDIC platforms. */
1404     static const char s1[] = {0x41,0x42,0x43,0x44,0x45,0x46,0}; /* "ABCDEF" */
1405     static const char s2[] = {0x31,0x32,0x33,0x34,0x35,0x36,0}; /* "123456" */
1406 	UErrorCode status = U_ZERO_ERROR;
1407 
1408 	UText ut1 = UTEXT_INITIALIZER;
1409 	UText ut2 = UTEXT_INITIALIZER;
1410 
1411 	utext_openUTF8(&ut1, s1, -1, &status);
1412 	char16_t c = utext_next32(&ut1);
1413 	TEST_ASSERT(c == 0x41);  // c == 'A'
1414 
1415 	utext_clone(&ut2, &ut1, true, false, &status);
1416 	TEST_SUCCESS(status);
1417     c = utext_next32(&ut2);
1418 	TEST_ASSERT(c == 0x42);  // c == 'B'
1419     c = utext_next32(&ut1);
1420 	TEST_ASSERT(c == 0x42);  // c == 'B'
1421 
1422 	utext_openUTF8(&ut1, s2, -1, &status);
1423 	c = utext_next32(&ut1);
1424 	TEST_ASSERT(c == 0x31);  // c == '1'
1425     c = utext_next32(&ut2);
1426 	TEST_ASSERT(c == 0x43);  // c == 'C'
1427 
1428     utext_close(&ut1);
1429     utext_close(&ut2);
1430 }
1431 
1432 
1433 // Test for Ticket 6847
1434 //
Ticket6847()1435 void UTextTest::Ticket6847() {
1436     const int STRLEN = 90;
1437     char16_t s[STRLEN+1];
1438     u_memset(s, 0x41, STRLEN);
1439     s[STRLEN] = 0;
1440 
1441     UErrorCode status = U_ZERO_ERROR;
1442     UText *ut = utext_openUChars(nullptr, s, -1, &status);
1443 
1444     utext_setNativeIndex(ut, 0);
1445     int32_t count = 0;
1446     UChar32 c = 0;
1447     int64_t nativeIndex = UTEXT_GETNATIVEINDEX(ut);
1448     TEST_ASSERT(nativeIndex == 0);
1449     while ((c = utext_next32(ut)) != U_SENTINEL) {
1450         TEST_ASSERT(c == 0x41);
1451         TEST_ASSERT(count < STRLEN);
1452         if (count >= STRLEN) {
1453             break;
1454         }
1455         count++;
1456         nativeIndex = UTEXT_GETNATIVEINDEX(ut);
1457         TEST_ASSERT(nativeIndex == count);
1458     }
1459     TEST_ASSERT(count == STRLEN);
1460     nativeIndex = UTEXT_GETNATIVEINDEX(ut);
1461     TEST_ASSERT(nativeIndex == STRLEN);
1462     utext_close(ut);
1463 }
1464 
1465 
Ticket10562()1466 void UTextTest::Ticket10562() {
1467     // Note: failures show as a heap error when the test is run under valgrind.
1468     UErrorCode status = U_ZERO_ERROR;
1469 
1470     const char *utf8_string = "\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41";
1471     UText *utf8Text = utext_openUTF8(nullptr, utf8_string, -1, &status);
1472     TEST_SUCCESS(status);
1473     UText *deepClone = utext_clone(nullptr, utf8Text, true, false, &status);
1474     TEST_SUCCESS(status);
1475     UText *shallowClone = utext_clone(nullptr, deepClone, false, false, &status);
1476     TEST_SUCCESS(status);
1477     utext_close(shallowClone);
1478     utext_close(deepClone);
1479     utext_close(utf8Text);
1480 
1481     status = U_ZERO_ERROR;
1482     UnicodeString usString("Hello, World.");
1483     UText *usText = utext_openUnicodeString(nullptr, &usString, &status);
1484     TEST_SUCCESS(status);
1485     UText *usDeepClone = utext_clone(nullptr, usText, true, false, &status);
1486     TEST_SUCCESS(status);
1487     UText *usShallowClone = utext_clone(nullptr, usDeepClone, false, false, &status);
1488     TEST_SUCCESS(status);
1489     utext_close(usShallowClone);
1490     utext_close(usDeepClone);
1491     utext_close(usText);
1492 }
1493 
1494 
Ticket10983()1495 void UTextTest::Ticket10983() {
1496     // Note: failure shows as a seg fault when the defect is present.
1497 
1498     UErrorCode status = U_ZERO_ERROR;
1499     UnicodeString s("Hello, World");
1500     UText *ut = utext_openConstUnicodeString(nullptr, &s, &status);
1501     TEST_SUCCESS(status);
1502 
1503     status = U_INVALID_STATE_ERROR;
1504     UText *cloned = utext_clone(nullptr, ut, true, true, &status);
1505     TEST_ASSERT(cloned == nullptr);
1506     TEST_ASSERT(status == U_INVALID_STATE_ERROR);
1507 
1508     utext_close(ut);
1509 }
1510 
1511 // Ticket 12130 - extract on a UText wrapping a null terminated char16_t * string
1512 //                leaves the iteration position set incorrectly when the
1513 //                actual string length is not yet known.
1514 //
1515 //                The test text needs to be long enough that UText defers getting the length.
1516 
Ticket12130()1517 void UTextTest::Ticket12130() {
1518     UErrorCode status = U_ZERO_ERROR;
1519 
1520     const char *text8 =
1521         "Fundamentally, computers just deal with numbers. They store letters and other characters "
1522         "by assigning a number for each one. Before Unicode was invented, there were hundreds "
1523         "of different encoding systems for assigning these numbers. No single encoding could "
1524         "contain enough characters: for example, the European Union alone requires several "
1525         "different encodings to cover all its languages. Even for a single language like "
1526         "English no single encoding was adequate for all the letters, punctuation, and technical "
1527         "symbols in common use.";
1528 
1529     UnicodeString str(text8);
1530     const char16_t *ustr = str.getTerminatedBuffer();
1531     UText ut = UTEXT_INITIALIZER;
1532     utext_openUChars(&ut, ustr, -1, &status);
1533     char16_t extractBuffer[50];
1534 
1535     for (int32_t startIdx = 0; startIdx<str.length(); ++startIdx) {
1536         int32_t endIdx = startIdx + 20;
1537 
1538         u_memset(extractBuffer, 0, UPRV_LENGTHOF(extractBuffer));
1539         utext_extract(&ut, startIdx, endIdx, extractBuffer, UPRV_LENGTHOF(extractBuffer), &status);
1540         if (U_FAILURE(status)) {
1541             errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1542             return;
1543         }
1544         int64_t ni  = utext_getNativeIndex(&ut);
1545         int64_t expectedni = startIdx + 20;
1546         if (expectedni > str.length()) {
1547             expectedni = str.length();
1548         }
1549         if (expectedni != ni) {
1550             errln("%s:%d utext_getNativeIndex() expected %d, got %d", __FILE__, __LINE__, expectedni, ni);
1551         }
1552         if (0 != str.tempSubString(startIdx, 20).compare(extractBuffer)) {
1553             errln("%s:%d utext_extract() failed. expected \"%s\", got \"%s\"",
1554                     __FILE__, __LINE__, CStr(str.tempSubString(startIdx, 20))(), CStr(UnicodeString(extractBuffer))());
1555         }
1556     }
1557     utext_close(&ut);
1558 
1559     // Similar utext extract, this time with the string length provided to the UText in advance,
1560     // and a buffer of larger than required capacity.
1561 
1562     utext_openUChars(&ut, ustr, str.length(), &status);
1563     for (int32_t startIdx = 0; startIdx<str.length(); ++startIdx) {
1564         int32_t endIdx = startIdx + 20;
1565         u_memset(extractBuffer, 0, UPRV_LENGTHOF(extractBuffer));
1566         utext_extract(&ut, startIdx, endIdx, extractBuffer, UPRV_LENGTHOF(extractBuffer), &status);
1567         if (U_FAILURE(status)) {
1568             errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1569             return;
1570         }
1571         int64_t ni  = utext_getNativeIndex(&ut);
1572         int64_t expectedni = startIdx + 20;
1573         if (expectedni > str.length()) {
1574             expectedni = str.length();
1575         }
1576         if (expectedni != ni) {
1577             errln("%s:%d utext_getNativeIndex() expected %d, got %d", __FILE__, __LINE__, expectedni, ni);
1578         }
1579         if (0 != str.tempSubString(startIdx, 20).compare(extractBuffer)) {
1580             errln("%s:%d utext_extract() failed. expected \"%s\", got \"%s\"",
1581                     __FILE__, __LINE__, CStr(str.tempSubString(startIdx, 20))(), CStr(UnicodeString(extractBuffer))());
1582         }
1583     }
1584     utext_close(&ut);
1585 }
1586 
1587 // Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate
1588 //              of a supplementary character.
1589 
Ticket13344()1590 void UTextTest::Ticket13344() {
1591     UErrorCode status = U_ZERO_ERROR;
1592     const char16_t *str = u"abc\U0010abcd xyz";
1593     LocalUTextPointer ut(utext_openUChars(nullptr, str, -1, &status));
1594 
1595     assertSuccess("UTextTest::Ticket13344-status", status);
1596     UTEXT_SETNATIVEINDEX(ut.getAlias(), 3);
1597     assertEquals("UTextTest::Ticket13344-lead", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1598     UTEXT_SETNATIVEINDEX(ut.getAlias(), 4);
1599     assertEquals("UTextTest::Ticket13344-trail", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1600     UTEXT_SETNATIVEINDEX(ut.getAlias(), 5);
1601     assertEquals("UTextTest::Ticket13344-bmp", (int64_t)5, utext_getNativeIndex(ut.getAlias()));
1602 
1603     utext_setNativeIndex(ut.getAlias(), 3);
1604     assertEquals("UTextTest::Ticket13344-lead-2", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1605     utext_setNativeIndex(ut.getAlias(), 4);
1606     assertEquals("UTextTest::Ticket13344-trail-2", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1607     utext_setNativeIndex(ut.getAlias(), 5);
1608     assertEquals("UTextTest::Ticket13344-bmp-2", (int64_t)5, utext_getNativeIndex(ut.getAlias()));
1609 }
1610 
1611 // ICU-21653 UText does not handle access callback that changes chunk size
1612 
// UTF-16 test text, 128 code units, with surrogates placed at 32-unit boundaries.
//   Every non-surrogate unit is 0xE000 + its own index.
//   Each row of 16 holds a surrogate pair (0xD83D 0xDE00) at row offsets 3-4 and 11-12.
static const char16_t testAccessText[] = {
    0xDC00,0xE001,0xE002,0xD83D,0xDE00,0xE005,0xE006,0xE007, 0xE008,0xE009,0xE00A,0xD83D,0xDE00,0xE00D,0xE00E,0xE00F, // 000-015, unpaired trail at 0
    0xE010,0xE011,0xE012,0xD83D,0xDE00,0xE015,0xE016,0xE017, 0xE018,0xE019,0xE01A,0xD83D,0xDE00,0xE01D,0xE01E,0xD800, // 016-031, lead at 31 pairs with the trail at 32
    0xDC01,0xE021,0xE022,0xD83D,0xDE00,0xE025,0xE026,0xE027, 0xE028,0xE029,0xE02A,0xD83D,0xDE00,0xE02D,0xE02E,0xE02F, // 032-047, trail at 32 pairs with the lead at 31
    0xE030,0xE031,0xE032,0xD83D,0xDE00,0xE035,0xE036,0xE037, 0xE038,0xE039,0xE03A,0xD83D,0xDE00,0xE03D,0xE03E,0xE03F, // 048-063
    0xDC02,0xE041,0xE042,0xD83D,0xDE00,0xE045,0xE046,0xE047, 0xE048,0xE049,0xE04A,0xD83D,0xDE00,0xE04D,0xE04E,0xE04F, // 064-079, unpaired trail at 64
    0xE050,0xE051,0xE052,0xD83D,0xDE00,0xE055,0xE056,0xE057, 0xE058,0xE059,0xE05A,0xD83D,0xDE00,0xE05D,0xE05E,0xD801, // 080-095, unpaired lead at 95
    0xE060,0xE061,0xE062,0xD83D,0xDE00,0xE065,0xE066,0xE067, 0xE068,0xE069,0xE06A,0xD83D,0xDE00,0xE06D,0xE06E,0xE06F, // 096-111
    0xE070,0xE071,0xE072,0xD83D,0xDE00,0xE075,0xE076,0xE077, 0xE078,0xE079,0xE07A,0xD83D,0xDE00,0xE07D,0xE07E,0xD802, // 112-127, unpaired lead at 127
};
1623 
1624 static const UChar32 testAccess32Text[] = { // same as above in UTF32
1625     0xDC00,0xe001,0xe002,0x1F600,0xe005,0xe006,0xe007, 0xe008,0xe009,0xe00a,0x1F600,0xe00d,0xe00e,0xe00f, // 000-013, unpaired trail at 0
1626     0xE010,0xe011,0xe012,0x1F600,0xe015,0xe016,0xe017, 0xe018,0xe019,0xe01a,0x1F600,0xe01d,0xe01e,0x10001, // 014-027, nonBMP at 27, will split in chunks
1627            0xe021,0xe022,0x1F600,0xe025,0xe026,0xe027, 0xe028,0xe029,0xe02a,0x1F600,0xe02d,0xe02e,0xe02f, // 028-040
1628     0xe030,0xe031,0xe032,0x1F600,0xe035,0xe036,0xe037, 0xe038,0xe039,0xe03a,0x1F600,0xe03d,0xe03e,0xe03f, // 041-054
1629     0xDC02,0xe041,0xe042,0x1F600,0xe045,0xe046,0xe047, 0xe048,0xe049,0xe04a,0x1F600,0xe04d,0xe04e,0xe04f, // 055-068, unpaired trail at 55
1630     0xe050,0xe051,0xe052,0x1F600,0xe055,0xe056,0xe057, 0xe058,0xe059,0xe05a,0x1F600,0xe05d,0xe05e,0xD801, // 069-082, unpaired lead at 82
1631     0xe060,0xe061,0xe062,0x1F600,0xe065,0xe066,0xe067, 0xe068,0xe069,0xe06a,0x1F600,0xe06d,0xe06e,0xe06f, // 083-096
1632     0xE070,0xe071,0xe072,0x1F600,0xe075,0xe076,0xe077, 0xe078,0xe079,0xe07a,0x1F600,0xe07d,0xe07e,0xD802, // 097-110, unpaired lead at 110
1633 };
1634 
// Tuning constants for the custom access callbacks used by the chunk-size tests.
// Chunk length (in UTF-16 code units) chosen when a request lands far from the current chunk.
static constexpr int32_t kTestAccessSmallChunkSize = 8;
// Chunk length (in UTF-16 code units) chosen on first access and for nearby requests.
static constexpr int32_t kTestAccessLargeChunkSize = 32;
// Largest distance from the current chunk that still counts as a nearby request.
static constexpr int32_t kTextAccessGapSize = 2;
1640 
1641 typedef struct {
1642     int64_t nativeOffset;
1643     UChar32 expectChar;
1644 } OffsetAndChar;
1645 
1646 static const OffsetAndChar testAccessEntries[] = { // sequence of offsets to test with expected UChar32
1647     // random access
1648     { 127,  0xD802 },
1649     { 16,   0xE010 },
1650     { 95,   0xD801 },
1651     { 31,   0x10001 },
1652     { 112,  0xE070 },
1653     { 0,    0xDC00 },
1654     { 64,   0xDC02 },
1655     { 32,   0x10001 },
1656     // sequential access
1657     { 0,    0xDC00 },
1658     { 16,   0xE010 },
1659     { 31,   0x10001 },
1660     { 32,   0x10001 },
1661     { 64,   0xDC02 },
1662     { 95,   0xD801 },
1663     { 112,  0xE070 },
1664     { 127,  0xD802 },
1665 };
1666 
1667 static const OffsetAndChar testAccess32Entries[] = { // sequence of offsets to test with expected UChar32
1668     // random access
1669     { 110,  0xD802 },   // 0 *
1670     { 14,   0xE010 },   // 1
1671     { 82,   0xD801 },   // 2 *
1672     { 27,   0x10001 },  // 3 *
1673     { 97,   0xE070 },   // 4
1674     { 0,    0xDC00 },   // 5
1675     { 55,   0xDC02 },   // 6
1676     // sequential access
1677     { 0,    0xDC00 },   // 7
1678     { 14,   0xE010 },   // 8
1679     { 27,   0x10001 },  // 9 *
1680     { 55,   0xDC02 },   // 10
1681     { 97,   0xE070 },   // 11
1682     { 82,   0xD801 },   // 12 *
1683     { 110,  0xD802 },   // 13 *
1684 };
1685 // modified UTextAccess function for char16_t string; a cross between
1686 // UText ucstrTextAccess and a function that modifies chunk size
1687 // 1. assumes native length is known and in ut->a
1688 // 2. assumes that most fields may be 0 or nullptr, will fill out if index not in range
1689 // 3. Will designate buffer of size kTestAccessSmallChunkSize or kTestAccessLargeChunkSize
1690 //    depending on kTextAccessGapSize
1691 static UBool
ustrTextAccessModChunks(UText * ut,int64_t index,UBool forward)1692 ustrTextAccessModChunks(UText *ut, int64_t index, UBool forward) {
1693     const char16_t *str = (const char16_t *)ut->context;
1694     int64_t length = ut->a;
1695 
1696     // pin the requested index to the bounds of the string
1697     if (index < 0) {
1698         index = 0;
1699     } else if (index > length) {
1700         index = length;
1701     }
1702     if (forward) {
1703         if (index < ut->chunkNativeLimit && index >= ut->chunkNativeStart) {
1704             /* Already inside the buffer. Set the new offset. */
1705             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1706             return true;
1707         }
1708         if (index >= length && ut->chunkNativeLimit == length) {
1709             /* Off the end of the buffer, but we can't get it. */
1710             ut->chunkOffset = ut->chunkLength;
1711             return false;
1712         }
1713     }
1714     else {
1715         if (index <= ut->chunkNativeLimit && index > ut->chunkNativeStart) {
1716             /* Already inside the buffer. Set the new offset. */
1717             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1718             return true;
1719         }
1720         if (index == 0 && ut->chunkNativeStart == 0) {
1721             /* Already at the beginning; can't go any farther */
1722             ut->chunkOffset = 0;
1723             return false;
1724         }
1725     }
1726     /* It's not inside the buffer. Start over from scratch. */
1727     // Assume large chunk size for first access
1728     int32_t chunkSize = kTestAccessLargeChunkSize;
1729     if (ut->chunkContents != nullptr && ut->chunkLength != 0) {
1730         // Subsequent access, set chunk size depending on gap (smaller chunk for large gap => random access)
1731         int64_t gap = forward ? (index-ut->chunkNativeLimit) : (ut->chunkNativeStart-index);
1732         if (gap < 0) {
1733             gap = -gap;
1734         }
1735         chunkSize = (gap > kTextAccessGapSize)? kTestAccessSmallChunkSize: kTestAccessLargeChunkSize;
1736     }
1737     ut->chunkLength = chunkSize;
1738     ut->chunkOffset = index % chunkSize;
1739     if (!forward && ut->chunkOffset == 0 && index >= chunkSize) {
1740         ut->chunkOffset = chunkSize;
1741     }
1742     ut->chunkNativeStart = index - ut->chunkOffset;
1743     ut->chunkNativeLimit = ut->chunkNativeStart + ut->chunkLength;
1744     ut->chunkContents = str + ut->chunkNativeStart;
1745     ut->nativeIndexingLimit = ut->chunkLength;
1746     return true;
1747 }
1748 
// For testing UTF-32 access, where the native index does not match the chunk offset/index.
1750 
1751 /**
1752  * @return the length, in the native units of the original text string.
1753  */
1754 // 1. assumes native length is known and in ut->a
1755 static int64_t
u32NativeLength(UText * ut)1756 u32NativeLength(UText *ut) {
1757     return ut->a;
1758 }
1759 
1760 /**
1761  * Map from the current char16_t offset within the current text chunk to
1762  *  the corresponding native index in the original source text.
1763  * @return Absolute (native) index corresponding to chunkOffset in the current chunk.
1764  *         The returned native index should always be to a code point boundary.
1765  */
1766 // 1. assumes native length is known and in ut->a
1767 // 2. assumes that pointer to offset map is in
1768 static int64_t
u32MapOffsetToNative(const UText * ut)1769 u32MapOffsetToNative(const UText *ut) {
1770     const int64_t* offsetMap = (const int64_t*)ut->p;
1771     int64_t u16Offset = offsetMap[ut->chunkNativeStart] + ut->chunkOffset;
1772     int64_t index = ut->a;
1773     while (u16Offset < offsetMap[index]) {
1774         index--;
1775     }
1776     return index;
1777 }
1778 
1779 /**
1780  * Map from a native index to a char16_t offset within a text chunk.
1781  * Behavior is undefined if the native index does not fall within the
1782  *   current chunk.
1783  * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit.
1784  * @return            Chunk-relative UTF-16 offset corresponding to the specified native
1785  *                    index.
1786  */
1787 static int32_t
u32MapNativeIndexToUTF16(const UText * ut,int64_t index)1788 u32MapNativeIndexToUTF16(const UText *ut, int64_t index) {
1789     const int64_t* offsetMap = (const int64_t*)ut->p;
1790     if (index <= ut->chunkNativeStart) {
1791         return 0;
1792     } else if (index >= ut->chunkNativeLimit) {
1793         return ut->chunkLength;
1794     }
1795     return (offsetMap[index] - offsetMap[ut->chunkNativeStart]);
1796 }
1797 
1798 static void
u32Close(UText * ut)1799 u32Close(UText *ut) {
1800     uprv_free((void*)ut->p);
1801 }
1802 
1803 static UBool
u32Access(UText * ut,int64_t index,UBool forward)1804 u32Access(UText *ut, int64_t index, UBool forward) {
1805     int64_t length = ut->a;
1806     const int64_t* offsetMap = (const int64_t*)ut->p;
1807     const char16_t *u16 = (const char16_t *)ut->q;
1808 
1809     // pin the requested index to the bounds of the string
1810     if (index < 0) {
1811         index = 0;
1812     } else if (index > length) {
1813         index = length;
1814     }
1815     if (forward) {
1816         if (index < ut->chunkNativeLimit && index >= ut->chunkNativeStart) {
1817             /* Already inside the buffer. Set the new offset. */
1818             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1819             return true;
1820         }
1821         if (index >= length && ut->chunkNativeLimit == length) {
1822             /* Off the end of the buffer, but we can't get it. */
1823             ut->chunkOffset = ut->chunkLength;
1824             return false;
1825         }
1826     }
1827     else {
1828         if (index <= ut->chunkNativeLimit && index > ut->chunkNativeStart) {
1829             /* Already inside the buffer. Set the new offset. */
1830             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1831             return true;
1832         }
1833         if (index == 0 && ut->chunkNativeStart == 0) {
1834             /* Already at the beginning; can't go any farther */
1835             ut->chunkOffset = 0;
1836             return false;
1837         }
1838     }
1839     /* It's not inside the buffer. Start over from scratch. */
1840     // Assume large chunk size for first access
1841     int32_t chunkSize = kTestAccessLargeChunkSize;
1842     if (ut->chunkContents != nullptr && ut->chunkLength != 0) {
1843         // Subsequent access, set chunk size depending on gap (smaller chunk for large gap => random access)
1844         int64_t gap = forward ? (index-ut->chunkNativeLimit) : (ut->chunkNativeStart-index);
1845         if (gap < 0) {
1846             gap = -gap;
1847         }
1848         chunkSize = (gap > kTextAccessGapSize)? kTestAccessSmallChunkSize: kTestAccessLargeChunkSize;
1849     }
1850     int64_t u16Offset = offsetMap[index]; // guaranteed to be on code point boundary
1851     int64_t u16ChunkTryStart = (u16Offset/chunkSize) * chunkSize;
1852     int64_t u16ChunkTryEnd = u16ChunkTryStart + chunkSize;
1853     if (!forward && u16ChunkTryStart==u16Offset && u16ChunkTryStart>0) {
1854         u16ChunkTryEnd = u16ChunkTryStart;
1855         u16ChunkTryStart -= chunkSize;
1856     }
1857     int64_t nativeIndexEnd = length;
1858     while (u16ChunkTryEnd < offsetMap[nativeIndexEnd]) {
1859         nativeIndexEnd--;
1860     }
1861     int64_t nativeIndexStart = nativeIndexEnd;
1862     while (u16ChunkTryStart < offsetMap[nativeIndexStart]) {
1863         nativeIndexStart--;
1864     }
1865     if (forward && nativeIndexEnd < length && u16Offset >= offsetMap[nativeIndexEnd]) {
1866         // oops we need to be in the following chunk
1867         nativeIndexStart = nativeIndexEnd;
1868         u16ChunkTryEnd = ((offsetMap[nativeIndexStart + 1] + chunkSize)/chunkSize) * chunkSize;
1869         nativeIndexEnd = length;
1870         while (u16ChunkTryEnd < offsetMap[nativeIndexEnd]) {
1871             nativeIndexEnd--;
1872         }
1873     }
1874     ut->chunkNativeStart = nativeIndexStart;
1875     ut->chunkNativeLimit = nativeIndexEnd;
1876     ut->chunkLength = offsetMap[nativeIndexEnd] - offsetMap[nativeIndexStart];
1877     ut->chunkOffset = u16Offset - offsetMap[nativeIndexStart];
1878     ut->chunkContents = u16 + offsetMap[nativeIndexStart];
1879     ut->nativeIndexingLimit = 0 ;
1880     return true;
1881 }
1882 
1883 static const struct UTextFuncs u32Funcs =
1884 {
1885     sizeof(UTextFuncs),
1886     0, 0, 0,              // Reserved alignment padding
1887     nullptr,              // Clone
1888     u32NativeLength,
1889     u32Access,
1890     nullptr,              // Extract
1891     nullptr,              // Replace
1892     nullptr,              // Copy
1893     u32MapOffsetToNative,
1894     u32MapNativeIndexToUTF16,
1895     u32Close,
1896     nullptr,              // spare 1
1897     nullptr,              // spare 2
1898     nullptr,              // spare 3
1899 };
1900 
1901 // A hack, this takes a pointer to both the UTF32 and UTF16 versions of the text
1902 static UText *
utext_openUChar32s(UText * ut,const UChar32 * s,int64_t length,const char16_t * q,UErrorCode * status)1903 utext_openUChar32s(UText *ut, const UChar32 *s, int64_t length, const char16_t *q, UErrorCode *status) {
1904     if (U_FAILURE(*status)) {
1905         return nullptr;
1906     }
1907     if (s==nullptr || length < 0) {
1908         *status = U_ILLEGAL_ARGUMENT_ERROR;
1909         return nullptr;
1910     }
1911     ut = utext_setup(ut, 0, status);
1912     if (U_SUCCESS(*status)) {
1913         int64_t* offsetMap = (int64_t*)uprv_malloc((length+1)*sizeof(int64_t));
1914         if (offsetMap == nullptr) {
1915             *status = U_MEMORY_ALLOCATION_ERROR;
1916             return nullptr;
1917         }
1918         ut->pFuncs               = &u32Funcs;
1919         ut->context              = s;
1920         ut->providerProperties   = 0;
1921         ut->a                    = length;
1922         ut->chunkContents        = nullptr;
1923         ut->chunkNativeStart     = 0;
1924         ut->chunkNativeLimit     = 0;
1925         ut->chunkLength          = 0;
1926         ut->chunkOffset          = 0;
1927         ut->nativeIndexingLimit  = 0;
1928         ut->p                    = offsetMap;
1929         ut->q                    = q;
1930         int64_t u16Offset = 0;
1931         *offsetMap++ = 0;
1932         while (length-- > 0) {
1933             u16Offset += (*s++ < 0x10000)? 1: 2;
1934             *offsetMap++ = u16Offset;
1935         }
1936     }
1937     return ut;
1938 }
1939 
1940 
1941 
AccessChangesChunkSize()1942 void UTextTest::AccessChangesChunkSize() {
1943     UErrorCode status = U_ZERO_ERROR;
1944     UText ut = UTEXT_INITIALIZER;
1945     utext_openUChars(&ut, testAccessText, UPRV_LENGTHOF(testAccessText), &status);
1946     if (U_FAILURE(status)) {
1947         errln("utext_openUChars failed: %s", u_errorName(status));
1948         return;
1949     }
1950     // now reset many ut fields for this test
1951     ut.providerProperties = 0; // especially need to clear UTEXT_PROVIDER_STABLE_CHUNKS
1952     ut.chunkNativeLimit = 0;
1953     ut.nativeIndexingLimit = 0;
1954     ut.chunkNativeStart = 0;
1955     ut.chunkOffset = 0;
1956     ut.chunkLength = 0;
1957     ut.chunkContents = nullptr;
1958     UTextFuncs textFuncs = *ut.pFuncs;
1959     textFuncs.access = ustrTextAccessModChunks; // custom access that changes chunk size
1960     ut.pFuncs = &textFuncs;
1961 
1962     // do test
1963 	const OffsetAndChar *testEntryPtr = testAccessEntries;
1964 	int32_t testCount = UPRV_LENGTHOF(testAccessEntries);
1965 	for (; testCount-- > 0; testEntryPtr++) {
1966 	    utext_setNativeIndex(&ut, testEntryPtr->nativeOffset);
1967 	    int64_t beforeOffset = utext_getNativeIndex(&ut);
1968 	    UChar32 uchar = utext_current32(&ut);
1969 	    int64_t afterOffset = utext_getNativeIndex(&ut);
1970 	    if (uchar != testEntryPtr->expectChar || afterOffset != beforeOffset) {
1971 	        errln("utext_current32 unexpected behavior for u16, test case %lld: expected char %04X at offset %lld, got %04X at %lld;\n"
1972 	            "chunkNativeStart %lld chunkNativeLimit %lld nativeIndexingLimit %d chunkLength %d chunkOffset %d",
1973 	            (int64_t)(testEntryPtr-testAccessEntries), testEntryPtr->expectChar, beforeOffset, uchar, afterOffset,
1974 	            ut.chunkNativeStart, ut.chunkNativeLimit, ut.nativeIndexingLimit, ut.chunkLength, ut.chunkOffset);
1975 	    }
1976 	}
1977 	utext_close(&ut);
1978 
1979 	ut = UTEXT_INITIALIZER;
1980 	utext_openUChar32s(&ut, testAccess32Text, UPRV_LENGTHOF(testAccess32Text), testAccessText, &status);
1981     if (U_FAILURE(status)) {
1982         errln("utext_openUChar32s failed: %s", u_errorName(status));
1983         return;
1984     }
1985     // do test
1986 	testEntryPtr = testAccess32Entries;
1987 	testCount = UPRV_LENGTHOF(testAccess32Entries);
1988 	for (; testCount-- > 0; testEntryPtr++) {
1989 	    utext_setNativeIndex(&ut, testEntryPtr->nativeOffset);
1990 	    int64_t beforeOffset = utext_getNativeIndex(&ut);
1991 	    UChar32 uchar = utext_current32(&ut);
1992 	    int64_t afterOffset = utext_getNativeIndex(&ut);
1993 	    if (uchar != testEntryPtr->expectChar || afterOffset != beforeOffset) {
1994 	        errln("utext_current32 unexpected behavior for u32, test case %lld: expected char %04X at offset %lld, got %04X at %lld;\n"
1995 	            "chunkNativeStart %lld chunkNativeLimit %lld nativeIndexingLimit %d chunkLength %d chunkOffset %d",
1996 	            (int64_t)(testEntryPtr-testAccess32Entries), testEntryPtr->expectChar, beforeOffset, uchar, afterOffset,
1997 	            ut.chunkNativeStart, ut.chunkNativeLimit, ut.nativeIndexingLimit, ut.chunkLength, ut.chunkOffset);
1998 	    }
1999 	}
2000 	utext_close(&ut);
2001 }
2002 
2003