• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2005-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Tests for the UText and UTextIterator text abstraction classes
10 *
11 ************************************************************************/
12 
13 #include <string.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include "unicode/utypes.h"
17 #include "unicode/utext.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "unicode/ustring.h"
21 #include "unicode/uchriter.h"
22 #include "cmemory.h"
23 #include "cstr.h"
24 #include "utxttest.h"
25 
26 static UBool  gFailed = false;
27 static int    gTestNum = 0;
28 
29 // Forward decl
30 UText *openFragmentedUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status);
31 
// TEST_ASSERT(x)
//    Checks that the condition x holds.  When it does not, a message naming the
//    current test number, file and line is sent to errln() and gFailed is set.
//    The expansion calls errln() unqualified, so it must be used where errln is in scope.
#define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(x)) { \
        errln("Test #%d failure in file %s at line %d\n", gTestNum, __FILE__, __LINE__); \
        gFailed = true; \
    } \
} UPRV_BLOCK_MACRO_END
38 
39 
// TEST_SUCCESS(status)
//    Checks a UErrorCode.  When U_FAILURE(status) is true, a message with the current
//    test number, file, line and the error's name is sent to errln() and gFailed is set.
//    The expansion calls errln() unqualified, so it must be used where errln is in scope.
#define TEST_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("Test #%d failure in file %s at line %d. Error = \"%s\"\n", \
              gTestNum, \
              __FILE__, \
              __LINE__, \
              u_errorName(status)); \
        gFailed = true; \
    } \
} UPRV_BLOCK_MACRO_END
47 
UTextTest()48 UTextTest::UTextTest() {
49 }
50 
~UTextTest()51 UTextTest::~UTextTest() {
52 }
53 
54 
55 void
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)56 UTextTest::runIndexedTest(int32_t index, UBool exec,
57                           const char* &name, char* /*par*/) {
58     TESTCASE_AUTO_BEGIN;
59     TESTCASE_AUTO(TextTest);
60     TESTCASE_AUTO(ErrorTest);
61     TESTCASE_AUTO(FreezeTest);
62     TESTCASE_AUTO(Ticket5560);
63     TESTCASE_AUTO(Ticket6847);
64     TESTCASE_AUTO(Ticket10562);
65     TESTCASE_AUTO(Ticket10983);
66     TESTCASE_AUTO(Ticket12130);
67     TESTCASE_AUTO(Ticket13344);
68     TESTCASE_AUTO(AccessChangesChunkSize);
69     TESTCASE_AUTO_END;
70 }
71 
//
// Quick and dirty pseudo-random number generator.
//   (The library generator is deliberately not used, so that results are portable.)
//
static uint32_t m_seed = 1;

// Advances the generator state and returns the next value, in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t nextState = m_seed * 1103515245 + 12345;
    m_seed = nextState;
    // Bits 16..30 of the new state: same as (state / 65536) % 32768.
    return (nextState >> 16) & 0x7fffu;
}
81 
82 
83 //
84 //   TextTest()
85 //
86 //       Top Level function for UText testing.
87 //       Specifies the strings to be tested, with the actual testing itself
88 //       being carried out in another function, TestString().
89 //
TextTest()90 void  UTextTest::TextTest() {
91     int32_t i, j;
92 
93     TestString("abcd\\U00010001xyz");
94     TestString("");
95 
96     // Supplementary chars at start or end
97     TestString("\\U00010001");
98     TestString("abc\\U00010001");
99     TestString("\\U00010001abc");
100 
101     // Test simple strings of lengths 1 to 60, looking for glitches at buffer boundaries
102     UnicodeString s;
103     for (i=1; i<60; i++) {
104         s.truncate(0);
105         for (j=0; j<i; j++) {
106             if (j+0x30 == 0x5c) {
107                 // backslash.  Needs to be escaped
108                 s.append((char16_t)0x5c);
109             }
110             s.append(char16_t(j+0x30));
111         }
112         TestString(s);
113     }
114 
115    // Test strings with odd-aligned supplementary chars,
116    //    looking for glitches at buffer boundaries
117     for (i=1; i<60; i++) {
118         s.truncate(0);
119         s.append((char16_t)0x41);
120         for (j=0; j<i; j++) {
121             s.append(UChar32(j+0x11000));
122         }
123         TestString(s);
124     }
125 
126     // String of chars of randomly varying size in utf-8 representation.
127     //   Exercise the mapping, and the varying sized buffer.
128     //
129     s.truncate(0);
130     UChar32  c1 = 0;
131     UChar32  c2 = 0x100;
132     UChar32  c3 = 0xa000;
133     UChar32  c4 = 0x11000;
134     for (i=0; i<1000; i++) {
135         int len8 = m_rand()%4 + 1;
136         switch (len8) {
137             case 1:
138                 c1 = (c1+1)%0x80;
139                 // don't put 0 into string (0 terminated strings for some tests)
140                 // don't put '\', will cause unescape() to fail.
141                 if (c1==0x5c || c1==0) {
142                     c1++;
143                 }
144                 s.append(c1);
145                 break;
146             case 2:
147                 s.append(c2++);
148                 break;
149             case 3:
150                 s.append(c3++);
151                 break;
152             case 4:
153                 s.append(c4++);
154                 break;
155         }
156     }
157     TestString(s);
158 }
159 
160 
161 //
162 //  TestString()     Run a suite of UText tests on a string.
163 //                   The test string is unescaped before use.
164 //
TestString(const UnicodeString & s)165 void UTextTest::TestString(const UnicodeString &s) {
166     int32_t       i;
167     int32_t       j;
168     UChar32       c;
169     int32_t       cpCount = 0;
170     UErrorCode    status  = U_ZERO_ERROR;
171     UText        *ut      = nullptr;
172     int32_t       saLen;
173 
174     UnicodeString sa = s.unescape();
175     saLen = sa.length();
176 
177     //
178     // Build up a mapping between code points and UTF-16 code unit indexes.
179     //
180     m *cpMap = new m[sa.length() + 1];
181     j = 0;
182     for (i=0; i<sa.length(); i=sa.moveIndex32(i, 1)) {
183         c = sa.char32At(i);
184         cpMap[j].nativeIdx = i;
185         cpMap[j].cp = c;
186         j++;
187         cpCount++;
188     }
189     cpMap[j].nativeIdx = i;   // position following the last char in utf-16 string.
190 
191 
192     // char16_t * test, null terminated
193     status = U_ZERO_ERROR;
194     char16_t *buf = new char16_t[saLen+1];
195     sa.extract(buf, saLen+1, status);
196     TEST_SUCCESS(status);
197     ut = utext_openUChars(nullptr, buf, -1, &status);
198     TEST_SUCCESS(status);
199     TestAccess(sa, ut, cpCount, cpMap);
200     utext_close(ut);
201     delete [] buf;
202 
203     // char16_t * test, with length
204     status = U_ZERO_ERROR;
205     buf = new char16_t[saLen+1];
206     sa.extract(buf, saLen+1, status);
207     TEST_SUCCESS(status);
208     ut = utext_openUChars(nullptr, buf, saLen, &status);
209     TEST_SUCCESS(status);
210     TestAccess(sa, ut, cpCount, cpMap);
211     utext_close(ut);
212     delete [] buf;
213 
214 
215     // UnicodeString test
216     status = U_ZERO_ERROR;
217     ut = utext_openUnicodeString(nullptr, &sa, &status);
218     TEST_SUCCESS(status);
219     TestAccess(sa, ut, cpCount, cpMap);
220     TestCMR(sa, ut, cpCount, cpMap, cpMap);
221     utext_close(ut);
222 
223 
224     // Const UnicodeString test
225     status = U_ZERO_ERROR;
226     ut = utext_openConstUnicodeString(nullptr, &sa, &status);
227     TEST_SUCCESS(status);
228     TestAccess(sa, ut, cpCount, cpMap);
229     utext_close(ut);
230 
231 
232     // Replaceable test.  (UnicodeString inherits Replaceable)
233     status = U_ZERO_ERROR;
234     ut = utext_openReplaceable(nullptr, &sa, &status);
235     TEST_SUCCESS(status);
236     TestAccess(sa, ut, cpCount, cpMap);
237     TestCMR(sa, ut, cpCount, cpMap, cpMap);
238     utext_close(ut);
239 
240     // Character Iterator Tests
241     status = U_ZERO_ERROR;
242     const char16_t *cbuf = sa.getBuffer();
243     CharacterIterator *ci = new UCharCharacterIterator(cbuf, saLen, status);
244     TEST_SUCCESS(status);
245     ut = utext_openCharacterIterator(nullptr, ci, &status);
246     TEST_SUCCESS(status);
247     TestAccess(sa, ut, cpCount, cpMap);
248     utext_close(ut);
249     delete ci;
250 
251 
252     // Fragmented UnicodeString  (Chunk size of one)
253     //
254     status = U_ZERO_ERROR;
255     ut = openFragmentedUnicodeString(nullptr, &sa, &status);
256     TEST_SUCCESS(status);
257     TestAccess(sa, ut, cpCount, cpMap);
258     utext_close(ut);
259 
260     //
261     // UTF-8 test
262     //
263 
264     // Convert the test string from UnicodeString to (char *) in utf-8 format
265     int32_t u8Len = sa.extract(0, sa.length(), nullptr, 0, "utf-8");
266     char *u8String = new char[u8Len + 1];
267     sa.extract(0, sa.length(), u8String, u8Len+1, "utf-8");
268 
269     // Build up the map of code point indices in the utf-8 string
270     m * u8Map = new m[sa.length() + 1];
271     i = 0;   // native utf-8 index
272     for (j=0; j<cpCount ; j++) {  // code point number
273         u8Map[j].nativeIdx = i;
274         U8_NEXT(u8String, i, u8Len, c);
275         u8Map[j].cp = c;
276     }
277     u8Map[cpCount].nativeIdx = u8Len;   // position following the last char in utf-8 string.
278 
279     // Do the test itself
280     status = U_ZERO_ERROR;
281     ut = utext_openUTF8(nullptr, u8String, -1, &status);
282     TEST_SUCCESS(status);
283     TestAccess(sa, ut, cpCount, u8Map);
284     utext_close(ut);
285 
286 
287 
288     delete []cpMap;
289     delete []u8Map;
290     delete []u8String;
291 }
292 
293 //  TestCMR   test Copy, Move and Replace operations.
294 //              us         UnicodeString containing the test text.
295 //              ut         UText containing the same test text.
296 //              cpCount    number of code points in the test text.
297 //              nativeMap  Mapping from code points to native indexes for the UText.
298 //              u16Map     Mapping from code points to UTF-16 indexes, for use with the UnicodeString.
299 //
300 //     This function runs a whole series of operations on each incoming UText.
301 //     The UText is deep-cloned prior to each operation, so that the original UText remains unchanged.
302 //
TestCMR(const UnicodeString & us,UText * ut,int cpCount,m * nativeMap,m * u16Map)303 void UTextTest::TestCMR(const UnicodeString &us, UText *ut, int cpCount, m *nativeMap, m *u16Map) {
304     TEST_ASSERT(utext_isWritable(ut) == true);
305 
306     int  srcLengthType;       // Loop variables for selecting the position and length
307     int  srcPosType;          //   of the block to operate on within the source text.
308     int  destPosType;
309 
310     int  srcIndex  = 0;       // Code Point indexes of the block to operate on for
311     int  srcLength = 0;       //   a specific test.
312 
313     int  destIndex = 0;       // Code point index of the destination for a copy/move test.
314 
315     int32_t  nativeStart = 0; // Native unit indexes for a test.
316     int32_t  nativeLimit = 0;
317     int32_t  nativeDest  = 0;
318 
319     int32_t  u16Start    = 0; // UTF-16 indexes for a test.
320     int32_t  u16Limit    = 0; //   used when performing the same operation in a Unicode String
321     int32_t  u16Dest     = 0;
322 
323     // Iterate over a whole series of source index, length and a target indexes.
324     // This is done with code point indexes; these will be later translated to native
325     //   indexes using the cpMap.
326     for (srcLengthType=1; srcLengthType<=3; srcLengthType++) {
327         switch (srcLengthType) {
328             case 1: srcLength = 1; break;
329             case 2: srcLength = 5; break;
330             case 3: srcLength = cpCount / 3;
331         }
332         for (srcPosType=1; srcPosType<=5; srcPosType++) {
333             switch (srcPosType) {
334                 case 1: srcIndex = 0; break;
335                 case 2: srcIndex = 1; break;
336                 case 3: srcIndex = cpCount - srcLength; break;
337                 case 4: srcIndex = cpCount - srcLength - 1; break;
338                 case 5: srcIndex = cpCount / 2; break;
339             }
340             if (srcIndex < 0 || srcIndex + srcLength > cpCount) {
341                 // filter out bogus test cases -
342                 //   those with a source range that falls of an edge of the string.
343                 continue;
344             }
345 
346             //
347             // Copy and move tests.
348             //   iterate over a variety of destination positions.
349             //
350             for (destPosType=1; destPosType<=4; destPosType++) {
351                 switch (destPosType) {
352                     case 1: destIndex = 0; break;
353                     case 2: destIndex = 1; break;
354                     case 3: destIndex = srcIndex - 1; break;
355                     case 4: destIndex = srcIndex + srcLength + 1; break;
356                     case 5: destIndex = cpCount-1; break;
357                     case 6: destIndex = cpCount; break;
358                 }
359                 if (destIndex<0 || destIndex>cpCount) {
360                     // filter out bogus test cases.
361                     continue;
362                 }
363 
364                 nativeStart = nativeMap[srcIndex].nativeIdx;
365                 nativeLimit = nativeMap[srcIndex+srcLength].nativeIdx;
366                 nativeDest  = nativeMap[destIndex].nativeIdx;
367 
368                 u16Start    = u16Map[srcIndex].nativeIdx;
369                 u16Limit    = u16Map[srcIndex+srcLength].nativeIdx;
370                 u16Dest     = u16Map[destIndex].nativeIdx;
371 
372                 gFailed = false;
373                 TestCopyMove(us, ut, false,
374                     nativeStart, nativeLimit, nativeDest,
375                     u16Start, u16Limit, u16Dest);
376 
377                 TestCopyMove(us, ut, true,
378                     nativeStart, nativeLimit, nativeDest,
379                     u16Start, u16Limit, u16Dest);
380 
381                 if (gFailed) {
382                     return;
383                 }
384             }
385 
386             //
387             //  Replace tests.
388             //
389             UnicodeString fullRepString("This is an arbitrary string that will be used as replacement text");
390             for (int32_t replStrLen=0; replStrLen<20; replStrLen++) {
391                 UnicodeString repStr(fullRepString, 0, replStrLen);
392                 TestReplace(us, ut,
393                     nativeStart, nativeLimit,
394                     u16Start, u16Limit,
395                     repStr);
396                 if (gFailed) {
397                     return;
398                 }
399             }
400 
401         }
402     }
403 
404 }
405 
406 //
407 //   TestCopyMove    run a single test case for utext_copy.
408 //                   Test cases are created in TestCMR and dispatched here for execution.
409 //
TestCopyMove(const UnicodeString & us,UText * ut,UBool move,int32_t nativeStart,int32_t nativeLimit,int32_t nativeDest,int32_t u16Start,int32_t u16Limit,int32_t u16Dest)410 void UTextTest::TestCopyMove(const UnicodeString &us, UText *ut, UBool move,
411                     int32_t nativeStart, int32_t nativeLimit, int32_t nativeDest,
412                     int32_t u16Start, int32_t u16Limit, int32_t u16Dest)
413 {
414     UErrorCode      status   = U_ZERO_ERROR;
415     UText          *targetUT = nullptr;
416     gTestNum++;
417     gFailed = false;
418 
419     //
420     //  clone the UText.  The test will be run in the cloned copy
421     //  so that we don't alter the original.
422     //
423     targetUT = utext_clone(nullptr, ut, true, false, &status);
424     TEST_SUCCESS(status);
425     UnicodeString targetUS(us);    // And copy the reference string.
426 
427     // do the test operation first in the reference
428     targetUS.copy(u16Start, u16Limit, u16Dest);
429     if (move) {
430         // delete out the source range.
431         if (u16Limit < u16Dest) {
432             targetUS.removeBetween(u16Start, u16Limit);
433         } else {
434             int32_t amtCopied = u16Limit - u16Start;
435             targetUS.removeBetween(u16Start+amtCopied, u16Limit+amtCopied);
436         }
437     }
438 
439     // Do the same operation in the UText under test
440     utext_copy(targetUT, nativeStart, nativeLimit, nativeDest, move, &status);
441     if (nativeDest > nativeStart && nativeDest < nativeLimit) {
442         TEST_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
443     } else {
444         TEST_SUCCESS(status);
445 
446         // Compare the results of the two parallel tests
447         int32_t  usi = 0;    // UnicodeString position, utf-16 index.
448         int64_t  uti = 0;    // UText position, native index.
449         int32_t  cpi;        // char32 position (code point index)
450         UChar32  usc;        // code point from Unicode String
451         UChar32  utc;        // code point from UText
452         utext_setNativeIndex(targetUT, 0);
453         for (cpi=0; ; cpi++) {
454             usc = targetUS.char32At(usi);
455             utc = utext_next32(targetUT);
456             if (utc < 0) {
457                 break;
458             }
459             TEST_ASSERT(uti == usi);
460             TEST_ASSERT(utc == usc);
461             usi = targetUS.moveIndex32(usi, 1);
462             uti = utext_getNativeIndex(targetUT);
463             if (gFailed) {
464                 goto cleanupAndReturn;
465             }
466         }
467         int64_t expectedNativeLength = utext_nativeLength(ut);
468         if (move == false) {
469             expectedNativeLength += nativeLimit - nativeStart;
470         }
471         uti = utext_getNativeIndex(targetUT);
472         TEST_ASSERT(uti == expectedNativeLength);
473     }
474 
475 cleanupAndReturn:
476     utext_close(targetUT);
477 }
478 
479 
480 //
481 //  TestReplace   Test a single Replace operation.
482 //
TestReplace(const UnicodeString & us,UText * ut,int32_t nativeStart,int32_t nativeLimit,int32_t u16Start,int32_t u16Limit,const UnicodeString & repStr)483 void UTextTest::TestReplace(
484             const UnicodeString &us,     // reference UnicodeString in which to do the replace
485             UText         *ut,                // UnicodeText object under test.
486             int32_t       nativeStart,        // Range to be replaced, in UText native units.
487             int32_t       nativeLimit,
488             int32_t       u16Start,           // Range to be replaced, in UTF-16 units
489             int32_t       u16Limit,           //    for use in the reference UnicodeString.
490             const UnicodeString &repStr)      // The replacement string
491 {
492     UErrorCode      status   = U_ZERO_ERROR;
493     UText          *targetUT = nullptr;
494     gTestNum++;
495     gFailed = false;
496 
497     //
498     //  clone the target UText.  The test will be run in the cloned copy
499     //  so that we don't alter the original.
500     //
501     targetUT = utext_clone(nullptr, ut, true, false, &status);
502     TEST_SUCCESS(status);
503     UnicodeString targetUS(us);    // And copy the reference string.
504 
505     //
506     // Do the replace operation in the Unicode String, to
507     //   produce a reference result.
508     //
509     targetUS.replace(u16Start, u16Limit-u16Start, repStr);
510 
511     //
512     // Do the replace on the UText under test
513     //
514     const char16_t *rs = repStr.getBuffer();
515     int32_t  rsLen = repStr.length();
516     int32_t actualDelta = utext_replace(targetUT, nativeStart, nativeLimit, rs, rsLen, &status);
517     int32_t expectedDelta = repStr.length() - (nativeLimit - nativeStart);
518     TEST_ASSERT(actualDelta == expectedDelta);
519 
520     //
521     // Compare the results
522     //
523     int32_t  usi = 0;    // UnicodeString position, utf-16 index.
524     int64_t  uti = 0;    // UText position, native index.
525     int32_t  cpi;        // char32 position (code point index)
526     UChar32  usc;        // code point from Unicode String
527     UChar32  utc;        // code point from UText
528     int64_t  expectedNativeLength = 0;
529     utext_setNativeIndex(targetUT, 0);
530     for (cpi=0; ; cpi++) {
531         usc = targetUS.char32At(usi);
532         utc = utext_next32(targetUT);
533         if (utc < 0) {
534             break;
535         }
536         TEST_ASSERT(uti == usi);
537         TEST_ASSERT(utc == usc);
538         usi = targetUS.moveIndex32(usi, 1);
539         uti = utext_getNativeIndex(targetUT);
540         if (gFailed) {
541             goto cleanupAndReturn;
542         }
543     }
544     expectedNativeLength = utext_nativeLength(ut) + expectedDelta;
545     uti = utext_getNativeIndex(targetUT);
546     TEST_ASSERT(uti == expectedNativeLength);
547 
548 cleanupAndReturn:
549     utext_close(targetUT);
550 }
551 
552 //
553 //  TestAccess      Test the read only access functions on a UText, including cloning.
554 //                  The text is accessed in a variety of ways, and compared with
555 //                  the reference UnicodeString.
556 //
TestAccess(const UnicodeString & us,UText * ut,int cpCount,m * cpMap)557 void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
558     // Run the standard tests on the caller-supplied UText.
559     TestAccessNoClone(us, ut, cpCount, cpMap);
560 
561     // Re-run tests on a shallow clone.
562     utext_setNativeIndex(ut, 0);
563     UErrorCode status = U_ZERO_ERROR;
564     UText *shallowClone = utext_clone(nullptr, ut, false /*deep*/, false /*readOnly*/, &status);
565     TEST_SUCCESS(status);
566     TestAccessNoClone(us, shallowClone, cpCount, cpMap);
567 
568     //
569     // Rerun again on a deep clone.
570     // Note that text providers are not required to provide deep cloning,
571     //   so unsupported errors are ignored.
572     //
573     status = U_ZERO_ERROR;
574     utext_setNativeIndex(shallowClone, 0);
575     UText *deepClone = utext_clone(nullptr, shallowClone, true, false, &status);
576     utext_close(shallowClone);
577     if (status != U_UNSUPPORTED_ERROR) {
578         TEST_SUCCESS(status);
579         TestAccessNoClone(us, deepClone, cpCount, cpMap);
580     }
581     utext_close(deepClone);
582 }
583 
584 
585 //
586 //  TestAccessNoClone()    Test the read only access functions on a UText.
587 //                         The text is accessed in a variety of ways, and compared with
588 //                         the reference UnicodeString.
589 //
TestAccessNoClone(const UnicodeString & us,UText * ut,int cpCount,m * cpMap)590 void UTextTest::TestAccessNoClone(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
591     UErrorCode  status = U_ZERO_ERROR;
592     gTestNum++;
593 
594     //
595     //  Check the length from the UText
596     //
597     int64_t expectedLen = cpMap[cpCount].nativeIdx;
598     int64_t utlen = utext_nativeLength(ut);
599     TEST_ASSERT(expectedLen == utlen);
600 
601     //
602     //  Iterate forwards, verify that we get the correct code points
603     //   at the correct native offsets.
604     //
605     int         i = 0;
606     int64_t     index;
607     int64_t     expectedIndex = 0;
608     int64_t     foundIndex = 0;
609     UChar32     expectedC;
610     UChar32     foundC;
611     int64_t     len;
612 
613     for (i=0; i<cpCount; i++) {
614         expectedIndex = cpMap[i].nativeIdx;
615         foundIndex    = utext_getNativeIndex(ut);
616         TEST_ASSERT(expectedIndex == foundIndex);
617         expectedC     = cpMap[i].cp;
618         foundC        = utext_next32(ut);
619         TEST_ASSERT(expectedC == foundC);
620         foundIndex    = utext_getPreviousNativeIndex(ut);
621         TEST_ASSERT(expectedIndex == foundIndex);
622         if (gFailed) {
623             return;
624         }
625     }
626     foundC = utext_next32(ut);
627     TEST_ASSERT(foundC == U_SENTINEL);
628 
629     // Repeat above, using macros
630     utext_setNativeIndex(ut, 0);
631     for (i=0; i<cpCount; i++) {
632         expectedIndex = cpMap[i].nativeIdx;
633         foundIndex    = UTEXT_GETNATIVEINDEX(ut);
634         TEST_ASSERT(expectedIndex == foundIndex);
635         expectedC     = cpMap[i].cp;
636         foundC        = UTEXT_NEXT32(ut);
637         TEST_ASSERT(expectedC == foundC);
638         if (gFailed) {
639             return;
640         }
641     }
642     foundC = UTEXT_NEXT32(ut);
643     TEST_ASSERT(foundC == U_SENTINEL);
644 
645     //
646     //  Forward iteration (above) should have left index at the
647     //   end of the input, which should == length().
648     //
649     len = utext_nativeLength(ut);
650     foundIndex  = utext_getNativeIndex(ut);
651     TEST_ASSERT(len == foundIndex);
652 
653     //
654     // Iterate backwards over entire test string
655     //
656     len = utext_getNativeIndex(ut);
657     utext_setNativeIndex(ut, len);
658     for (i=cpCount-1; i>=0; i--) {
659         expectedC     = cpMap[i].cp;
660         expectedIndex = cpMap[i].nativeIdx;
661         int64_t prevIndex = utext_getPreviousNativeIndex(ut);
662         foundC        = utext_previous32(ut);
663         foundIndex    = utext_getNativeIndex(ut);
664         TEST_ASSERT(expectedIndex == foundIndex);
665         TEST_ASSERT(expectedC == foundC);
666         TEST_ASSERT(prevIndex == foundIndex);
667         if (gFailed) {
668             return;
669         }
670     }
671 
672     //
673     //  Backwards iteration, above, should have left our iterator
674     //   position at zero, and continued backwards iterationshould fail.
675     //
676     foundIndex = utext_getNativeIndex(ut);
677     TEST_ASSERT(foundIndex == 0);
678     foundIndex = utext_getPreviousNativeIndex(ut);
679     TEST_ASSERT(foundIndex == 0);
680 
681 
682     foundC = utext_previous32(ut);
683     TEST_ASSERT(foundC == U_SENTINEL);
684     foundIndex = utext_getNativeIndex(ut);
685     TEST_ASSERT(foundIndex == 0);
686     foundIndex = utext_getPreviousNativeIndex(ut);
687     TEST_ASSERT(foundIndex == 0);
688 
689 
690     // And again, with the macros
691     utext_setNativeIndex(ut, len);
692     for (i=cpCount-1; i>=0; i--) {
693         expectedC     = cpMap[i].cp;
694         expectedIndex = cpMap[i].nativeIdx;
695         foundC        = UTEXT_PREVIOUS32(ut);
696         foundIndex    = UTEXT_GETNATIVEINDEX(ut);
697         TEST_ASSERT(expectedIndex == foundIndex);
698         TEST_ASSERT(expectedC == foundC);
699         if (gFailed) {
700             return;
701         }
702     }
703 
704     //
705     //  Backwards iteration, above, should have left our iterator
706     //   position at zero, and continued backwards iterationshould fail.
707     //
708     foundIndex = UTEXT_GETNATIVEINDEX(ut);
709     TEST_ASSERT(foundIndex == 0);
710 
711     foundC = UTEXT_PREVIOUS32(ut);
712     TEST_ASSERT(foundC == U_SENTINEL);
713     foundIndex = UTEXT_GETNATIVEINDEX(ut);
714     TEST_ASSERT(foundIndex == 0);
715     if (gFailed) {
716         return;
717     }
718 
719     //
720     //  next32From(), previous32From(), Iterate in a somewhat random order.
721     //
722     int  cpIndex = 0;
723     for (i=0; i<cpCount; i++) {
724         cpIndex = (cpIndex + 9973) % cpCount;
725         index         = cpMap[cpIndex].nativeIdx;
726         expectedC     = cpMap[cpIndex].cp;
727         foundC        = utext_next32From(ut, index);
728         TEST_ASSERT(expectedC == foundC);
729         if (gFailed) {
730             return;
731         }
732     }
733 
734     cpIndex = 0;
735     for (i=0; i<cpCount; i++) {
736         cpIndex = (cpIndex + 9973) % cpCount;
737         index         = cpMap[cpIndex+1].nativeIdx;
738         expectedC     = cpMap[cpIndex].cp;
739         foundC        = utext_previous32From(ut, index);
740         TEST_ASSERT(expectedC == foundC);
741         if (gFailed) {
742             return;
743         }
744     }
745 
746 
747     //
748     // moveIndex(int32_t delta);
749     //
750 
751     // Walk through frontwards, incrementing by one
752     utext_setNativeIndex(ut, 0);
753     for (i=1; i<=cpCount; i++) {
754         utext_moveIndex32(ut, 1);
755         index = utext_getNativeIndex(ut);
756         expectedIndex = cpMap[i].nativeIdx;
757         TEST_ASSERT(expectedIndex == index);
758         index = UTEXT_GETNATIVEINDEX(ut);
759         TEST_ASSERT(expectedIndex == index);
760     }
761 
762     // Walk through frontwards, incrementing by two
763     utext_setNativeIndex(ut, 0);
764     for (i=2; i<cpCount; i+=2) {
765         utext_moveIndex32(ut, 2);
766         index = utext_getNativeIndex(ut);
767         expectedIndex = cpMap[i].nativeIdx;
768         TEST_ASSERT(expectedIndex == index);
769         index = UTEXT_GETNATIVEINDEX(ut);
770         TEST_ASSERT(expectedIndex == index);
771     }
772 
773     // walk through the string backwards, decrementing by one.
774     i = cpMap[cpCount].nativeIdx;
775     utext_setNativeIndex(ut, i);
776     for (i=cpCount; i>=0; i--) {
777         expectedIndex = cpMap[i].nativeIdx;
778         index = utext_getNativeIndex(ut);
779         TEST_ASSERT(expectedIndex == index);
780         index = UTEXT_GETNATIVEINDEX(ut);
781         TEST_ASSERT(expectedIndex == index);
782         utext_moveIndex32(ut, -1);
783     }
784 
785 
786     // walk through backwards, decrementing by three
787     i = cpMap[cpCount].nativeIdx;
788     utext_setNativeIndex(ut, i);
789     for (i=cpCount; i>=0; i-=3) {
790         expectedIndex = cpMap[i].nativeIdx;
791         index = utext_getNativeIndex(ut);
792         TEST_ASSERT(expectedIndex == index);
793         index = UTEXT_GETNATIVEINDEX(ut);
794         TEST_ASSERT(expectedIndex == index);
795         utext_moveIndex32(ut, -3);
796     }
797 
798 
799     //
800     // Extract
801     //
802     int bufSize = us.length() + 10;
803     char16_t *buf = new char16_t[bufSize];
804     status = U_ZERO_ERROR;
805     expectedLen = us.length();
806     len = utext_extract(ut, 0, utlen, buf, bufSize, &status);
807     TEST_SUCCESS(status);
808     TEST_ASSERT(len == expectedLen);
809     int compareResult = us.compare(buf, -1);
810     TEST_ASSERT(compareResult == 0);
811 
812     status = U_ZERO_ERROR;
813     len = utext_extract(ut, 0, utlen, nullptr, 0, &status);
814     if (utlen == 0) {
815         TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
816     } else {
817         TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
818     }
819     TEST_ASSERT(len == expectedLen);
820 
821     status = U_ZERO_ERROR;
822     u_memset(buf, 0x5555, bufSize);
823     len = utext_extract(ut, 0, utlen, buf, 1, &status);
824     if (us.length() == 0) {
825         TEST_SUCCESS(status);
826         TEST_ASSERT(buf[0] == 0);
827     } else {
828         // Buf len == 1, extracting a single 16 bit value.
829         // If the data char is supplementary, it doesn't matter whether the buffer remains unchanged,
830         //   or whether the lead surrogate of the pair is extracted.
831         //   It's a buffer overflow error in either case.
832         TEST_ASSERT(buf[0] == us.charAt(0) ||
833                     (buf[0] == 0x5555 && U_IS_SUPPLEMENTARY(us.char32At(0))));
834         TEST_ASSERT(buf[1] == 0x5555);
835         if (us.length() == 1) {
836             TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
837         } else {
838             TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
839         }
840     }
841 
842     delete []buf;
843 }
844 
845 //
846 //  ErrorTest()    Check various error and edge cases.
847 //
ErrorTest()848 void UTextTest::ErrorTest()
849 {
850     // Close of an uninitialized UText.  Shouldn't blow up.
851     {
852         UText  ut;
853         memset(&ut, 0, sizeof(UText));
854         utext_close(&ut);
855         utext_close(nullptr);
856     }
857 
858     // Double-close of a UText.  Shouldn't blow up.  UText should still be usable.
859     {
860         UErrorCode status = U_ZERO_ERROR;
861         UText ut = UTEXT_INITIALIZER;
862         UnicodeString s("Hello, World");
863         UText *ut2 = utext_openUnicodeString(&ut, &s, &status);
864         TEST_SUCCESS(status);
865         TEST_ASSERT(ut2 == &ut);
866 
867         UText *ut3 = utext_close(&ut);
868         TEST_ASSERT(ut3 == &ut);
869 
870         UText *ut4 = utext_close(&ut);
871         TEST_ASSERT(ut4 == &ut);
872 
873         utext_openUnicodeString(&ut, &s, &status);
874         TEST_SUCCESS(status);
875         utext_close(&ut);
876     }
877 
878     // Re-use of a UText, chaining through each of the types of UText
879     //   (If it doesn't blow up, and doesn't leak, it's probably working fine)
880     {
881         UErrorCode status = U_ZERO_ERROR;
882         UText ut = UTEXT_INITIALIZER;
883         UText  *utp;
884         UnicodeString s1("Hello, World");
885         char16_t s2[] = {(char16_t)0x41, (char16_t)0x42, (char16_t)0};
886         const char  *s3 = "\x66\x67\x68";
887 
888         utp = utext_openUnicodeString(&ut, &s1, &status);
889         TEST_SUCCESS(status);
890         TEST_ASSERT(utp == &ut);
891 
892         utp = utext_openConstUnicodeString(&ut, &s1, &status);
893         TEST_SUCCESS(status);
894         TEST_ASSERT(utp == &ut);
895 
896         utp = utext_openUTF8(&ut, s3, -1, &status);
897         TEST_SUCCESS(status);
898         TEST_ASSERT(utp == &ut);
899 
900         utp = utext_openUChars(&ut, s2, -1, &status);
901         TEST_SUCCESS(status);
902         TEST_ASSERT(utp == &ut);
903 
904         utp = utext_close(&ut);
905         TEST_ASSERT(utp == &ut);
906 
907         utp = utext_openUnicodeString(&ut, &s1, &status);
908         TEST_SUCCESS(status);
909         TEST_ASSERT(utp == &ut);
910     }
911 
912     // Invalid parameters on open
913     //
914     {
915         UErrorCode status = U_ZERO_ERROR;
916         UText ut = UTEXT_INITIALIZER;
917 
918         utext_openUChars(&ut, nullptr, 5, &status);
919         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
920 
921         status = U_ZERO_ERROR;
922         utext_openUChars(&ut, nullptr, -1, &status);
923         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
924 
925         status = U_ZERO_ERROR;
926         utext_openUTF8(&ut, nullptr, 4, &status);
927         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
928 
929         status = U_ZERO_ERROR;
930         utext_openUTF8(&ut, nullptr, -1, &status);
931         TEST_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
932     }
933 
934     //
935     //  UTF-8 with malformed sequences.
936     //    These should come through as the Unicode replacement char, \ufffd
937     //
938     {
939         UErrorCode status = U_ZERO_ERROR;
940         UText *ut = nullptr;
941         const char *badUTF8 = "\x41\x81\x42\xf0\x81\x81\x43";
942         UChar32  c;
943 
944         ut = utext_openUTF8(nullptr, badUTF8, -1, &status);
945         TEST_SUCCESS(status);
946         c = utext_char32At(ut, 1);
947         TEST_ASSERT(c == 0xfffd);
948         c = utext_char32At(ut, 3);
949         TEST_ASSERT(c == 0xfffd);
950         c = utext_char32At(ut, 5);
951         TEST_ASSERT(c == 0xfffd);
952         c = utext_char32At(ut, 6);
953         TEST_ASSERT(c == 0x43);
954 
955         char16_t buf[10];
956         int n = utext_extract(ut, 0, 9, buf, 10, &status);
957         TEST_SUCCESS(status);
958         TEST_ASSERT(n==7);
959         TEST_ASSERT(buf[0] == 0x41);
960         TEST_ASSERT(buf[1] == 0xfffd);
961         TEST_ASSERT(buf[2] == 0x42);
962         TEST_ASSERT(buf[3] == 0xfffd);
963         TEST_ASSERT(buf[4] == 0xfffd);
964         TEST_ASSERT(buf[5] == 0xfffd);
965         TEST_ASSERT(buf[6] == 0x43);
966         utext_close(ut);
967     }
968 
969 
970     //
971     //  isLengthExpensive - does it make the expected transitions after
972     //                      getting the length of a nul terminated string?
973     //
974     {
975         UErrorCode status = U_ZERO_ERROR;
976         UnicodeString sa("Hello, this is a string");
977         UBool  isExpensive;
978 
979         char16_t sb[100];
980         memset(sb, 0x20, sizeof(sb));
981         sb[99] = 0;
982 
983         UText *uta = utext_openUnicodeString(nullptr, &sa, &status);
984         TEST_SUCCESS(status);
985         isExpensive = utext_isLengthExpensive(uta);
986         TEST_ASSERT(isExpensive == false);
987         utext_close(uta);
988 
989         UText *utb = utext_openUChars(nullptr, sb, -1, &status);
990         TEST_SUCCESS(status);
991         isExpensive = utext_isLengthExpensive(utb);
992         TEST_ASSERT(isExpensive == true);
993         int64_t  len = utext_nativeLength(utb);
994         TEST_ASSERT(len == 99);
995         isExpensive = utext_isLengthExpensive(utb);
996         TEST_ASSERT(isExpensive == false);
997         utext_close(utb);
998     }
999 
1000     //
1001     // Index to positions not on code point boundaries.
1002     //
1003     {
1004         const char *u8str =         "\xc8\x81\xe1\x82\x83\xf1\x84\x85\x86";
1005         int32_t startMap[] =        {   0,  0,  2,  2,  2,  5,  5,  5,  5,  9,  9};
1006         int32_t nextMap[]  =        {   2,  2,  5,  5,  5,  9,  9,  9,  9,  9,  9};
1007         int32_t prevMap[]  =        {   0,  0,  0,  0,  0,  2,  2,  2,  2,  5,  5};
1008         UChar32  c32Map[] =    {0x201, 0x201, 0x1083, 0x1083, 0x1083, 0x044146, 0x044146, 0x044146, 0x044146, -1, -1};
1009         UChar32  pr32Map[] =   {    -1,   -1,  0x201,  0x201,  0x201,   0x1083,   0x1083,   0x1083,   0x1083, 0x044146, 0x044146};
1010 
1011         // extractLen is the size, in UChars, of what will be extracted between index and index+1.
1012         //  is zero when both index positions lie within the same code point.
1013         int32_t  exLen[] =          {   0,  1,   0,  0,  1,  0,  0,  0,  2,  0,  0};
1014 
1015 
1016         UErrorCode status = U_ZERO_ERROR;
1017         UText *ut = utext_openUTF8(nullptr, u8str, -1, &status);
1018         TEST_SUCCESS(status);
1019 
1020         // Check setIndex
1021         int32_t i;
1022         int32_t startMapLimit = UPRV_LENGTHOF(startMap);
1023         for (i=0; i<startMapLimit; i++) {
1024             utext_setNativeIndex(ut, i);
1025             int64_t cpIndex = utext_getNativeIndex(ut);
1026             TEST_ASSERT(cpIndex == startMap[i]);
1027             cpIndex = UTEXT_GETNATIVEINDEX(ut);
1028             TEST_ASSERT(cpIndex == startMap[i]);
1029         }
1030 
1031         // Check char32At
1032         for (i=0; i<startMapLimit; i++) {
1033             UChar32 c32 = utext_char32At(ut, i);
1034             TEST_ASSERT(c32 == c32Map[i]);
1035             int64_t cpIndex = utext_getNativeIndex(ut);
1036             TEST_ASSERT(cpIndex == startMap[i]);
1037         }
1038 
1039         // Check utext_next32From
1040         for (i=0; i<startMapLimit; i++) {
1041             UChar32 c32 = utext_next32From(ut, i);
1042             TEST_ASSERT(c32 == c32Map[i]);
1043             int64_t cpIndex = utext_getNativeIndex(ut);
1044             TEST_ASSERT(cpIndex == nextMap[i]);
1045         }
1046 
1047         // check utext_previous32From
1048         for (i=0; i<startMapLimit; i++) {
1049             gTestNum++;
1050             UChar32 c32 = utext_previous32From(ut, i);
1051             TEST_ASSERT(c32 == pr32Map[i]);
1052             int64_t cpIndex = utext_getNativeIndex(ut);
1053             TEST_ASSERT(cpIndex == prevMap[i]);
1054         }
1055 
1056         // check Extract
1057         //   Extract from i to i+1, which may be zero or one code points,
1058         //     depending on whether the indices straddle a cp boundary.
1059         for (i=0; i<startMapLimit; i++) {
1060             char16_t buf[3];
1061             status = U_ZERO_ERROR;
1062             int32_t  extractedLen = utext_extract(ut, i, i+1, buf, 3, &status);
1063             TEST_SUCCESS(status);
1064             TEST_ASSERT(extractedLen == exLen[i]);
1065             if (extractedLen > 0) {
1066                 UChar32  c32;
1067                 /* extractedLen-extractedLen == 0 is used to get around a compiler warning. */
1068                 U16_GET(buf, 0, extractedLen-extractedLen, extractedLen, c32);
1069                 TEST_ASSERT(c32 == c32Map[i]);
1070             }
1071         }
1072 
1073         utext_close(ut);
1074     }
1075 
1076 
1077     {    //  Similar test, with utf16 instead of utf8
1078          //  TODO:  merge the common parts of these tests.
1079 
1080         UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000", -1, US_INV);
1081         int32_t startMap[]  ={ 0,     1,   1,    3,     4,  4,     6,  6};
1082         int32_t nextMap[]  = { 1,     3,   3,    4,     6,  6,     6,  6};
1083         int32_t prevMap[]  = { 0,     0,   0,    1,     3,  3,     4,  4};
1084         UChar32  c32Map[] =  {0x1000, 0x11000, 0x11000, 0x2000,  0x22000, 0x22000, -1, -1};
1085         UChar32  pr32Map[] = {    -1, 0x1000,  0x1000,  0x11000, 0x2000,  0x2000,   0x22000,   0x22000};
1086         int32_t  exLen[] =   {   1,  0,   2,  1,  0,  2,  0,  0,};
1087 
1088         u16str = u16str.unescape();
1089         UErrorCode status = U_ZERO_ERROR;
1090         UText *ut = utext_openUnicodeString(nullptr, &u16str, &status);
1091         TEST_SUCCESS(status);
1092 
1093         int32_t startMapLimit = UPRV_LENGTHOF(startMap);
1094         int i;
1095         for (i=0; i<startMapLimit; i++) {
1096             utext_setNativeIndex(ut, i);
1097             int64_t cpIndex = utext_getNativeIndex(ut);
1098             TEST_ASSERT(cpIndex == startMap[i]);
1099         }
1100 
1101         // Check char32At
1102         for (i=0; i<startMapLimit; i++) {
1103             UChar32 c32 = utext_char32At(ut, i);
1104             TEST_ASSERT(c32 == c32Map[i]);
1105             int64_t cpIndex = utext_getNativeIndex(ut);
1106             TEST_ASSERT(cpIndex == startMap[i]);
1107         }
1108 
1109         // Check utext_next32From
1110         for (i=0; i<startMapLimit; i++) {
1111             UChar32 c32 = utext_next32From(ut, i);
1112             TEST_ASSERT(c32 == c32Map[i]);
1113             int64_t cpIndex = utext_getNativeIndex(ut);
1114             TEST_ASSERT(cpIndex == nextMap[i]);
1115         }
1116 
1117         // check utext_previous32From
1118         for (i=0; i<startMapLimit; i++) {
1119             UChar32 c32 = utext_previous32From(ut, i);
1120             TEST_ASSERT(c32 == pr32Map[i]);
1121             int64_t cpIndex = utext_getNativeIndex(ut);
1122             TEST_ASSERT(cpIndex == prevMap[i]);
1123         }
1124 
1125         // check Extract
1126         //   Extract from i to i+1, which may be zero or one code points,
1127         //     depending on whether the indices straddle a cp boundary.
1128         for (i=0; i<startMapLimit; i++) {
1129             char16_t buf[3];
1130             status = U_ZERO_ERROR;
1131             int32_t  extractedLen = utext_extract(ut, i, i+1, buf, 3, &status);
1132             TEST_SUCCESS(status);
1133             TEST_ASSERT(extractedLen == exLen[i]);
1134             if (extractedLen > 0) {
1135                 UChar32  c32;
1136                 /* extractedLen-extractedLen == 0 is used to get around a compiler warning. */
1137                 U16_GET(buf, 0, extractedLen-extractedLen, extractedLen, c32);
1138                 TEST_ASSERT(c32 == c32Map[i]);
1139             }
1140         }
1141 
1142         utext_close(ut);
1143     }
1144 
1145     {    //  Similar test, with UText over Replaceable
1146          //  TODO:  merge the common parts of these tests.
1147 
1148         UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000", -1, US_INV);
1149         int32_t startMap[]  ={ 0,     1,   1,    3,     4,  4,     6,  6};
1150         int32_t nextMap[]  = { 1,     3,   3,    4,     6,  6,     6,  6};
1151         int32_t prevMap[]  = { 0,     0,   0,    1,     3,  3,     4,  4};
1152         UChar32  c32Map[] =  {0x1000, 0x11000, 0x11000, 0x2000,  0x22000, 0x22000, -1, -1};
1153         UChar32  pr32Map[] = {    -1, 0x1000,  0x1000,  0x11000, 0x2000,  0x2000,   0x22000,   0x22000};
1154         int32_t  exLen[] =   {   1,  0,   2,  1,  0,  2,  0,  0,};
1155 
1156         u16str = u16str.unescape();
1157         UErrorCode status = U_ZERO_ERROR;
1158         UText *ut = utext_openReplaceable(nullptr, &u16str, &status);
1159         TEST_SUCCESS(status);
1160 
1161         int32_t startMapLimit = UPRV_LENGTHOF(startMap);
1162         int i;
1163         for (i=0; i<startMapLimit; i++) {
1164             utext_setNativeIndex(ut, i);
1165             int64_t cpIndex = utext_getNativeIndex(ut);
1166             TEST_ASSERT(cpIndex == startMap[i]);
1167         }
1168 
1169         // Check char32At
1170         for (i=0; i<startMapLimit; i++) {
1171             UChar32 c32 = utext_char32At(ut, i);
1172             TEST_ASSERT(c32 == c32Map[i]);
1173             int64_t cpIndex = utext_getNativeIndex(ut);
1174             TEST_ASSERT(cpIndex == startMap[i]);
1175         }
1176 
1177         // Check utext_next32From
1178         for (i=0; i<startMapLimit; i++) {
1179             UChar32 c32 = utext_next32From(ut, i);
1180             TEST_ASSERT(c32 == c32Map[i]);
1181             int64_t cpIndex = utext_getNativeIndex(ut);
1182             TEST_ASSERT(cpIndex == nextMap[i]);
1183         }
1184 
1185         // check utext_previous32From
1186         for (i=0; i<startMapLimit; i++) {
1187             UChar32 c32 = utext_previous32From(ut, i);
1188             TEST_ASSERT(c32 == pr32Map[i]);
1189             int64_t cpIndex = utext_getNativeIndex(ut);
1190             TEST_ASSERT(cpIndex == prevMap[i]);
1191         }
1192 
1193         // check Extract
1194         //   Extract from i to i+1, which may be zero or one code points,
1195         //     depending on whether the indices straddle a cp boundary.
1196         for (i=0; i<startMapLimit; i++) {
1197             char16_t buf[3];
1198             status = U_ZERO_ERROR;
1199             int32_t  extractedLen = utext_extract(ut, i, i+1, buf, 3, &status);
1200             TEST_SUCCESS(status);
1201             TEST_ASSERT(extractedLen == exLen[i]);
1202             if (extractedLen > 0) {
1203                 UChar32  c32;
1204                 /* extractedLen-extractedLen == 0 is used to get around a compiler warning. */
1205                 U16_GET(buf, 0, extractedLen-extractedLen, extractedLen, c32);
1206                 TEST_ASSERT(c32 == c32Map[i]);
1207             }
1208         }
1209 
1210         utext_close(ut);
1211     }
1212 }
1213 
1214 
FreezeTest()1215 void UTextTest::FreezeTest() {
1216     // Check isWritable() and freeze() behavior.
1217     //
1218 
1219     UnicodeString  ustr("Hello, World.");
1220     const char u8str[] = {char(0x31), (char)0x32, (char)0x33, 0};
1221     const char16_t u16str[] = {(char16_t)0x31, (char16_t)0x32, (char16_t)0x44, 0};
1222 
1223     UErrorCode status = U_ZERO_ERROR;
1224     UText  *ut        = nullptr;
1225     UText  *ut2       = nullptr;
1226 
1227     ut = utext_openUTF8(ut, u8str, -1, &status);
1228     TEST_SUCCESS(status);
1229     UBool writable = utext_isWritable(ut);
1230     TEST_ASSERT(writable == false);
1231     utext_copy(ut, 1, 2, 0, true, &status);
1232     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1233 
1234     status = U_ZERO_ERROR;
1235     ut = utext_openUChars(ut, u16str, -1, &status);
1236     TEST_SUCCESS(status);
1237     writable = utext_isWritable(ut);
1238     TEST_ASSERT(writable == false);
1239     utext_copy(ut, 1, 2, 0, true, &status);
1240     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1241 
1242     status = U_ZERO_ERROR;
1243     ut = utext_openUnicodeString(ut, &ustr, &status);
1244     TEST_SUCCESS(status);
1245     writable = utext_isWritable(ut);
1246     TEST_ASSERT(writable == true);
1247     utext_freeze(ut);
1248     writable = utext_isWritable(ut);
1249     TEST_ASSERT(writable == false);
1250     utext_copy(ut, 1, 2, 0, true, &status);
1251     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1252 
1253     status = U_ZERO_ERROR;
1254     ut = utext_openUnicodeString(ut, &ustr, &status);
1255     TEST_SUCCESS(status);
1256     ut2 = utext_clone(ut2, ut, false, false, &status);  // clone with readonly = false
1257     TEST_SUCCESS(status);
1258     writable = utext_isWritable(ut2);
1259     TEST_ASSERT(writable == true);
1260     ut2 = utext_clone(ut2, ut, false, true, &status);  // clone with readonly = true
1261     TEST_SUCCESS(status);
1262     writable = utext_isWritable(ut2);
1263     TEST_ASSERT(writable == false);
1264     utext_copy(ut2, 1, 2, 0, true, &status);
1265     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1266 
1267     status = U_ZERO_ERROR;
1268     ut = utext_openConstUnicodeString(ut, &ustr, &status);
1269     TEST_SUCCESS(status);
1270     writable = utext_isWritable(ut);
1271     TEST_ASSERT(writable == false);
1272     utext_copy(ut, 1, 2, 0, true, &status);
1273     TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
1274 
1275     // Deep Clone of a frozen UText should re-enable writing in the copy.
1276     status = U_ZERO_ERROR;
1277     ut = utext_openUnicodeString(ut, &ustr, &status);
1278     TEST_SUCCESS(status);
1279     utext_freeze(ut);
1280     ut2 = utext_clone(ut2, ut, true, false, &status);   // deep clone
1281     TEST_SUCCESS(status);
1282     writable = utext_isWritable(ut2);
1283     TEST_ASSERT(writable == true);
1284 
1285 
1286     // Deep clone of a frozen UText, where the base type is intrinsically non-writable,
1287     //  should NOT enable writing in the copy.
1288     status = U_ZERO_ERROR;
1289     ut = utext_openUChars(ut, u16str, -1, &status);
1290     TEST_SUCCESS(status);
1291     utext_freeze(ut);
1292     ut2 = utext_clone(ut2, ut, true, false, &status);   // deep clone
1293     TEST_SUCCESS(status);
1294     writable = utext_isWritable(ut2);
1295     TEST_ASSERT(writable == false);
1296 
1297     // cleanup
1298     utext_close(ut);
1299     utext_close(ut2);
1300 }
1301 
1302 
1303 //
1304 //  Fragmented UText
1305 //      A UText type that works with a chunk size of 1.
1306 //      Intended to test for edge cases.
1307 //      Input comes from a UnicodeString.
1308 //
1309 //       ut.b    the character.  Put into both halves.
1310 //
1311 
1312 U_CDECL_BEGIN
1313 static UBool U_CALLCONV
fragTextAccess(UText * ut,int64_t index,UBool forward)1314 fragTextAccess(UText *ut, int64_t index, UBool forward) {
1315     const UnicodeString *us = static_cast<const UnicodeString *>(ut->context);
1316     char16_t c;
1317     int32_t length = us->length();
1318     if (forward && index>=0 && index<length) {
1319         c = us->charAt((int32_t)index);
1320         ut->b = c | c<<16;
1321         ut->chunkOffset = 0;
1322         ut->chunkLength = 1;
1323         ut->chunkNativeStart = index;
1324         ut->chunkNativeLimit = index+1;
1325         return true;
1326     }
1327     if (!forward && index>0 && index <=length) {
1328         c = us->charAt((int32_t)index-1);
1329         ut->b = c | c<<16;
1330         ut->chunkOffset = 1;
1331         ut->chunkLength = 1;
1332         ut->chunkNativeStart = index-1;
1333         ut->chunkNativeLimit = index;
1334         return true;
1335     }
1336     ut->b = 0;
1337     ut->chunkOffset = 0;
1338     ut->chunkLength = 0;
1339     if (index <= 0) {
1340         ut->chunkNativeStart = 0;
1341         ut->chunkNativeLimit = 0;
1342     } else {
1343         ut->chunkNativeStart = length;
1344         ut->chunkNativeLimit = length;
1345     }
1346     return false;
1347 }
1348 
1349 // Function table to be used with this fragmented text provider.
1350 //   Initialized in the open function.
1351 static UTextFuncs  fragmentFuncs;
1352 
1353 // Clone function for fragmented text provider.
1354 //   Didn't really want to provide this, but it's easier to provide it than to keep it
1355 //   out of the tests.
1356 //
1357 UText *
cloneFragmentedUnicodeString(UText * dest,const UText * src,UBool deep,UErrorCode * status)1358 cloneFragmentedUnicodeString(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
1359     if (U_FAILURE(*status)) {
1360         return nullptr;
1361     }
1362     if (deep) {
1363         *status = U_UNSUPPORTED_ERROR;
1364         return nullptr;
1365     }
1366     dest = utext_openUnicodeString(dest, static_cast<UnicodeString *>(const_cast<void*>(src->context)), status);
1367     utext_setNativeIndex(dest, utext_getNativeIndex(src));
1368     return dest;
1369 }
1370 
1371 U_CDECL_END
1372 
1373 // Open function for the fragmented text provider.
1374 UText *
openFragmentedUnicodeString(UText * ut,UnicodeString * s,UErrorCode * status)1375 openFragmentedUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
1376     ut = utext_openUnicodeString(ut, s, status);
1377     if (U_FAILURE(*status)) {
1378         return ut;
1379     }
1380 
1381     // Copy of the function table from the stock UnicodeString UText,
1382     //   and replace the entry for the access function.
1383     memcpy(&fragmentFuncs, ut->pFuncs, sizeof(fragmentFuncs));
1384     fragmentFuncs.access = fragTextAccess;
1385     fragmentFuncs.clone  = cloneFragmentedUnicodeString;
1386     ut->pFuncs = &fragmentFuncs;
1387 
1388     ut->chunkContents = (char16_t *)&ut->b;
1389     ut->pFuncs->access(ut, 0, true);
1390     return ut;
1391 }
1392 
1393 // Regression test for Ticket 5560
1394 //   Clone fails to update chunkContentPointer in the cloned copy.
1395 //   This is only an issue for UText types that work in a local buffer,
1396 //      (UTF-8 wrapper, for example)
1397 //
1398 //   The test:
1399 //     1.  Create an initial UText
1400 //     2.  Deep clone it.  Contents should match original.
1401 //     3.  Reset original to something different.
1402 //     4.  Check that clone contents did not change.
1403 //
Ticket5560()1404 void UTextTest::Ticket5560() {
1405     /* The following two strings are in UTF-8 even on EBCDIC platforms. */
1406     static const char s1[] = {0x41,0x42,0x43,0x44,0x45,0x46,0}; /* "ABCDEF" */
1407     static const char s2[] = {0x31,0x32,0x33,0x34,0x35,0x36,0}; /* "123456" */
1408 	UErrorCode status = U_ZERO_ERROR;
1409 
1410 	UText ut1 = UTEXT_INITIALIZER;
1411 	UText ut2 = UTEXT_INITIALIZER;
1412 
1413 	utext_openUTF8(&ut1, s1, -1, &status);
1414 	char16_t c = utext_next32(&ut1);
1415 	TEST_ASSERT(c == 0x41);  // c == 'A'
1416 
1417 	utext_clone(&ut2, &ut1, true, false, &status);
1418 	TEST_SUCCESS(status);
1419     c = utext_next32(&ut2);
1420 	TEST_ASSERT(c == 0x42);  // c == 'B'
1421     c = utext_next32(&ut1);
1422 	TEST_ASSERT(c == 0x42);  // c == 'B'
1423 
1424 	utext_openUTF8(&ut1, s2, -1, &status);
1425 	c = utext_next32(&ut1);
1426 	TEST_ASSERT(c == 0x31);  // c == '1'
1427     c = utext_next32(&ut2);
1428 	TEST_ASSERT(c == 0x43);  // c == 'C'
1429 
1430     utext_close(&ut1);
1431     utext_close(&ut2);
1432 }
1433 
1434 
1435 // Test for Ticket 6847
1436 //
Ticket6847()1437 void UTextTest::Ticket6847() {
1438     const int STRLEN = 90;
1439     char16_t s[STRLEN+1];
1440     u_memset(s, 0x41, STRLEN);
1441     s[STRLEN] = 0;
1442 
1443     UErrorCode status = U_ZERO_ERROR;
1444     UText *ut = utext_openUChars(nullptr, s, -1, &status);
1445 
1446     utext_setNativeIndex(ut, 0);
1447     int32_t count = 0;
1448     UChar32 c = 0;
1449     int64_t nativeIndex = UTEXT_GETNATIVEINDEX(ut);
1450     TEST_ASSERT(nativeIndex == 0);
1451     while ((c = utext_next32(ut)) != U_SENTINEL) {
1452         TEST_ASSERT(c == 0x41);
1453         TEST_ASSERT(count < STRLEN);
1454         if (count >= STRLEN) {
1455             break;
1456         }
1457         count++;
1458         nativeIndex = UTEXT_GETNATIVEINDEX(ut);
1459         TEST_ASSERT(nativeIndex == count);
1460     }
1461     TEST_ASSERT(count == STRLEN);
1462     nativeIndex = UTEXT_GETNATIVEINDEX(ut);
1463     TEST_ASSERT(nativeIndex == STRLEN);
1464     utext_close(ut);
1465 }
1466 
1467 
Ticket10562()1468 void UTextTest::Ticket10562() {
1469     // Note: failures show as a heap error when the test is run under valgrind.
1470     UErrorCode status = U_ZERO_ERROR;
1471 
1472     const char *utf8_string = "\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41\x41";
1473     UText *utf8Text = utext_openUTF8(nullptr, utf8_string, -1, &status);
1474     TEST_SUCCESS(status);
1475     UText *deepClone = utext_clone(nullptr, utf8Text, true, false, &status);
1476     TEST_SUCCESS(status);
1477     UText *shallowClone = utext_clone(nullptr, deepClone, false, false, &status);
1478     TEST_SUCCESS(status);
1479     utext_close(shallowClone);
1480     utext_close(deepClone);
1481     utext_close(utf8Text);
1482 
1483     status = U_ZERO_ERROR;
1484     UnicodeString usString("Hello, World.");
1485     UText *usText = utext_openUnicodeString(nullptr, &usString, &status);
1486     TEST_SUCCESS(status);
1487     UText *usDeepClone = utext_clone(nullptr, usText, true, false, &status);
1488     TEST_SUCCESS(status);
1489     UText *usShallowClone = utext_clone(nullptr, usDeepClone, false, false, &status);
1490     TEST_SUCCESS(status);
1491     utext_close(usShallowClone);
1492     utext_close(usDeepClone);
1493     utext_close(usText);
1494 }
1495 
1496 
Ticket10983()1497 void UTextTest::Ticket10983() {
1498     // Note: failure shows as a seg fault when the defect is present.
1499 
1500     UErrorCode status = U_ZERO_ERROR;
1501     UnicodeString s("Hello, World");
1502     UText *ut = utext_openConstUnicodeString(nullptr, &s, &status);
1503     TEST_SUCCESS(status);
1504 
1505     status = U_INVALID_STATE_ERROR;
1506     UText *cloned = utext_clone(nullptr, ut, true, true, &status);
1507     TEST_ASSERT(cloned == nullptr);
1508     TEST_ASSERT(status == U_INVALID_STATE_ERROR);
1509 
1510     utext_close(ut);
1511 }
1512 
1513 // Ticket 12130 - extract on a UText wrapping a null terminated char16_t * string
1514 //                leaves the iteration position set incorrectly when the
1515 //                actual string length is not yet known.
1516 //
1517 //                The test text needs to be long enough that UText defers getting the length.
1518 
Ticket12130()1519 void UTextTest::Ticket12130() {
1520     UErrorCode status = U_ZERO_ERROR;
1521 
1522     const char *text8 =
1523         "Fundamentally, computers just deal with numbers. They store letters and other characters "
1524         "by assigning a number for each one. Before Unicode was invented, there were hundreds "
1525         "of different encoding systems for assigning these numbers. No single encoding could "
1526         "contain enough characters: for example, the European Union alone requires several "
1527         "different encodings to cover all its languages. Even for a single language like "
1528         "English no single encoding was adequate for all the letters, punctuation, and technical "
1529         "symbols in common use.";
1530 
1531     UnicodeString str(text8);
1532     const char16_t *ustr = str.getTerminatedBuffer();
1533     UText ut = UTEXT_INITIALIZER;
1534     utext_openUChars(&ut, ustr, -1, &status);
1535     char16_t extractBuffer[50];
1536 
1537     for (int32_t startIdx = 0; startIdx<str.length(); ++startIdx) {
1538         int32_t endIdx = startIdx + 20;
1539 
1540         u_memset(extractBuffer, 0, UPRV_LENGTHOF(extractBuffer));
1541         utext_extract(&ut, startIdx, endIdx, extractBuffer, UPRV_LENGTHOF(extractBuffer), &status);
1542         if (U_FAILURE(status)) {
1543             errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1544             return;
1545         }
1546         int64_t ni  = utext_getNativeIndex(&ut);
1547         int64_t expectedni = startIdx + 20;
1548         if (expectedni > str.length()) {
1549             expectedni = str.length();
1550         }
1551         if (expectedni != ni) {
1552             errln("%s:%d utext_getNativeIndex() expected %d, got %d", __FILE__, __LINE__, expectedni, ni);
1553         }
1554         if (0 != str.tempSubString(startIdx, 20).compare(extractBuffer)) {
1555             errln("%s:%d utext_extract() failed. expected \"%s\", got \"%s\"",
1556                     __FILE__, __LINE__, CStr(str.tempSubString(startIdx, 20))(), CStr(UnicodeString(extractBuffer))());
1557         }
1558     }
1559     utext_close(&ut);
1560 
1561     // Similar utext extract, this time with the string length provided to the UText in advance,
1562     // and a buffer of larger than required capacity.
1563 
1564     utext_openUChars(&ut, ustr, str.length(), &status);
1565     for (int32_t startIdx = 0; startIdx<str.length(); ++startIdx) {
1566         int32_t endIdx = startIdx + 20;
1567         u_memset(extractBuffer, 0, UPRV_LENGTHOF(extractBuffer));
1568         utext_extract(&ut, startIdx, endIdx, extractBuffer, UPRV_LENGTHOF(extractBuffer), &status);
1569         if (U_FAILURE(status)) {
1570             errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1571             return;
1572         }
1573         int64_t ni  = utext_getNativeIndex(&ut);
1574         int64_t expectedni = startIdx + 20;
1575         if (expectedni > str.length()) {
1576             expectedni = str.length();
1577         }
1578         if (expectedni != ni) {
1579             errln("%s:%d utext_getNativeIndex() expected %d, got %d", __FILE__, __LINE__, expectedni, ni);
1580         }
1581         if (0 != str.tempSubString(startIdx, 20).compare(extractBuffer)) {
1582             errln("%s:%d utext_extract() failed. expected \"%s\", got \"%s\"",
1583                     __FILE__, __LINE__, CStr(str.tempSubString(startIdx, 20))(), CStr(UnicodeString(extractBuffer))());
1584         }
1585     }
1586     utext_close(&ut);
1587 }
1588 
1589 // Ticket 13344 The macro form of UTEXT_SETNATIVEINDEX failed when target was a trail surrogate
1590 //              of a supplementary character.
1591 
Ticket13344()1592 void UTextTest::Ticket13344() {
1593     UErrorCode status = U_ZERO_ERROR;
1594     const char16_t *str = u"abc\U0010abcd xyz";
1595     LocalUTextPointer ut(utext_openUChars(nullptr, str, -1, &status));
1596 
1597     assertSuccess("UTextTest::Ticket13344-status", status);
1598     UTEXT_SETNATIVEINDEX(ut.getAlias(), 3);
1599     assertEquals("UTextTest::Ticket13344-lead", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1600     UTEXT_SETNATIVEINDEX(ut.getAlias(), 4);
1601     assertEquals("UTextTest::Ticket13344-trail", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1602     UTEXT_SETNATIVEINDEX(ut.getAlias(), 5);
1603     assertEquals("UTextTest::Ticket13344-bmp", (int64_t)5, utext_getNativeIndex(ut.getAlias()));
1604 
1605     utext_setNativeIndex(ut.getAlias(), 3);
1606     assertEquals("UTextTest::Ticket13344-lead-2", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1607     utext_setNativeIndex(ut.getAlias(), 4);
1608     assertEquals("UTextTest::Ticket13344-trail-2", (int64_t)3, utext_getNativeIndex(ut.getAlias()));
1609     utext_setNativeIndex(ut.getAlias(), 5);
1610     assertEquals("UTextTest::Ticket13344-bmp-2", (int64_t)5, utext_getNativeIndex(ut.getAlias()));
1611 }
1612 
1613 // ICU-21653 UText does not handle access callback that changes chunk size
1614 
static const char16_t testAccessText[] = { // text with surrogates at chunk boundaries
    // Eight code units per row; each comment gives the UTF-16 index range of its row.
    0xDC00,0xE001,0xE002,0xD83D,0xDE00,0xE005,0xE006,0xE007, // 000-007, unpaired trail at 0
    0xE008,0xE009,0xE00A,0xD83D,0xDE00,0xE00D,0xE00E,0xE00F, // 008-015
    0xE010,0xE011,0xE012,0xD83D,0xDE00,0xE015,0xE016,0xE017, // 016-023
    0xE018,0xE019,0xE01A,0xD83D,0xDE00,0xE01D,0xE01E,0xD800, // 024-031, paired lead at 31 with
    0xDC01,0xE021,0xE022,0xD83D,0xDE00,0xE025,0xE026,0xE027, // 032-039, paired trail at 32
    0xE028,0xE029,0xE02A,0xD83D,0xDE00,0xE02D,0xE02E,0xE02F, // 040-047
    0xE030,0xE031,0xE032,0xD83D,0xDE00,0xE035,0xE036,0xE037, // 048-055
    0xE038,0xE039,0xE03A,0xD83D,0xDE00,0xE03D,0xE03E,0xE03F, // 056-063
    0xDC02,0xE041,0xE042,0xD83D,0xDE00,0xE045,0xE046,0xE047, // 064-071, unpaired trail at 64
    0xE048,0xE049,0xE04A,0xD83D,0xDE00,0xE04D,0xE04E,0xE04F, // 072-079
    0xE050,0xE051,0xE052,0xD83D,0xDE00,0xE055,0xE056,0xE057, // 080-087
    0xE058,0xE059,0xE05A,0xD83D,0xDE00,0xE05D,0xE05E,0xD801, // 088-095, unpaired lead at 95
    0xE060,0xE061,0xE062,0xD83D,0xDE00,0xE065,0xE066,0xE067, // 096-103
    0xE068,0xE069,0xE06A,0xD83D,0xDE00,0xE06D,0xE06E,0xE06F, // 104-111
    0xE070,0xE071,0xE072,0xD83D,0xDE00,0xE075,0xE076,0xE077, // 112-119
    0xE078,0xE079,0xE07A,0xD83D,0xDE00,0xE07D,0xE07E,0xD802, // 120-127, unpaired lead at 127
};
1625 
1626 static const UChar32 testAccess32Text[] = { // same as above in UTF32
1627     0xDC00,0xe001,0xe002,0x1F600,0xe005,0xe006,0xe007, 0xe008,0xe009,0xe00a,0x1F600,0xe00d,0xe00e,0xe00f, // 000-013, unpaired trail at 0
1628     0xE010,0xe011,0xe012,0x1F600,0xe015,0xe016,0xe017, 0xe018,0xe019,0xe01a,0x1F600,0xe01d,0xe01e,0x10001, // 014-027, nonBMP at 27, will split in chunks
1629            0xe021,0xe022,0x1F600,0xe025,0xe026,0xe027, 0xe028,0xe029,0xe02a,0x1F600,0xe02d,0xe02e,0xe02f, // 028-040
1630     0xe030,0xe031,0xe032,0x1F600,0xe035,0xe036,0xe037, 0xe038,0xe039,0xe03a,0x1F600,0xe03d,0xe03e,0xe03f, // 041-054
1631     0xDC02,0xe041,0xe042,0x1F600,0xe045,0xe046,0xe047, 0xe048,0xe049,0xe04a,0x1F600,0xe04d,0xe04e,0xe04f, // 055-068, unpaired trail at 55
1632     0xe050,0xe051,0xe052,0x1F600,0xe055,0xe056,0xe057, 0xe058,0xe059,0xe05a,0x1F600,0xe05d,0xe05e,0xD801, // 069-082, unpaired lead at 82
1633     0xe060,0xe061,0xe062,0x1F600,0xe065,0xe066,0xe067, 0xe068,0xe069,0xe06a,0x1F600,0xe06d,0xe06e,0xe06f, // 083-096
1634     0xE070,0xe071,0xe072,0x1F600,0xe075,0xe076,0xe077, 0xe078,0xe079,0xe07a,0x1F600,0xe07d,0xe07e,0xD802, // 097-110, unpaired lead at 110
1635 };
1636 
// Tuning constants for the custom access functions used by this test.
enum {
    // Chunk length used when the requested index is more than
    // kTextAccessGapSize away from the current chunk.
    kTestAccessSmallChunkSize = 8,
    // Chunk length used for the first access and for nearby accesses.
    kTestAccessLargeChunkSize = 32,
    // Largest distance from the current chunk still served with the large chunk size.
    kTextAccessGapSize = 2
};
1642 
1643 typedef struct {
1644     int64_t nativeOffset;
1645     UChar32 expectChar;
1646 } OffsetAndChar;
1647 
1648 static const OffsetAndChar testAccessEntries[] = { // sequence of offsets to test with expected UChar32
1649     // random access
1650     { 127,  0xD802 },
1651     { 16,   0xE010 },
1652     { 95,   0xD801 },
1653     { 31,   0x10001 },
1654     { 112,  0xE070 },
1655     { 0,    0xDC00 },
1656     { 64,   0xDC02 },
1657     { 32,   0x10001 },
1658     // sequential access
1659     { 0,    0xDC00 },
1660     { 16,   0xE010 },
1661     { 31,   0x10001 },
1662     { 32,   0x10001 },
1663     { 64,   0xDC02 },
1664     { 95,   0xD801 },
1665     { 112,  0xE070 },
1666     { 127,  0xD802 },
1667 };
1668 
1669 static const OffsetAndChar testAccess32Entries[] = { // sequence of offsets to test with expected UChar32
1670     // random access
1671     { 110,  0xD802 },   // 0 *
1672     { 14,   0xE010 },   // 1
1673     { 82,   0xD801 },   // 2 *
1674     { 27,   0x10001 },  // 3 *
1675     { 97,   0xE070 },   // 4
1676     { 0,    0xDC00 },   // 5
1677     { 55,   0xDC02 },   // 6
1678     // sequential access
1679     { 0,    0xDC00 },   // 7
1680     { 14,   0xE010 },   // 8
1681     { 27,   0x10001 },  // 9 *
1682     { 55,   0xDC02 },   // 10
1683     { 97,   0xE070 },   // 11
1684     { 82,   0xD801 },   // 12 *
1685     { 110,  0xD802 },   // 13 *
1686 };
1687 // modified UTextAccess function for char16_t string; a cross between
1688 // UText ucstrTextAccess and a function that modifies chunk size
1689 // 1. assumes native length is known and in ut->a
1690 // 2. assumes that most fields may be 0 or nullptr, will fill out if index not in range
1691 // 3. Will designate buffer of size kTestAccessSmallChunkSize or kTestAccessLargeChunkSize
1692 //    depending on kTextAccessGapSize
1693 static UBool
ustrTextAccessModChunks(UText * ut,int64_t index,UBool forward)1694 ustrTextAccessModChunks(UText *ut, int64_t index, UBool forward) {
1695     const char16_t *str = (const char16_t *)ut->context;
1696     int64_t length = ut->a;
1697 
1698     // pin the requested index to the bounds of the string
1699     if (index < 0) {
1700         index = 0;
1701     } else if (index > length) {
1702         index = length;
1703     }
1704     if (forward) {
1705         if (index < ut->chunkNativeLimit && index >= ut->chunkNativeStart) {
1706             /* Already inside the buffer. Set the new offset. */
1707             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1708             return true;
1709         }
1710         if (index >= length && ut->chunkNativeLimit == length) {
1711             /* Off the end of the buffer, but we can't get it. */
1712             ut->chunkOffset = ut->chunkLength;
1713             return false;
1714         }
1715     }
1716     else {
1717         if (index <= ut->chunkNativeLimit && index > ut->chunkNativeStart) {
1718             /* Already inside the buffer. Set the new offset. */
1719             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1720             return true;
1721         }
1722         if (index == 0 && ut->chunkNativeStart == 0) {
1723             /* Already at the beginning; can't go any farther */
1724             ut->chunkOffset = 0;
1725             return false;
1726         }
1727     }
1728     /* It's not inside the buffer. Start over from scratch. */
1729     // Assume large chunk size for first access
1730     int32_t chunkSize = kTestAccessLargeChunkSize;
1731     if (ut->chunkContents != nullptr && ut->chunkLength != 0) {
1732         // Subsequent access, set chunk size depending on gap (smaller chunk for large gap => random access)
1733         int64_t gap = forward ? (index-ut->chunkNativeLimit) : (ut->chunkNativeStart-index);
1734         if (gap < 0) {
1735             gap = -gap;
1736         }
1737         chunkSize = (gap > kTextAccessGapSize)? kTestAccessSmallChunkSize: kTestAccessLargeChunkSize;
1738     }
1739     ut->chunkLength = chunkSize;
1740     ut->chunkOffset = index % chunkSize;
1741     if (!forward && ut->chunkOffset == 0 && index >= chunkSize) {
1742         ut->chunkOffset = chunkSize;
1743     }
1744     ut->chunkNativeStart = index - ut->chunkOffset;
1745     ut->chunkNativeLimit = ut->chunkNativeStart + ut->chunkLength;
1746     ut->chunkContents = str + ut->chunkNativeStart;
1747     ut->nativeIndexingLimit = ut->chunkLength;
1748     return true;
1749 }
1750 
// For testing UTF-32 access (native index does not match chunk offset/index)
1752 
1753 /**
1754  * @return the length, in the native units of the original text string.
1755  */
1756 // 1. assumes native length is known and in ut->a
1757 static int64_t
u32NativeLength(UText * ut)1758 u32NativeLength(UText *ut) {
1759     return ut->a;
1760 }
1761 
1762 /**
1763  * Map from the current char16_t offset within the current text chunk to
1764  *  the corresponding native index in the original source text.
1765  * @return Absolute (native) index corresponding to chunkOffset in the current chunk.
1766  *         The returned native index should always be to a code point boundary.
1767  */
1768 // 1. assumes native length is known and in ut->a
1769 // 2. assumes that pointer to offset map is in
1770 static int64_t
u32MapOffsetToNative(const UText * ut)1771 u32MapOffsetToNative(const UText *ut) {
1772     const int64_t* offsetMap = (const int64_t*)ut->p;
1773     int64_t u16Offset = offsetMap[ut->chunkNativeStart] + ut->chunkOffset;
1774     int64_t index = ut->a;
1775     while (u16Offset < offsetMap[index]) {
1776         index--;
1777     }
1778     return index;
1779 }
1780 
1781 /**
1782  * Map from a native index to a char16_t offset within a text chunk.
1783  * Behavior is undefined if the native index does not fall within the
1784  *   current chunk.
1785  * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit.
1786  * @return            Chunk-relative UTF-16 offset corresponding to the specified native
1787  *                    index.
1788  */
1789 static int32_t
u32MapNativeIndexToUTF16(const UText * ut,int64_t index)1790 u32MapNativeIndexToUTF16(const UText *ut, int64_t index) {
1791     const int64_t* offsetMap = (const int64_t*)ut->p;
1792     if (index <= ut->chunkNativeStart) {
1793         return 0;
1794     } else if (index >= ut->chunkNativeLimit) {
1795         return ut->chunkLength;
1796     }
1797     return (offsetMap[index] - offsetMap[ut->chunkNativeStart]);
1798 }
1799 
1800 static void
u32Close(UText * ut)1801 u32Close(UText *ut) {
1802     uprv_free((void*)ut->p);
1803 }
1804 
1805 static UBool
u32Access(UText * ut,int64_t index,UBool forward)1806 u32Access(UText *ut, int64_t index, UBool forward) {
1807     int64_t length = ut->a;
1808     const int64_t* offsetMap = (const int64_t*)ut->p;
1809     const char16_t *u16 = (const char16_t *)ut->q;
1810 
1811     // pin the requested index to the bounds of the string
1812     if (index < 0) {
1813         index = 0;
1814     } else if (index > length) {
1815         index = length;
1816     }
1817     if (forward) {
1818         if (index < ut->chunkNativeLimit && index >= ut->chunkNativeStart) {
1819             /* Already inside the buffer. Set the new offset. */
1820             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1821             return true;
1822         }
1823         if (index >= length && ut->chunkNativeLimit == length) {
1824             /* Off the end of the buffer, but we can't get it. */
1825             ut->chunkOffset = ut->chunkLength;
1826             return false;
1827         }
1828     }
1829     else {
1830         if (index <= ut->chunkNativeLimit && index > ut->chunkNativeStart) {
1831             /* Already inside the buffer. Set the new offset. */
1832             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1833             return true;
1834         }
1835         if (index == 0 && ut->chunkNativeStart == 0) {
1836             /* Already at the beginning; can't go any farther */
1837             ut->chunkOffset = 0;
1838             return false;
1839         }
1840     }
1841     /* It's not inside the buffer. Start over from scratch. */
1842     // Assume large chunk size for first access
1843     int32_t chunkSize = kTestAccessLargeChunkSize;
1844     if (ut->chunkContents != nullptr && ut->chunkLength != 0) {
1845         // Subsequent access, set chunk size depending on gap (smaller chunk for large gap => random access)
1846         int64_t gap = forward ? (index-ut->chunkNativeLimit) : (ut->chunkNativeStart-index);
1847         if (gap < 0) {
1848             gap = -gap;
1849         }
1850         chunkSize = (gap > kTextAccessGapSize)? kTestAccessSmallChunkSize: kTestAccessLargeChunkSize;
1851     }
1852     int64_t u16Offset = offsetMap[index]; // guaranteed to be on code point boundary
1853     int64_t u16ChunkTryStart = (u16Offset/chunkSize) * chunkSize;
1854     int64_t u16ChunkTryEnd = u16ChunkTryStart + chunkSize;
1855     if (!forward && u16ChunkTryStart==u16Offset && u16ChunkTryStart>0) {
1856         u16ChunkTryEnd = u16ChunkTryStart;
1857         u16ChunkTryStart -= chunkSize;
1858     }
1859     int64_t nativeIndexEnd = length;
1860     while (u16ChunkTryEnd < offsetMap[nativeIndexEnd]) {
1861         nativeIndexEnd--;
1862     }
1863     int64_t nativeIndexStart = nativeIndexEnd;
1864     while (u16ChunkTryStart < offsetMap[nativeIndexStart]) {
1865         nativeIndexStart--;
1866     }
1867     if (forward && nativeIndexEnd < length && u16Offset >= offsetMap[nativeIndexEnd]) {
1868         // oops we need to be in the following chunk
1869         nativeIndexStart = nativeIndexEnd;
1870         u16ChunkTryEnd = ((offsetMap[nativeIndexStart + 1] + chunkSize)/chunkSize) * chunkSize;
1871         nativeIndexEnd = length;
1872         while (u16ChunkTryEnd < offsetMap[nativeIndexEnd]) {
1873             nativeIndexEnd--;
1874         }
1875     }
1876     ut->chunkNativeStart = nativeIndexStart;
1877     ut->chunkNativeLimit = nativeIndexEnd;
1878     ut->chunkLength = offsetMap[nativeIndexEnd] - offsetMap[nativeIndexStart];
1879     ut->chunkOffset = u16Offset - offsetMap[nativeIndexStart];
1880     ut->chunkContents = u16 + offsetMap[nativeIndexStart];
1881     ut->nativeIndexingLimit = 0 ;
1882     return true;
1883 }
1884 
1885 static const struct UTextFuncs u32Funcs =
1886 {
1887     sizeof(UTextFuncs),
1888     0, 0, 0,              // Reserved alignment padding
1889     nullptr,              // Clone
1890     u32NativeLength,
1891     u32Access,
1892     nullptr,              // Extract
1893     nullptr,              // Replace
1894     nullptr,              // Copy
1895     u32MapOffsetToNative,
1896     u32MapNativeIndexToUTF16,
1897     u32Close,
1898     nullptr,              // spare 1
1899     nullptr,              // spare 2
1900     nullptr,              // spare 3
1901 };
1902 
1903 // A hack, this takes a pointer to both the UTF32 and UTF16 versions of the text
1904 static UText *
utext_openUChar32s(UText * ut,const UChar32 * s,int64_t length,const char16_t * q,UErrorCode * status)1905 utext_openUChar32s(UText *ut, const UChar32 *s, int64_t length, const char16_t *q, UErrorCode *status) {
1906     if (U_FAILURE(*status)) {
1907         return nullptr;
1908     }
1909     if (s==nullptr || length < 0) {
1910         *status = U_ILLEGAL_ARGUMENT_ERROR;
1911         return nullptr;
1912     }
1913     ut = utext_setup(ut, 0, status);
1914     if (U_SUCCESS(*status)) {
1915         int64_t* offsetMap = (int64_t*)uprv_malloc((length+1)*sizeof(int64_t));
1916         if (offsetMap == nullptr) {
1917             *status = U_MEMORY_ALLOCATION_ERROR;
1918             return nullptr;
1919         }
1920         ut->pFuncs               = &u32Funcs;
1921         ut->context              = s;
1922         ut->providerProperties   = 0;
1923         ut->a                    = length;
1924         ut->chunkContents        = nullptr;
1925         ut->chunkNativeStart     = 0;
1926         ut->chunkNativeLimit     = 0;
1927         ut->chunkLength          = 0;
1928         ut->chunkOffset          = 0;
1929         ut->nativeIndexingLimit  = 0;
1930         ut->p                    = offsetMap;
1931         ut->q                    = q;
1932         int64_t u16Offset = 0;
1933         *offsetMap++ = 0;
1934         while (length-- > 0) {
1935             u16Offset += (*s++ < 0x10000)? 1: 2;
1936             *offsetMap++ = u16Offset;
1937         }
1938     }
1939     return ut;
1940 }
1941 
1942 
1943 
AccessChangesChunkSize()1944 void UTextTest::AccessChangesChunkSize() {
1945     UErrorCode status = U_ZERO_ERROR;
1946     UText ut = UTEXT_INITIALIZER;
1947     utext_openUChars(&ut, testAccessText, UPRV_LENGTHOF(testAccessText), &status);
1948     if (U_FAILURE(status)) {
1949         errln("utext_openUChars failed: %s", u_errorName(status));
1950         return;
1951     }
1952     // now reset many ut fields for this test
1953     ut.providerProperties = 0; // especially need to clear UTEXT_PROVIDER_STABLE_CHUNKS
1954     ut.chunkNativeLimit = 0;
1955     ut.nativeIndexingLimit = 0;
1956     ut.chunkNativeStart = 0;
1957     ut.chunkOffset = 0;
1958     ut.chunkLength = 0;
1959     ut.chunkContents = nullptr;
1960     UTextFuncs textFuncs = *ut.pFuncs;
1961     textFuncs.access = ustrTextAccessModChunks; // custom access that changes chunk size
1962     ut.pFuncs = &textFuncs;
1963 
1964     // do test
1965 	const OffsetAndChar *testEntryPtr = testAccessEntries;
1966 	int32_t testCount = UPRV_LENGTHOF(testAccessEntries);
1967 	for (; testCount-- > 0; testEntryPtr++) {
1968 	    utext_setNativeIndex(&ut, testEntryPtr->nativeOffset);
1969 	    int64_t beforeOffset = utext_getNativeIndex(&ut);
1970 	    UChar32 uchar = utext_current32(&ut);
1971 	    int64_t afterOffset = utext_getNativeIndex(&ut);
1972 	    if (uchar != testEntryPtr->expectChar || afterOffset != beforeOffset) {
1973 	        errln("utext_current32 unexpected behavior for u16, test case %lld: expected char %04X at offset %lld, got %04X at %lld;\n"
1974 	            "chunkNativeStart %lld chunkNativeLimit %lld nativeIndexingLimit %d chunkLength %d chunkOffset %d",
1975 	            (int64_t)(testEntryPtr-testAccessEntries), testEntryPtr->expectChar, beforeOffset, uchar, afterOffset,
1976 	            ut.chunkNativeStart, ut.chunkNativeLimit, ut.nativeIndexingLimit, ut.chunkLength, ut.chunkOffset);
1977 	    }
1978 	}
1979 	utext_close(&ut);
1980 
1981 	ut = UTEXT_INITIALIZER;
1982 	utext_openUChar32s(&ut, testAccess32Text, UPRV_LENGTHOF(testAccess32Text), testAccessText, &status);
1983     if (U_FAILURE(status)) {
1984         errln("utext_openUChar32s failed: %s", u_errorName(status));
1985         return;
1986     }
1987     // do test
1988 	testEntryPtr = testAccess32Entries;
1989 	testCount = UPRV_LENGTHOF(testAccess32Entries);
1990 	for (; testCount-- > 0; testEntryPtr++) {
1991 	    utext_setNativeIndex(&ut, testEntryPtr->nativeOffset);
1992 	    int64_t beforeOffset = utext_getNativeIndex(&ut);
1993 	    UChar32 uchar = utext_current32(&ut);
1994 	    int64_t afterOffset = utext_getNativeIndex(&ut);
1995 	    if (uchar != testEntryPtr->expectChar || afterOffset != beforeOffset) {
1996 	        errln("utext_current32 unexpected behavior for u32, test case %lld: expected char %04X at offset %lld, got %04X at %lld;\n"
1997 	            "chunkNativeStart %lld chunkNativeLimit %lld nativeIndexingLimit %d chunkLength %d chunkOffset %d",
1998 	            (int64_t)(testEntryPtr-testAccess32Entries), testEntryPtr->expectChar, beforeOffset, uchar, afterOffset,
1999 	            ut.chunkNativeStart, ut.chunkNativeLimit, ut.nativeIndexingLimit, ut.chunkLength, ut.chunkOffset);
2000 	    }
2001 	}
2002 	utext_close(&ut);
2003 }
2004 
2005