1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2016 and later: Unicode, Inc. and others.
5 * License & terms of use: http://www.unicode.org/copyright.html#License
6 *
7 *******************************************************************************
8 *******************************************************************************
9 *
10 * Copyright (C) 2003-2014, International Business Machines
11 * Corporation and others. All Rights Reserved.
12 *
13 *******************************************************************************
14 * file name: uciter8.c
15 * encoding: US-ASCII
16 * tab size: 8 (not used)
17 * indentation:4
18 *
19 * created on: 2003jan10
20 * created by: Markus W. Scherer
21 *
22 * This file contains sample code that illustrates reading
23 * 8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
24 * and also accepting single surrogates.
25 */
26
27 #include <stdio.h>
28 #include <string.h>
29 #include "unicode/utypes.h"
30 #include "unicode/uiter.h"
31 #include "uit_len8.h"
32
33 #define log_err printf
34
35 /* UCharIterator test ------------------------------------------------------- */
36
37 /*
38 * The following code is a copy of the UCharIterator test code in
39 * source/test/cintltst/custrtst.c,
40 * testing the lenient-8 iterator instead of the UTF-8 one.
41 */
42
43 /*
44 * Compare results from two iterators, should be same.
45 * Assume that the text is not empty and that
46 * iteration start==0 and iteration limit==length.
47 */
48 static void
compareIterators(UCharIterator * iter1,const char * n1,UCharIterator * iter2,const char * n2)49 compareIterators(UCharIterator *iter1, const char *n1,
50 UCharIterator *iter2, const char *n2) {
51 int32_t i, pos1, pos2, middle, length;
52 UChar32 c1, c2;
53
54 /* compare lengths */
55 length=iter1->getIndex(iter1, UITER_LENGTH);
56 pos2=iter2->getIndex(iter2, UITER_LENGTH);
57 if(length!=pos2) {
58 log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
59 return;
60 }
61
62 /* set into the middle */
63 middle=length/2;
64
65 pos1=iter1->move(iter1, middle, UITER_ZERO);
66 if(pos1!=middle) {
67 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
68 return;
69 }
70
71 pos2=iter2->move(iter2, middle, UITER_ZERO);
72 if(pos2!=middle) {
73 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
74 return;
75 }
76
77 /* test current() */
78 c1=iter1->current(iter1);
79 c2=iter2->current(iter2);
80 if(c1!=c2) {
81 log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
82 return;
83 }
84
85 /* move forward 3 UChars */
86 for(i=0; i<3; ++i) {
87 c1=iter1->next(iter1);
88 c2=iter2->next(iter2);
89 if(c1!=c2) {
90 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
91 return;
92 }
93 }
94
95 /* move backward 5 UChars */
96 for(i=0; i<5; ++i) {
97 c1=iter1->previous(iter1);
98 c2=iter2->previous(iter2);
99 if(c1!=c2) {
100 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
101 return;
102 }
103 }
104
105 /* iterate forward from the beginning */
106 pos1=iter1->move(iter1, 0, UITER_START);
107 if(pos1<0) {
108 log_err("%s->move(start) failed\n", n1);
109 return;
110 }
111 if(!iter1->hasNext(iter1)) {
112 log_err("%s->hasNext() at the start returns FALSE\n", n1);
113 return;
114 }
115
116 pos2=iter2->move(iter2, 0, UITER_START);
117 if(pos2<0) {
118 log_err("%s->move(start) failed\n", n2);
119 return;
120 }
121 if(!iter2->hasNext(iter2)) {
122 log_err("%s->hasNext() at the start returns FALSE\n", n2);
123 return;
124 }
125
126 do {
127 c1=iter1->next(iter1);
128 c2=iter2->next(iter2);
129 if(c1!=c2) {
130 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
131 return;
132 }
133 } while(c1>=0);
134
135 if(iter1->hasNext(iter1)) {
136 log_err("%s->hasNext() at the end returns TRUE\n", n1);
137 return;
138 }
139 if(iter2->hasNext(iter2)) {
140 log_err("%s->hasNext() at the end returns TRUE\n", n2);
141 return;
142 }
143
144 /* back to the middle */
145 pos1=iter1->move(iter1, middle, UITER_ZERO);
146 if(pos1!=middle) {
147 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
148 return;
149 }
150
151 pos2=iter2->move(iter2, middle, UITER_ZERO);
152 if(pos2!=middle) {
153 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
154 return;
155 }
156
157 /* move to index 1 */
158 pos1=iter1->move(iter1, 1, UITER_ZERO);
159 if(pos1!=1) {
160 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
161 return;
162 }
163
164 pos2=iter2->move(iter2, 1, UITER_ZERO);
165 if(pos2!=1) {
166 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
167 return;
168 }
169
170 /* iterate backward from the end */
171 pos1=iter1->move(iter1, 0, UITER_LIMIT);
172 if(pos1<0) {
173 log_err("%s->move(limit) failed\n", n1);
174 return;
175 }
176 if(!iter1->hasPrevious(iter1)) {
177 log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
178 return;
179 }
180
181 pos2=iter2->move(iter2, 0, UITER_LIMIT);
182 if(pos2<0) {
183 log_err("%s->move(limit) failed\n", n2);
184 return;
185 }
186 if(!iter2->hasPrevious(iter2)) {
187 log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
188 return;
189 }
190
191 do {
192 c1=iter1->previous(iter1);
193 c2=iter2->previous(iter2);
194 if(c1!=c2) {
195 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
196 return;
197 }
198 } while(c1>=0);
199
200 if(iter1->hasPrevious(iter1)) {
201 log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
202 return;
203 }
204 if(iter2->hasPrevious(iter2)) {
205 log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
206 return;
207 }
208 }
209
210 /*
211 * Test the iterator's getState() and setState() functions.
212 * iter1 and iter2 must be set up for the same iterator type and the same string
213 * but may be physically different structs (different addresses).
214 *
215 * Assume that the text is not empty and that
216 * iteration start==0 and iteration limit==length.
217 * It must be 2<=middle<=length-2.
218 */
219 static void
testIteratorState(UCharIterator * iter1,UCharIterator * iter2,const char * n,int32_t middle)220 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
221 UChar32 u[4];
222
223 UErrorCode errorCode;
224 UChar32 c;
225 uint32_t state;
226 int32_t i, j;
227
228 /* get four UChars from the middle of the string */
229 iter1->move(iter1, middle-2, UITER_ZERO);
230 for(i=0; i<4; ++i) {
231 c=iter1->next(iter1);
232 if(c<0) {
233 /* the test violates the assumptions, see comment above */
234 log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
235 return;
236 }
237 u[i]=c;
238 }
239
240 /* move to the middle and get the state */
241 iter1->move(iter1, -2, UITER_CURRENT);
242 state=uiter_getState(iter1);
243
244 /* set the state into the second iterator and compare the results */
245 errorCode=U_ZERO_ERROR;
246 uiter_setState(iter2, state, &errorCode);
247 if(U_FAILURE(errorCode)) {
248 log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
249 return;
250 }
251
252 c=iter2->current(iter2);
253 if(c!=u[2]) {
254 log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
255 }
256
257 c=iter2->previous(iter2);
258 if(c!=u[1]) {
259 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
260 }
261
262 iter2->move(iter2, 2, UITER_CURRENT);
263 c=iter2->next(iter2);
264 if(c!=u[3]) {
265 log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
266 }
267
268 iter2->move(iter2, -3, UITER_CURRENT);
269 c=iter2->previous(iter2);
270 if(c!=u[0]) {
271 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
272 }
273
274 /* move the second iterator back to the middle */
275 iter2->move(iter2, 1, UITER_CURRENT);
276 iter2->next(iter2);
277
278 /* check that both are in the middle */
279 i=iter1->getIndex(iter1, UITER_CURRENT);
280 j=iter2->getIndex(iter2, UITER_CURRENT);
281 if(i!=middle) {
282 log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
283 }
284 if(i!=j) {
285 log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
286 }
287
288 /* compare lengths */
289 i=iter1->getIndex(iter1, UITER_LENGTH);
290 j=iter2->getIndex(iter2, UITER_LENGTH);
291 if(i!=j) {
292 log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
293 }
294 }
295
296 static void
TestLenient8Iterator()297 TestLenient8Iterator() {
298 static const UChar text[]={
299 0x61, 0x62, 0x63,
300 /* dffd 107fd d801 dffd - in UTF-16, U+107fd=<d801 dffd> */
301 0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
302 0x78, 0x79, 0x7a, 0
303 };
304 static const uint8_t bytes[]={
305 0x61, 0x62, 0x63,
306 /* dffd 107fd d801 dffd - mixture */
307 0xed, 0xbf, 0xbd, 0xf0, 0x90, 0x9f, 0xbd, 0xed, 0xa0, 0x81, 0xed, 0xbf, 0xbd,
308 0x78, 0x79, 0x7a, 0
309 };
310
311 UCharIterator iter1, iter2;
312 UChar32 c1, c2;
313 int32_t length;
314
315 puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
316
317 /* compare the same string between UTF-16 and lenient-8 UCharIterators */
318 uiter_setString(&iter1, text, -1);
319 uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
320 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
321
322 /* try again with length=-1 */
323 uiter_setLenient8(&iter2, (const char *)bytes, -1);
324 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
325
326 /* test get/set state */
327 length=UPRV_LENGTHOF(text)-1;
328 uiter_setLenient8(&iter1, (const char*)bytes, -1);
329 testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
330 testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
331
332 /* ---------------------------------------------------------------------- */
333
334 puts("no output so far means that the lenient-8 iterator works fine");
335
336 puts("iterate forward:\nUTF-16\tlenient-8");
337 uiter_setString(&iter1, text, -1);
338 iter1.move(&iter1, 0, UITER_START);
339 iter2.move(&iter2, 0, UITER_START);
340 for(;;) {
341 c1=iter1.next(&iter1);
342 c2=iter2.next(&iter2);
343 if(c1<0 && c2<0) {
344 break;
345 }
346 if(c1<0) {
347 printf("\t%04x\n", c2);
348 } else if(c2<0) {
349 printf("%04x\n", c1);
350 } else {
351 printf("%04x\t%04x\n", c1, c2);
352 }
353 }
354 }
355
/*
 * Program entry point: runs the lenient-8 iterator test.
 * The command-line arguments are not used; the exit status is always 0.
 */
extern int
main(int argc, const char *argv[]) {
    /* the arguments are intentionally ignored */
    (void)argc;
    (void)argv;

    TestLenient8Iterator();

    return 0;
}
361