1 /*
2 *******************************************************************************
3 *
4 * © 2016 and later: Unicode, Inc. and others.
5 * License & terms of use: http://www.unicode.org/copyright.html
6 *
7 *******************************************************************************
8 *******************************************************************************
9 *
10 * Copyright (C) 2003-2014, International Business Machines
11 * Corporation and others. All Rights Reserved.
12 *
13 *******************************************************************************
14 * file name: uciter8.c
15 * encoding: UTF-8
16 * tab size: 8 (not used)
17 * indentation:4
18 *
19 * created on: 2003jan10
20 * created by: Markus W. Scherer
21 *
22 * This file contains sample code that illustrates reading
23 * 8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
24 * and also accepting single surrogates.
25 */
26
27 #include <stdio.h>
28 #include <string.h>
29 #include "unicode/utypes.h"
30 #include "unicode/uiter.h"
31 #include "uit_len8.h"
32
/*
 * Number of elements in a real array (not a pointer), as an int32_t.
 * Only defined here when no earlier header has provided it.
 * The whole expansion is parenthesized so that the macro behaves as a
 * single primary expression wherever it is used.
 */
#ifndef UPRV_LENGTHOF
#define UPRV_LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
#endif

/*
 * The test functions in this file report failures through log_err();
 * in this standalone sample it is simply an alias for printf.
 */
#define log_err printf
38
39 /* UCharIterator test ------------------------------------------------------- */
40
41 /*
42 * The following code is a copy of the UCharIterator test code in
43 * source/test/cintltst/custrtst.c,
44 * testing the lenient-8 iterator instead of the UTF-8 one.
45 */
46
47 /*
48 * Compare results from two iterators, should be same.
49 * Assume that the text is not empty and that
50 * iteration start==0 and iteration limit==length.
51 */
52 static void
compareIterators(UCharIterator * iter1,const char * n1,UCharIterator * iter2,const char * n2)53 compareIterators(UCharIterator *iter1, const char *n1,
54 UCharIterator *iter2, const char *n2) {
55 int32_t i, pos1, pos2, middle, length;
56 UChar32 c1, c2;
57
58 /* compare lengths */
59 length=iter1->getIndex(iter1, UITER_LENGTH);
60 pos2=iter2->getIndex(iter2, UITER_LENGTH);
61 if(length!=pos2) {
62 log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
63 return;
64 }
65
66 /* set into the middle */
67 middle=length/2;
68
69 pos1=iter1->move(iter1, middle, UITER_ZERO);
70 if(pos1!=middle) {
71 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
72 return;
73 }
74
75 pos2=iter2->move(iter2, middle, UITER_ZERO);
76 if(pos2!=middle) {
77 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
78 return;
79 }
80
81 /* test current() */
82 c1=iter1->current(iter1);
83 c2=iter2->current(iter2);
84 if(c1!=c2) {
85 log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
86 return;
87 }
88
89 /* move forward 3 UChars */
90 for(i=0; i<3; ++i) {
91 c1=iter1->next(iter1);
92 c2=iter2->next(iter2);
93 if(c1!=c2) {
94 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
95 return;
96 }
97 }
98
99 /* move backward 5 UChars */
100 for(i=0; i<5; ++i) {
101 c1=iter1->previous(iter1);
102 c2=iter2->previous(iter2);
103 if(c1!=c2) {
104 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
105 return;
106 }
107 }
108
109 /* iterate forward from the beginning */
110 pos1=iter1->move(iter1, 0, UITER_START);
111 if(pos1<0) {
112 log_err("%s->move(start) failed\n", n1);
113 return;
114 }
115 if(!iter1->hasNext(iter1)) {
116 log_err("%s->hasNext() at the start returns false\n", n1);
117 return;
118 }
119
120 pos2=iter2->move(iter2, 0, UITER_START);
121 if(pos2<0) {
122 log_err("%s->move(start) failed\n", n2);
123 return;
124 }
125 if(!iter2->hasNext(iter2)) {
126 log_err("%s->hasNext() at the start returns false\n", n2);
127 return;
128 }
129
130 do {
131 c1=iter1->next(iter1);
132 c2=iter2->next(iter2);
133 if(c1!=c2) {
134 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
135 return;
136 }
137 } while(c1>=0);
138
139 if(iter1->hasNext(iter1)) {
140 log_err("%s->hasNext() at the end returns true\n", n1);
141 return;
142 }
143 if(iter2->hasNext(iter2)) {
144 log_err("%s->hasNext() at the end returns true\n", n2);
145 return;
146 }
147
148 /* back to the middle */
149 pos1=iter1->move(iter1, middle, UITER_ZERO);
150 if(pos1!=middle) {
151 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
152 return;
153 }
154
155 pos2=iter2->move(iter2, middle, UITER_ZERO);
156 if(pos2!=middle) {
157 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
158 return;
159 }
160
161 /* move to index 1 */
162 pos1=iter1->move(iter1, 1, UITER_ZERO);
163 if(pos1!=1) {
164 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
165 return;
166 }
167
168 pos2=iter2->move(iter2, 1, UITER_ZERO);
169 if(pos2!=1) {
170 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
171 return;
172 }
173
174 /* iterate backward from the end */
175 pos1=iter1->move(iter1, 0, UITER_LIMIT);
176 if(pos1<0) {
177 log_err("%s->move(limit) failed\n", n1);
178 return;
179 }
180 if(!iter1->hasPrevious(iter1)) {
181 log_err("%s->hasPrevious() at the end returns false\n", n1);
182 return;
183 }
184
185 pos2=iter2->move(iter2, 0, UITER_LIMIT);
186 if(pos2<0) {
187 log_err("%s->move(limit) failed\n", n2);
188 return;
189 }
190 if(!iter2->hasPrevious(iter2)) {
191 log_err("%s->hasPrevious() at the end returns false\n", n2);
192 return;
193 }
194
195 do {
196 c1=iter1->previous(iter1);
197 c2=iter2->previous(iter2);
198 if(c1!=c2) {
199 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
200 return;
201 }
202 } while(c1>=0);
203
204 if(iter1->hasPrevious(iter1)) {
205 log_err("%s->hasPrevious() at the start returns true\n", n1);
206 return;
207 }
208 if(iter2->hasPrevious(iter2)) {
209 log_err("%s->hasPrevious() at the start returns true\n", n2);
210 return;
211 }
212 }
213
214 /*
215 * Test the iterator's getState() and setState() functions.
216 * iter1 and iter2 must be set up for the same iterator type and the same string
217 * but may be physically different structs (different addresses).
218 *
219 * Assume that the text is not empty and that
220 * iteration start==0 and iteration limit==length.
221 * It must be 2<=middle<=length-2.
222 */
223 static void
testIteratorState(UCharIterator * iter1,UCharIterator * iter2,const char * n,int32_t middle)224 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
225 UChar32 u[4];
226
227 UErrorCode errorCode;
228 UChar32 c;
229 uint32_t state;
230 int32_t i, j;
231
232 /* get four UChars from the middle of the string */
233 iter1->move(iter1, middle-2, UITER_ZERO);
234 for(i=0; i<4; ++i) {
235 c=iter1->next(iter1);
236 if(c<0) {
237 /* the test violates the assumptions, see comment above */
238 log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
239 return;
240 }
241 u[i]=c;
242 }
243
244 /* move to the middle and get the state */
245 iter1->move(iter1, -2, UITER_CURRENT);
246 state=uiter_getState(iter1);
247
248 /* set the state into the second iterator and compare the results */
249 errorCode=U_ZERO_ERROR;
250 uiter_setState(iter2, state, &errorCode);
251 if(U_FAILURE(errorCode)) {
252 log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
253 return;
254 }
255
256 c=iter2->current(iter2);
257 if(c!=u[2]) {
258 log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
259 }
260
261 c=iter2->previous(iter2);
262 if(c!=u[1]) {
263 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
264 }
265
266 iter2->move(iter2, 2, UITER_CURRENT);
267 c=iter2->next(iter2);
268 if(c!=u[3]) {
269 log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
270 }
271
272 iter2->move(iter2, -3, UITER_CURRENT);
273 c=iter2->previous(iter2);
274 if(c!=u[0]) {
275 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
276 }
277
278 /* move the second iterator back to the middle */
279 iter2->move(iter2, 1, UITER_CURRENT);
280 iter2->next(iter2);
281
282 /* check that both are in the middle */
283 i=iter1->getIndex(iter1, UITER_CURRENT);
284 j=iter2->getIndex(iter2, UITER_CURRENT);
285 if(i!=middle) {
286 log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
287 }
288 if(i!=j) {
289 log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
290 }
291
292 /* compare lengths */
293 i=iter1->getIndex(iter1, UITER_LENGTH);
294 j=iter2->getIndex(iter2, UITER_LENGTH);
295 if(i!=j) {
296 log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
297 }
298 }
299
300 static void
TestLenient8Iterator()301 TestLenient8Iterator() {
302 static const UChar text[]={
303 0x61, 0x62, 0x63,
304 /* dffd 107fd d801 dffd - in UTF-16, U+107fd=<d801 dffd> */
305 0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
306 0x78, 0x79, 0x7a, 0
307 };
308 static const uint8_t bytes[]={
309 0x61, 0x62, 0x63,
310 /* dffd 107fd d801 dffd - mixture */
311 0xed, 0xbf, 0xbd, 0xf0, 0x90, 0x9f, 0xbd, 0xed, 0xa0, 0x81, 0xed, 0xbf, 0xbd,
312 0x78, 0x79, 0x7a, 0
313 };
314
315 UCharIterator iter1, iter2;
316 UChar32 c1, c2;
317 int32_t length;
318
319 puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
320
321 /* compare the same string between UTF-16 and lenient-8 UCharIterators */
322 uiter_setString(&iter1, text, -1);
323 uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
324 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
325
326 /* try again with length=-1 */
327 uiter_setLenient8(&iter2, (const char *)bytes, -1);
328 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
329
330 /* test get/set state */
331 length=UPRV_LENGTHOF(text)-1;
332 uiter_setLenient8(&iter1, (const char*)bytes, -1);
333 testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
334 testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
335
336 /* ---------------------------------------------------------------------- */
337
338 puts("no output so far means that the lenient-8 iterator works fine");
339
340 puts("iterate forward:\nUTF-16\tlenient-8");
341 uiter_setString(&iter1, text, -1);
342 iter1.move(&iter1, 0, UITER_START);
343 iter2.move(&iter2, 0, UITER_START);
344 for(;;) {
345 c1=iter1.next(&iter1);
346 c2=iter2.next(&iter2);
347 if(c1<0 && c2<0) {
348 break;
349 }
350 if(c1<0) {
351 printf("\t%04x\n", c2);
352 } else if(c2<0) {
353 printf("%04x\n", c1);
354 } else {
355 printf("%04x\t%04x\n", c1, c2);
356 }
357 }
358 }
359
/*
 * Program entry point: runs the lenient-8 iterator sample once.
 * The command-line arguments are not used; the exit status is always 0.
 */
extern int
main(int argc, const char *argv[]) {
    (void)argc;     /* unused */
    (void)argv;     /* unused */

    TestLenient8Iterator();

    return 0;
}
365