• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
*******************************************************************************
*
*   Copyright (C) 2016 and later: Unicode, Inc. and others.
*   License & terms of use: http://www.unicode.org/copyright.html#License
*
*******************************************************************************
*******************************************************************************
*
*   Copyright (C) 2003-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uciter8.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jan10
*   created by: Markus W. Scherer
*
*   This file contains sample code that illustrates reading
*   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
*   and also accepting single surrogates.
*/
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include "unicode/utypes.h"
30 #include "unicode/uiter.h"
31 #include "uit_len8.h"
32 
33 #define log_err printf
34 
/* UCharIterator test ------------------------------------------------------- */

/*
 * The following code is a copy of the UCharIterator test code in
 * source/test/cintltst/custrtst.c,
 * testing the lenient-8 iterator instead of the UTF-8 one.
 */
42 
43 /*
44  * Compare results from two iterators, should be same.
45  * Assume that the text is not empty and that
46  * iteration start==0 and iteration limit==length.
47  */
48 static void
compareIterators(UCharIterator * iter1,const char * n1,UCharIterator * iter2,const char * n2)49 compareIterators(UCharIterator *iter1, const char *n1,
50                  UCharIterator *iter2, const char *n2) {
51     int32_t i, pos1, pos2, middle, length;
52     UChar32 c1, c2;
53 
54     /* compare lengths */
55     length=iter1->getIndex(iter1, UITER_LENGTH);
56     pos2=iter2->getIndex(iter2, UITER_LENGTH);
57     if(length!=pos2) {
58         log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
59         return;
60     }
61 
62     /* set into the middle */
63     middle=length/2;
64 
65     pos1=iter1->move(iter1, middle, UITER_ZERO);
66     if(pos1!=middle) {
67         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
68         return;
69     }
70 
71     pos2=iter2->move(iter2, middle, UITER_ZERO);
72     if(pos2!=middle) {
73         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
74         return;
75     }
76 
77     /* test current() */
78     c1=iter1->current(iter1);
79     c2=iter2->current(iter2);
80     if(c1!=c2) {
81         log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
82         return;
83     }
84 
85     /* move forward 3 UChars */
86     for(i=0; i<3; ++i) {
87         c1=iter1->next(iter1);
88         c2=iter2->next(iter2);
89         if(c1!=c2) {
90             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
91             return;
92         }
93     }
94 
95     /* move backward 5 UChars */
96     for(i=0; i<5; ++i) {
97         c1=iter1->previous(iter1);
98         c2=iter2->previous(iter2);
99         if(c1!=c2) {
100             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
101             return;
102         }
103     }
104 
105     /* iterate forward from the beginning */
106     pos1=iter1->move(iter1, 0, UITER_START);
107     if(pos1<0) {
108         log_err("%s->move(start) failed\n", n1);
109         return;
110     }
111     if(!iter1->hasNext(iter1)) {
112         log_err("%s->hasNext() at the start returns FALSE\n", n1);
113         return;
114     }
115 
116     pos2=iter2->move(iter2, 0, UITER_START);
117     if(pos2<0) {
118         log_err("%s->move(start) failed\n", n2);
119         return;
120     }
121     if(!iter2->hasNext(iter2)) {
122         log_err("%s->hasNext() at the start returns FALSE\n", n2);
123         return;
124     }
125 
126     do {
127         c1=iter1->next(iter1);
128         c2=iter2->next(iter2);
129         if(c1!=c2) {
130             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
131             return;
132         }
133     } while(c1>=0);
134 
135     if(iter1->hasNext(iter1)) {
136         log_err("%s->hasNext() at the end returns TRUE\n", n1);
137         return;
138     }
139     if(iter2->hasNext(iter2)) {
140         log_err("%s->hasNext() at the end returns TRUE\n", n2);
141         return;
142     }
143 
144     /* back to the middle */
145     pos1=iter1->move(iter1, middle, UITER_ZERO);
146     if(pos1!=middle) {
147         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
148         return;
149     }
150 
151     pos2=iter2->move(iter2, middle, UITER_ZERO);
152     if(pos2!=middle) {
153         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
154         return;
155     }
156 
157     /* move to index 1 */
158     pos1=iter1->move(iter1, 1, UITER_ZERO);
159     if(pos1!=1) {
160         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
161         return;
162     }
163 
164     pos2=iter2->move(iter2, 1, UITER_ZERO);
165     if(pos2!=1) {
166         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
167         return;
168     }
169 
170     /* iterate backward from the end */
171     pos1=iter1->move(iter1, 0, UITER_LIMIT);
172     if(pos1<0) {
173         log_err("%s->move(limit) failed\n", n1);
174         return;
175     }
176     if(!iter1->hasPrevious(iter1)) {
177         log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
178         return;
179     }
180 
181     pos2=iter2->move(iter2, 0, UITER_LIMIT);
182     if(pos2<0) {
183         log_err("%s->move(limit) failed\n", n2);
184         return;
185     }
186     if(!iter2->hasPrevious(iter2)) {
187         log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
188         return;
189     }
190 
191     do {
192         c1=iter1->previous(iter1);
193         c2=iter2->previous(iter2);
194         if(c1!=c2) {
195             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
196             return;
197         }
198     } while(c1>=0);
199 
200     if(iter1->hasPrevious(iter1)) {
201         log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
202         return;
203     }
204     if(iter2->hasPrevious(iter2)) {
205         log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
206         return;
207     }
208 }
209 
210 /*
211  * Test the iterator's getState() and setState() functions.
212  * iter1 and iter2 must be set up for the same iterator type and the same string
213  * but may be physically different structs (different addresses).
214  *
215  * Assume that the text is not empty and that
216  * iteration start==0 and iteration limit==length.
217  * It must be 2<=middle<=length-2.
218  */
219 static void
testIteratorState(UCharIterator * iter1,UCharIterator * iter2,const char * n,int32_t middle)220 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
221     UChar32 u[4];
222 
223     UErrorCode errorCode;
224     UChar32 c;
225     uint32_t state;
226     int32_t i, j;
227 
228     /* get four UChars from the middle of the string */
229     iter1->move(iter1, middle-2, UITER_ZERO);
230     for(i=0; i<4; ++i) {
231         c=iter1->next(iter1);
232         if(c<0) {
233             /* the test violates the assumptions, see comment above */
234             log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
235             return;
236         }
237         u[i]=c;
238     }
239 
240     /* move to the middle and get the state */
241     iter1->move(iter1, -2, UITER_CURRENT);
242     state=uiter_getState(iter1);
243 
244     /* set the state into the second iterator and compare the results */
245     errorCode=U_ZERO_ERROR;
246     uiter_setState(iter2, state, &errorCode);
247     if(U_FAILURE(errorCode)) {
248         log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
249         return;
250     }
251 
252     c=iter2->current(iter2);
253     if(c!=u[2]) {
254         log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
255     }
256 
257     c=iter2->previous(iter2);
258     if(c!=u[1]) {
259         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
260     }
261 
262     iter2->move(iter2, 2, UITER_CURRENT);
263     c=iter2->next(iter2);
264     if(c!=u[3]) {
265         log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
266     }
267 
268     iter2->move(iter2, -3, UITER_CURRENT);
269     c=iter2->previous(iter2);
270     if(c!=u[0]) {
271         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
272     }
273 
274     /* move the second iterator back to the middle */
275     iter2->move(iter2, 1, UITER_CURRENT);
276     iter2->next(iter2);
277 
278     /* check that both are in the middle */
279     i=iter1->getIndex(iter1, UITER_CURRENT);
280     j=iter2->getIndex(iter2, UITER_CURRENT);
281     if(i!=middle) {
282         log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
283     }
284     if(i!=j) {
285         log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
286     }
287 
288     /* compare lengths */
289     i=iter1->getIndex(iter1, UITER_LENGTH);
290     j=iter2->getIndex(iter2, UITER_LENGTH);
291     if(i!=j) {
292         log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
293     }
294 }
295 
296 static void
TestLenient8Iterator()297 TestLenient8Iterator() {
298     static const UChar text[]={
299         0x61, 0x62, 0x63,
300         /* dffd 107fd             d801    dffd - in UTF-16, U+107fd=<d801 dffd> */
301         0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
302         0x78, 0x79, 0x7a, 0
303     };
304     static const uint8_t bytes[]={
305         0x61, 0x62, 0x63,
306         /* dffd            107fd                    d801               dffd - mixture */
307         0xed, 0xbf, 0xbd,  0xf0, 0x90, 0x9f, 0xbd,  0xed, 0xa0, 0x81,  0xed, 0xbf, 0xbd,
308         0x78, 0x79, 0x7a, 0
309     };
310 
311     UCharIterator iter1, iter2;
312     UChar32 c1, c2;
313     int32_t length;
314 
315     puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
316 
317     /* compare the same string between UTF-16 and lenient-8 UCharIterators */
318     uiter_setString(&iter1, text, -1);
319     uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
320     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
321 
322     /* try again with length=-1 */
323     uiter_setLenient8(&iter2, (const char *)bytes, -1);
324     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
325 
326     /* test get/set state */
327     length=UPRV_LENGTHOF(text)-1;
328     uiter_setLenient8(&iter1, (const char*)bytes, -1);
329     testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
330     testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
331 
332     /* ---------------------------------------------------------------------- */
333 
334     puts("no output so far means that the lenient-8 iterator works fine");
335 
336     puts("iterate forward:\nUTF-16\tlenient-8");
337     uiter_setString(&iter1, text, -1);
338     iter1.move(&iter1, 0, UITER_START);
339     iter2.move(&iter2, 0, UITER_START);
340     for(;;) {
341         c1=iter1.next(&iter1);
342         c2=iter2.next(&iter2);
343         if(c1<0 && c2<0) {
344             break;
345         }
346         if(c1<0) {
347             printf("\t%04x\n", c2);
348         } else if(c2<0) {
349             printf("%04x\n", c1);
350         } else {
351             printf("%04x\t%04x\n", c1, c2);
352         }
353     }
354 }
355 
/*
 * Program entry point: runs the lenient-8 iterator test and returns 0.
 * The command-line arguments are not used.
 */
extern int
main(int argc, const char *argv[]) {
    (void)argc;
    (void)argv;

    TestLenient8Iterator();
    return 0;
}
361