• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2003-2007, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uciter8.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2003jan10
14 *   created by: Markus W. Scherer
15 *
16 *   This file contains sample code that illustrates reading
17 *   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
18 *   and also accepting single surrogates.
19 */
20 
21 #include <stdio.h>
22 #include <string.h>
23 #include "unicode/utypes.h"
24 #include "unicode/uiter.h"
25 #include "uit_len8.h"
26 
/*
 * Number of elements in a true array (not a pointer): total size divided by
 * the size of the first element.
 */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* This sample reports test failures with plain printf(). */
#define log_err printf
30 
31 /* UCharIterator test ------------------------------------------------------- */
32 
33 /*
34  * The following code is a copy of the UCharIterator test code in
35  * source/test/cintltst/custrtst.c,
36  * testing the lenient-8 iterator instead of the UTF-8 one.
37  */
38 
39 /*
40  * Compare results from two iterators, should be same.
41  * Assume that the text is not empty and that
42  * iteration start==0 and iteration limit==length.
43  */
44 static void
compareIterators(UCharIterator * iter1,const char * n1,UCharIterator * iter2,const char * n2)45 compareIterators(UCharIterator *iter1, const char *n1,
46                  UCharIterator *iter2, const char *n2) {
47     int32_t i, pos1, pos2, middle, length;
48     UChar32 c1, c2;
49 
50     /* compare lengths */
51     length=iter1->getIndex(iter1, UITER_LENGTH);
52     pos2=iter2->getIndex(iter2, UITER_LENGTH);
53     if(length!=pos2) {
54         log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
55         return;
56     }
57 
58     /* set into the middle */
59     middle=length/2;
60 
61     pos1=iter1->move(iter1, middle, UITER_ZERO);
62     if(pos1!=middle) {
63         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
64         return;
65     }
66 
67     pos2=iter2->move(iter2, middle, UITER_ZERO);
68     if(pos2!=middle) {
69         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
70         return;
71     }
72 
73     /* test current() */
74     c1=iter1->current(iter1);
75     c2=iter2->current(iter2);
76     if(c1!=c2) {
77         log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
78         return;
79     }
80 
81     /* move forward 3 UChars */
82     for(i=0; i<3; ++i) {
83         c1=iter1->next(iter1);
84         c2=iter2->next(iter2);
85         if(c1!=c2) {
86             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
87             return;
88         }
89     }
90 
91     /* move backward 5 UChars */
92     for(i=0; i<5; ++i) {
93         c1=iter1->previous(iter1);
94         c2=iter2->previous(iter2);
95         if(c1!=c2) {
96             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
97             return;
98         }
99     }
100 
101     /* iterate forward from the beginning */
102     pos1=iter1->move(iter1, 0, UITER_START);
103     if(pos1<0) {
104         log_err("%s->move(start) failed\n", n1);
105         return;
106     }
107     if(!iter1->hasNext(iter1)) {
108         log_err("%s->hasNext() at the start returns FALSE\n", n1);
109         return;
110     }
111 
112     pos2=iter2->move(iter2, 0, UITER_START);
113     if(pos2<0) {
114         log_err("%s->move(start) failed\n", n2);
115         return;
116     }
117     if(!iter2->hasNext(iter2)) {
118         log_err("%s->hasNext() at the start returns FALSE\n", n2);
119         return;
120     }
121 
122     do {
123         c1=iter1->next(iter1);
124         c2=iter2->next(iter2);
125         if(c1!=c2) {
126             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
127             return;
128         }
129     } while(c1>=0);
130 
131     if(iter1->hasNext(iter1)) {
132         log_err("%s->hasNext() at the end returns TRUE\n", n1);
133         return;
134     }
135     if(iter2->hasNext(iter2)) {
136         log_err("%s->hasNext() at the end returns TRUE\n", n2);
137         return;
138     }
139 
140     /* back to the middle */
141     pos1=iter1->move(iter1, middle, UITER_ZERO);
142     if(pos1!=middle) {
143         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
144         return;
145     }
146 
147     pos2=iter2->move(iter2, middle, UITER_ZERO);
148     if(pos2!=middle) {
149         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
150         return;
151     }
152 
153     /* move to index 1 */
154     pos1=iter1->move(iter1, 1, UITER_ZERO);
155     if(pos1!=1) {
156         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
157         return;
158     }
159 
160     pos2=iter2->move(iter2, 1, UITER_ZERO);
161     if(pos2!=1) {
162         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
163         return;
164     }
165 
166     /* iterate backward from the end */
167     pos1=iter1->move(iter1, 0, UITER_LIMIT);
168     if(pos1<0) {
169         log_err("%s->move(limit) failed\n", n1);
170         return;
171     }
172     if(!iter1->hasPrevious(iter1)) {
173         log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
174         return;
175     }
176 
177     pos2=iter2->move(iter2, 0, UITER_LIMIT);
178     if(pos2<0) {
179         log_err("%s->move(limit) failed\n", n2);
180         return;
181     }
182     if(!iter2->hasPrevious(iter2)) {
183         log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
184         return;
185     }
186 
187     do {
188         c1=iter1->previous(iter1);
189         c2=iter2->previous(iter2);
190         if(c1!=c2) {
191             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
192             return;
193         }
194     } while(c1>=0);
195 
196     if(iter1->hasPrevious(iter1)) {
197         log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
198         return;
199     }
200     if(iter2->hasPrevious(iter2)) {
201         log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
202         return;
203     }
204 }
205 
206 /*
207  * Test the iterator's getState() and setState() functions.
208  * iter1 and iter2 must be set up for the same iterator type and the same string
209  * but may be physically different structs (different addresses).
210  *
211  * Assume that the text is not empty and that
212  * iteration start==0 and iteration limit==length.
213  * It must be 2<=middle<=length-2.
214  */
215 static void
testIteratorState(UCharIterator * iter1,UCharIterator * iter2,const char * n,int32_t middle)216 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
217     UChar32 u[4];
218 
219     UErrorCode errorCode;
220     UChar32 c;
221     uint32_t state;
222     int32_t i, j;
223 
224     /* get four UChars from the middle of the string */
225     iter1->move(iter1, middle-2, UITER_ZERO);
226     for(i=0; i<4; ++i) {
227         c=iter1->next(iter1);
228         if(c<0) {
229             /* the test violates the assumptions, see comment above */
230             log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
231             return;
232         }
233         u[i]=c;
234     }
235 
236     /* move to the middle and get the state */
237     iter1->move(iter1, -2, UITER_CURRENT);
238     state=uiter_getState(iter1);
239 
240     /* set the state into the second iterator and compare the results */
241     errorCode=U_ZERO_ERROR;
242     uiter_setState(iter2, state, &errorCode);
243     if(U_FAILURE(errorCode)) {
244         log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
245         return;
246     }
247 
248     c=iter2->current(iter2);
249     if(c!=u[2]) {
250         log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
251     }
252 
253     c=iter2->previous(iter2);
254     if(c!=u[1]) {
255         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
256     }
257 
258     iter2->move(iter2, 2, UITER_CURRENT);
259     c=iter2->next(iter2);
260     if(c!=u[3]) {
261         log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
262     }
263 
264     iter2->move(iter2, -3, UITER_CURRENT);
265     c=iter2->previous(iter2);
266     if(c!=u[0]) {
267         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
268     }
269 
270     /* move the second iterator back to the middle */
271     iter2->move(iter2, 1, UITER_CURRENT);
272     iter2->next(iter2);
273 
274     /* check that both are in the middle */
275     i=iter1->getIndex(iter1, UITER_CURRENT);
276     j=iter2->getIndex(iter2, UITER_CURRENT);
277     if(i!=middle) {
278         log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
279     }
280     if(i!=j) {
281         log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
282     }
283 
284     /* compare lengths */
285     i=iter1->getIndex(iter1, UITER_LENGTH);
286     j=iter2->getIndex(iter2, UITER_LENGTH);
287     if(i!=j) {
288         log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
289     }
290 }
291 
292 static void
TestLenient8Iterator()293 TestLenient8Iterator() {
294     static const UChar text[]={
295         0x61, 0x62, 0x63,
296         /* dffd 107fd             d801    dffd - in UTF-16, U+107fd=<d801 dffd> */
297         0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
298         0x78, 0x79, 0x7a, 0
299     };
300     static const uint8_t bytes[]={
301         0x61, 0x62, 0x63,
302         /* dffd            107fd                    d801               dffd - mixture */
303         0xed, 0xbf, 0xbd,  0xf0, 0x90, 0x9f, 0xbd,  0xed, 0xa0, 0x81,  0xed, 0xbf, 0xbd,
304         0x78, 0x79, 0x7a, 0
305     };
306 
307     UCharIterator iter1, iter2;
308     UChar32 c1, c2;
309     int32_t length;
310 
311     puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
312 
313     /* compare the same string between UTF-16 and lenient-8 UCharIterators */
314     uiter_setString(&iter1, text, -1);
315     uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
316     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
317 
318     /* try again with length=-1 */
319     uiter_setLenient8(&iter2, (const char *)bytes, -1);
320     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
321 
322     /* test get/set state */
323     length=LENGTHOF(text)-1;
324     uiter_setLenient8(&iter1, (const char*)bytes, -1);
325     testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
326     testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
327 
328     /* ---------------------------------------------------------------------- */
329 
330     puts("no output so far means that the lenient-8 iterator works fine");
331 
332     puts("iterate forward:\nUTF-16\tlenient-8");
333     uiter_setString(&iter1, text, -1);
334     iter1.move(&iter1, 0, UITER_START);
335     iter2.move(&iter2, 0, UITER_START);
336     for(;;) {
337         c1=iter1.next(&iter1);
338         c2=iter2.next(&iter2);
339         if(c1<0 && c2<0) {
340             break;
341         }
342         if(c1<0) {
343             printf("\t%04x\n", c2);
344         } else if(c2<0) {
345             printf("%04x\n", c1);
346         } else {
347             printf("%04x\t%04x\n", c1, c2);
348         }
349     }
350 }
351 
/*
 * Program entry point: runs the lenient-8 iterator test and returns 0.
 * The command-line arguments are not used.
 */
extern int
main(int argc, const char *argv[]) {
    (void)argc;
    (void)argv;

    TestLenient8Iterator();
    return 0;
}
357