• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2006, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uinvchar.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:2
12 *
13 *   created on: 2004sep14
14 *   created by: Markus W. Scherer
15 *
16 *   Functions for handling invariant characters, moved here from putil.c
17 *   for better modularization.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
22 #include "udataswp.h"
23 #include "cstring.h"
24 #include "cmemory.h"
25 #include "uassert.h"
26 #include "uinvchar.h"
27 
28 /* invariant-character handling --------------------------------------------- */
29 
30 /*
31  * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
32  * appropriately for most EBCDIC codepages.
33  *
34  * They currently also map most other ASCII graphic characters,
35  * appropriately for codepages 37 and 1047.
36  * Exceptions: The characters for []^ have different codes in 37 & 1047.
37  * Both versions are mapped to ASCII.
38  *
39  *    ASCII 37 1047
40  * [     5B BA   AD
41  * ]     5D BB   BD
42  * ^     5E B0   5F
43  *
44  * There are no mappings for variant characters from Unicode to EBCDIC.
45  *
46  * Currently, C0 control codes are also included in these maps.
47  * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
48  * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
49  * but there is no mapping for ASCII LF back to EBCDIC.
50  *
51  *    ASCII EBCDIC S/390-OE
52  * LF    0A     25       15
53  * NEL   85     15       25
54  *
55  * The maps below explicitly exclude the variant
56  * control and graphical characters that are in ASCII-based
57  * codepages at 0x80 and above.
58  * "No mapping" is expressed by mapping to a 00 byte.
59  *
60  * These tables do not establish a converter or a codepage.
61  */
62 
/*
 * EBCDIC byte -> ASCII code, indexed by the EBCDIC byte value.
 * An entry of 0x00 (other than at index 0) means "no mapping".
 * Each row holds 16 entries; the trailing comment gives the index range.
 */
static const uint8_t asciiFromEbcdic[256]={
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 00..0f */
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f, /* 10..1f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, /* 20..2f */
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a, /* 30..3f */

    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, /* 40..4f */
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e, /* 50..5f */
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, /* 60..6f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, /* 70..7f */

    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 80..8f */
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 90..9f */
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, /* a0..af */
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00, /* b0..bf */

    0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c0..cf */
    0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d0..df */
    0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e0..ef */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* f0..ff */
};
84 
/*
 * ASCII code -> EBCDIC byte, indexed by the ASCII code.
 * An entry of 0x00 (other than at index 0) means "no mapping".
 * Only indexes 00..7f are listed; the remaining 128 entries (80..ff)
 * are implicitly zero-initialized, i.e. they have no mapping.
 */
static const uint8_t ebcdicFromAscii[256]={
    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 00..0f */
    0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, /* 10..1f */
    0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, /* 20..2f */
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f, /* 30..3f */

    0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, /* 40..4f */
    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d, /* 50..5f */
    0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 60..6f */
    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07  /* 70..7f */
};
106 
107 /*
108  * Bit sets indicating which characters of the ASCII repertoire
109  * (by ASCII/Unicode code) are "invariant".
110  * See utypes.h for more details.
111  *
112  * As invariant are considered the characters of the ASCII repertoire except
113  * for the following:
114  * 21  '!' <exclamation mark>
115  * 23  '#' <number sign>
116  * 24  '$' <dollar sign>
117  *
118  * 40  '@' <commercial at>
119  *
120  * 5b  '[' <left bracket>
121  * 5c  '\' <backslash>
122  * 5d  ']' <right bracket>
123  * 5e  '^' <circumflex>
124  *
125  * 60  '`' <grave accent>
126  *
127  * 7b  '{' <left brace>
128  * 7c  '|' <vertical line>
129  * 7d  '}' <right brace>
130  * 7e  '~' <tilde>
131  */
/*
 * One bit per ASCII code point: word index is (code>>5), bit index is
 * (code&0x1f). A set bit marks the code point as invariant.
 */
static const uint32_t invariantChars[4]={
    0xfffffbff, /* 00..1f: all except 0a */
    0xffffffe5, /* 20..3f: all except 21 23 24 */
    0x87fffffe, /* 40..5f: all except 40 5b..5e */
    0x87fffffe  /* 60..7f: all except 60 7b..7e */
};

/*
 * Invariant test for unsigned types (or values known to be non-negative)
 * holding ASCII-family character values.
 * Values above 0x7f are never invariant; the table is only consulted for
 * 0..0x7f. The argument is evaluated more than once, so pass a plain
 * variable rather than an expression with side effects.
 */
#define UCHAR_IS_INVARIANT(c) \
    (((c)<=0x7f) && ((invariantChars[(c)>>5]>>((c)&0x1f))&(uint32_t)1)!=0)

/* invariant test for signed types: rejects negative values first */
#define SCHAR_IS_INVARIANT(c) \
    ((0<=(c)) && UCHAR_IS_INVARIANT(c))
147 
/*
 * Map between platform-charset bytes and ASCII/Unicode code values.
 * ASCII-family platforms: identity (the argument is parenthesized so that
 * compound expressions keep their grouping after expansion).
 * EBCDIC-family platforms: lookup in the conversion tables, where a result
 * of 0 for a non-zero input means "no mapping".
 */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#define CHAR_TO_UCHAR(c) (c)
#define UCHAR_TO_CHAR(c) (c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#define CHAR_TO_UCHAR(u) asciiFromEbcdic[(u)]
#define UCHAR_TO_CHAR(u) ebcdicFromAscii[(u)]
#else
#   error U_CHARSET_FAMILY is not valid
#endif
157 
158 
159 U_CAPI void U_EXPORT2
u_charsToUChars(const char * cs,UChar * us,int32_t length)160 u_charsToUChars(const char *cs, UChar *us, int32_t length) {
161     UChar u;
162     uint8_t c;
163 
164     /*
165      * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
166      * For EBCDIC systems, this works for characters with codes from
167      * codepages 37 and 1047 or compatible.
168      */
169     while(length>0) {
170         c=(uint8_t)(*cs++);
171         u=(UChar)CHAR_TO_UCHAR(c);
172         U_ASSERT((u!=0 || c==0)); /* only invariant chars converted? */
173         *us++=u;
174         --length;
175     }
176 }
177 
178 U_CAPI void U_EXPORT2
u_UCharsToChars(const UChar * us,char * cs,int32_t length)179 u_UCharsToChars(const UChar *us, char *cs, int32_t length) {
180     UChar u;
181 
182     while(length>0) {
183         u=*us++;
184         if(!UCHAR_IS_INVARIANT(u)) {
185             U_ASSERT(FALSE); /* Variant characters were used. These are not portable in ICU. */
186             u=0;
187         }
188         *cs++=(char)UCHAR_TO_CHAR(u);
189         --length;
190     }
191 }
192 
193 U_CAPI UBool U_EXPORT2
uprv_isInvariantString(const char * s,int32_t length)194 uprv_isInvariantString(const char *s, int32_t length) {
195     uint8_t c;
196 
197     for(;;) {
198         if(length<0) {
199             /* NUL-terminated */
200             c=(uint8_t)*s++;
201             if(c==0) {
202                 break;
203             }
204         } else {
205             /* count length */
206             if(length==0) {
207                 break;
208             }
209             --length;
210             c=(uint8_t)*s++;
211             if(c==0) {
212                 continue; /* NUL is invariant */
213             }
214         }
215         /* c!=0 now, one branch below checks c==0 for variant characters */
216 
217         /*
218          * no assertions here because these functions are legitimately called
219          * for strings with variant characters
220          */
221 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
222         if(!UCHAR_IS_INVARIANT(c)) {
223             return FALSE; /* found a variant char */
224         }
225 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
226         c=CHAR_TO_UCHAR(c);
227         if(c==0 || !UCHAR_IS_INVARIANT(c)) {
228             return FALSE; /* found a variant char */
229         }
230 #else
231 #   error U_CHARSET_FAMILY is not valid
232 #endif
233     }
234     return TRUE;
235 }
236 
237 U_CAPI UBool U_EXPORT2
uprv_isInvariantUString(const UChar * s,int32_t length)238 uprv_isInvariantUString(const UChar *s, int32_t length) {
239     UChar c;
240 
241     for(;;) {
242         if(length<0) {
243             /* NUL-terminated */
244             c=*s++;
245             if(c==0) {
246                 break;
247             }
248         } else {
249             /* count length */
250             if(length==0) {
251                 break;
252             }
253             --length;
254             c=*s++;
255         }
256 
257         /*
258          * no assertions here because these functions are legitimately called
259          * for strings with variant characters
260          */
261         if(!UCHAR_IS_INVARIANT(c)) {
262             return FALSE; /* found a variant char */
263         }
264     }
265     return TRUE;
266 }
267 
268 /* UDataSwapFn implementations used in udataswp.c ------- */
269 
270 /* convert ASCII to EBCDIC and verify that all characters are invariant */
271 U_CAPI int32_t U_EXPORT2
uprv_ebcdicFromAscii(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)272 uprv_ebcdicFromAscii(const UDataSwapper *ds,
273                      const void *inData, int32_t length, void *outData,
274                      UErrorCode *pErrorCode) {
275     const uint8_t *s;
276     uint8_t *t;
277     uint8_t c;
278 
279     int32_t count;
280 
281     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
282         return 0;
283     }
284     if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
285         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
286         return 0;
287     }
288 
289     /* setup and swapping */
290     s=(const uint8_t *)inData;
291     t=(uint8_t *)outData;
292     count=length;
293     while(count>0) {
294         c=*s++;
295         if(!UCHAR_IS_INVARIANT(c)) {
296             udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
297                              length, length-count);
298             *pErrorCode=U_INVALID_CHAR_FOUND;
299             return 0;
300         }
301         *t++=ebcdicFromAscii[c];
302         --count;
303     }
304 
305     return length;
306 }
307 
308 /* this function only checks and copies ASCII strings without conversion */
309 U_CFUNC int32_t
uprv_copyAscii(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)310 uprv_copyAscii(const UDataSwapper *ds,
311                const void *inData, int32_t length, void *outData,
312                UErrorCode *pErrorCode) {
313     const uint8_t *s;
314     uint8_t c;
315 
316     int32_t count;
317 
318     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
319         return 0;
320     }
321     if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
322         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
323         return 0;
324     }
325 
326     /* setup and checking */
327     s=(const uint8_t *)inData;
328     count=length;
329     while(count>0) {
330         c=*s++;
331         if(!UCHAR_IS_INVARIANT(c)) {
332             udata_printError(ds, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
333                              length, length-count);
334             *pErrorCode=U_INVALID_CHAR_FOUND;
335             return 0;
336         }
337         --count;
338     }
339 
340     if(length>0 && inData!=outData) {
341         uprv_memcpy(outData, inData, length);
342     }
343 
344     return length;
345 }
346 
347 /* convert EBCDIC to ASCII and verify that all characters are invariant */
348 U_CFUNC int32_t
uprv_asciiFromEbcdic(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)349 uprv_asciiFromEbcdic(const UDataSwapper *ds,
350                      const void *inData, int32_t length, void *outData,
351                      UErrorCode *pErrorCode) {
352     const uint8_t *s;
353     uint8_t *t;
354     uint8_t c;
355 
356     int32_t count;
357 
358     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
359         return 0;
360     }
361     if(ds==NULL || inData==NULL || length<0 ||  (length>0 && outData==NULL)) {
362         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
363         return 0;
364     }
365 
366     /* setup and swapping */
367     s=(const uint8_t *)inData;
368     t=(uint8_t *)outData;
369     count=length;
370     while(count>0) {
371         c=*s++;
372         if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
373             udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
374                              length, length-count);
375             *pErrorCode=U_INVALID_CHAR_FOUND;
376             return 0;
377         }
378         *t++=c;
379         --count;
380     }
381 
382     return length;
383 }
384 
385 /* this function only checks and copies EBCDIC strings without conversion */
386 U_CFUNC int32_t
uprv_copyEbcdic(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)387 uprv_copyEbcdic(const UDataSwapper *ds,
388                 const void *inData, int32_t length, void *outData,
389                 UErrorCode *pErrorCode) {
390     const uint8_t *s;
391     uint8_t c;
392 
393     int32_t count;
394 
395     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
396         return 0;
397     }
398     if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
399         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
400         return 0;
401     }
402 
403     /* setup and checking */
404     s=(const uint8_t *)inData;
405     count=length;
406     while(count>0) {
407         c=*s++;
408         if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
409             udata_printError(ds, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
410                              length, length-count);
411             *pErrorCode=U_INVALID_CHAR_FOUND;
412             return 0;
413         }
414         --count;
415     }
416 
417     if(length>0 && inData!=outData) {
418         uprv_memcpy(outData, inData, length);
419     }
420 
421     return length;
422 }
423 
424 /* compare invariant strings; variant characters compare less than others and unlike each other */
425 U_CFUNC int32_t
uprv_compareInvAscii(const UDataSwapper * ds,const char * outString,int32_t outLength,const UChar * localString,int32_t localLength)426 uprv_compareInvAscii(const UDataSwapper *ds,
427                      const char *outString, int32_t outLength,
428                      const UChar *localString, int32_t localLength) {
429     int32_t minLength;
430     UChar32 c1, c2;
431     uint8_t c;
432 
433     if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
434         return 0;
435     }
436 
437     if(outLength<0) {
438         outLength=(int32_t)uprv_strlen(outString);
439     }
440     if(localLength<0) {
441         localLength=u_strlen(localString);
442     }
443 
444     minLength= outLength<localLength ? outLength : localLength;
445 
446     while(minLength>0) {
447         c=(uint8_t)*outString++;
448         if(UCHAR_IS_INVARIANT(c)) {
449             c1=c;
450         } else {
451             c1=-1;
452         }
453 
454         c2=*localString++;
455         if(!UCHAR_IS_INVARIANT(c2)) {
456             c1=-2;
457         }
458 
459         if((c1-=c2)!=0) {
460             return c1;
461         }
462 
463         --minLength;
464     }
465 
466     /* strings start with same prefix, compare lengths */
467     return outLength-localLength;
468 }
469 
470 U_CFUNC int32_t
uprv_compareInvEbcdic(const UDataSwapper * ds,const char * outString,int32_t outLength,const UChar * localString,int32_t localLength)471 uprv_compareInvEbcdic(const UDataSwapper *ds,
472                       const char *outString, int32_t outLength,
473                       const UChar *localString, int32_t localLength) {
474     int32_t minLength;
475     UChar32 c1, c2;
476     uint8_t c;
477 
478     if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
479         return 0;
480     }
481 
482     if(outLength<0) {
483         outLength=(int32_t)uprv_strlen(outString);
484     }
485     if(localLength<0) {
486         localLength=u_strlen(localString);
487     }
488 
489     minLength= outLength<localLength ? outLength : localLength;
490 
491     while(minLength>0) {
492         c=(uint8_t)*outString++;
493         if(c==0) {
494             c1=0;
495         } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) {
496             /* c1 is set */
497         } else {
498             c1=-1;
499         }
500 
501         c2=*localString++;
502         if(!UCHAR_IS_INVARIANT(c2)) {
503             c1=-2;
504         }
505 
506         if((c1-=c2)!=0) {
507             return c1;
508         }
509 
510         --minLength;
511     }
512 
513     /* strings start with same prefix, compare lengths */
514     return outLength-localLength;
515 }
516