1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uinvchar.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:2
12 *
13 * created on: 2004sep14
14 * created by: Markus W. Scherer
15 *
16 * Functions for handling invariant characters, moved here from putil.c
17 * for better modularization.
18 */
19
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
22 #include "udataswp.h"
23 #include "cstring.h"
24 #include "cmemory.h"
25 #include "uassert.h"
26 #include "uinvchar.h"
27
28 /* invariant-character handling --------------------------------------------- */
29
30 /*
31 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
32 * appropriately for most EBCDIC codepages.
33 *
34 * They currently also map most other ASCII graphic characters,
35 * appropriately for codepages 37 and 1047.
36 * Exceptions: The characters for []^ have different codes in 37 & 1047.
37 * Both versions are mapped to ASCII.
38 *
*        ASCII   37   1047
*   [     5B     BA    AD
*   ]     5D     BB    BD
*   ^     5E     B0    5F
43 *
44 * There are no mappings for variant characters from Unicode to EBCDIC.
45 *
46 * Currently, C0 control codes are also included in these maps.
47 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
48 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
49 * but there is no mapping for ASCII LF back to EBCDIC.
50 *
*        ASCII   EBCDIC   S/390-OE
*   LF    0A       25        15
*   NEL   85       15        25
54 *
55 * The maps below explicitly exclude the variant
56 * control and graphical characters that are in ASCII-based
57 * codepages at 0x80 and above.
58 * "No mapping" is expressed by mapping to a 00 byte.
59 *
60 * These tables do not establish a converter or a codepage.
61 */
62
/*
 * Lookup table indexed by an EBCDIC byte value; each entry is the
 * corresponding ASCII code, or 00 where there is no mapping
 * (index 00 itself maps NUL to NUL).
 * Each row holds 16 consecutive EBCDIC codes; the trailing comment gives
 * the EBCDIC code of the row's first entry.
 */
static const uint8_t asciiFromEbcdic[256]={
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 00 */
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f, /* 10 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, /* 20 */
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a, /* 30 */

    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, /* 40 */
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e, /* 50 */
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, /* 60 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, /* 70 */

    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 80 */
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 90 */
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, /* a0 */
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00, /* b0 */

    0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c0 */
    0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d0 */
    0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e0 */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* f0 */
};
84
/*
 * Lookup table indexed by an ASCII code; each entry is the corresponding
 * EBCDIC byte value, or 00 where there is no mapping
 * (index 00 itself maps NUL to NUL).
 * The entry for ASCII LF (0a) is 00, and so are all entries for 80..ff.
 * Each row holds 16 consecutive codes; the trailing comment gives the
 * code of the row's first entry.
 */
static const uint8_t ebcdicFromAscii[256]={
    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 00 */
    0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, /* 10 */
    0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, /* 20 */
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f, /* 30 */

    0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, /* 40 */
    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d, /* 50 */
    0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 60 */
    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07, /* 70 */

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0 */

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* e0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* f0 */
};
106
/*
 * Bit sets indicating which characters of the ASCII repertoire
 * (by ASCII/Unicode code) are "invariant".
 * See utypes.h for more details.
 *
 * Word i covers the 32 codes starting at 32*i; code c is invariant
 * if and only if bit (c&0x1f) of word (c>>5) is set.
 *
 * As invariant are considered the characters of the ASCII repertoire except
 * for the following:
 * 0a      <line feed>
 *
 * 21 '!' <exclamation mark>
 * 23 '#' <number sign>
 * 24 '$' <dollar sign>
 *
 * 40 '@' <commercial at>
 *
 * 5b '[' <left bracket>
 * 5c '\' <backslash>
 * 5d ']' <right bracket>
 * 5e '^' <circumflex>
 *
 * 60 '`' <grave accent>
 *
 * 7b '{' <left brace>
 * 7c '|' <vertical line>
 * 7d '}' <right brace>
 * 7e '~' <tilde>
 */
static const uint32_t invariantChars[4]={
    0xfffffbff, /* 00..1f but not 0a */
    0xffffffe5, /* 20..3f but not 21 23 24 */
    0x87fffffe, /* 40..5f but not 40 5b..5e */
    0x87fffffe  /* 60..7f but not 60 7b..7e */
};

/*
 * test unsigned types (or values known to be non-negative) for invariant characters,
 * tests ASCII-family character values
 *
 * Values above 0x7f are rejected before the table is indexed.
 * The argument is expanded several times, so it must not have side effects.
 */
#define UCHAR_IS_INVARIANT(c) (((c)<=0x7f) && (invariantChars[(c)>>5]&((uint32_t)1<<((c)&0x1f)))!=0)

/* test signed types for invariant characters, adds test for non-negative values */
#define SCHAR_IS_INVARIANT(c) ((0<=(c)) && UCHAR_IS_INVARIANT(c))

/*
 * Map between a char in the platform charset family and its ASCII/Unicode code.
 * On ASCII-family platforms this is the identity; the argument is parenthesized
 * so that an expression argument keeps its grouping inside a larger expression.
 * On EBCDIC-family platforms the argument is used as an index into the
 * 256-entry mapping tables.
 */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#define CHAR_TO_UCHAR(c) (c)
#define UCHAR_TO_CHAR(c) (c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#define CHAR_TO_UCHAR(u) asciiFromEbcdic[u]
#define UCHAR_TO_CHAR(u) ebcdicFromAscii[u]
#else
#   error U_CHARSET_FAMILY is not valid
#endif
157
158
159 U_CAPI void U_EXPORT2
u_charsToUChars(const char * cs,UChar * us,int32_t length)160 u_charsToUChars(const char *cs, UChar *us, int32_t length) {
161 UChar u;
162 uint8_t c;
163
164 /*
165 * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
166 * For EBCDIC systems, this works for characters with codes from
167 * codepages 37 and 1047 or compatible.
168 */
169 while(length>0) {
170 c=(uint8_t)(*cs++);
171 u=(UChar)CHAR_TO_UCHAR(c);
172 U_ASSERT((u!=0 || c==0)); /* only invariant chars converted? */
173 *us++=u;
174 --length;
175 }
176 }
177
178 U_CAPI void U_EXPORT2
u_UCharsToChars(const UChar * us,char * cs,int32_t length)179 u_UCharsToChars(const UChar *us, char *cs, int32_t length) {
180 UChar u;
181
182 while(length>0) {
183 u=*us++;
184 if(!UCHAR_IS_INVARIANT(u)) {
185 U_ASSERT(FALSE); /* Variant characters were used. These are not portable in ICU. */
186 u=0;
187 }
188 *cs++=(char)UCHAR_TO_CHAR(u);
189 --length;
190 }
191 }
192
193 U_CAPI UBool U_EXPORT2
uprv_isInvariantString(const char * s,int32_t length)194 uprv_isInvariantString(const char *s, int32_t length) {
195 uint8_t c;
196
197 for(;;) {
198 if(length<0) {
199 /* NUL-terminated */
200 c=(uint8_t)*s++;
201 if(c==0) {
202 break;
203 }
204 } else {
205 /* count length */
206 if(length==0) {
207 break;
208 }
209 --length;
210 c=(uint8_t)*s++;
211 if(c==0) {
212 continue; /* NUL is invariant */
213 }
214 }
215 /* c!=0 now, one branch below checks c==0 for variant characters */
216
217 /*
218 * no assertions here because these functions are legitimately called
219 * for strings with variant characters
220 */
221 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
222 if(!UCHAR_IS_INVARIANT(c)) {
223 return FALSE; /* found a variant char */
224 }
225 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
226 c=CHAR_TO_UCHAR(c);
227 if(c==0 || !UCHAR_IS_INVARIANT(c)) {
228 return FALSE; /* found a variant char */
229 }
230 #else
231 # error U_CHARSET_FAMILY is not valid
232 #endif
233 }
234 return TRUE;
235 }
236
237 U_CAPI UBool U_EXPORT2
uprv_isInvariantUString(const UChar * s,int32_t length)238 uprv_isInvariantUString(const UChar *s, int32_t length) {
239 UChar c;
240
241 for(;;) {
242 if(length<0) {
243 /* NUL-terminated */
244 c=*s++;
245 if(c==0) {
246 break;
247 }
248 } else {
249 /* count length */
250 if(length==0) {
251 break;
252 }
253 --length;
254 c=*s++;
255 }
256
257 /*
258 * no assertions here because these functions are legitimately called
259 * for strings with variant characters
260 */
261 if(!UCHAR_IS_INVARIANT(c)) {
262 return FALSE; /* found a variant char */
263 }
264 }
265 return TRUE;
266 }
267
268 /* UDataSwapFn implementations used in udataswp.c ------- */
269
270 /* convert ASCII to EBCDIC and verify that all characters are invariant */
271 U_CAPI int32_t U_EXPORT2
uprv_ebcdicFromAscii(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)272 uprv_ebcdicFromAscii(const UDataSwapper *ds,
273 const void *inData, int32_t length, void *outData,
274 UErrorCode *pErrorCode) {
275 const uint8_t *s;
276 uint8_t *t;
277 uint8_t c;
278
279 int32_t count;
280
281 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
282 return 0;
283 }
284 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
285 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
286 return 0;
287 }
288
289 /* setup and swapping */
290 s=(const uint8_t *)inData;
291 t=(uint8_t *)outData;
292 count=length;
293 while(count>0) {
294 c=*s++;
295 if(!UCHAR_IS_INVARIANT(c)) {
296 udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
297 length, length-count);
298 *pErrorCode=U_INVALID_CHAR_FOUND;
299 return 0;
300 }
301 *t++=ebcdicFromAscii[c];
302 --count;
303 }
304
305 return length;
306 }
307
308 /* this function only checks and copies ASCII strings without conversion */
309 U_CFUNC int32_t
uprv_copyAscii(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)310 uprv_copyAscii(const UDataSwapper *ds,
311 const void *inData, int32_t length, void *outData,
312 UErrorCode *pErrorCode) {
313 const uint8_t *s;
314 uint8_t c;
315
316 int32_t count;
317
318 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
319 return 0;
320 }
321 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
322 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
323 return 0;
324 }
325
326 /* setup and checking */
327 s=(const uint8_t *)inData;
328 count=length;
329 while(count>0) {
330 c=*s++;
331 if(!UCHAR_IS_INVARIANT(c)) {
332 udata_printError(ds, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
333 length, length-count);
334 *pErrorCode=U_INVALID_CHAR_FOUND;
335 return 0;
336 }
337 --count;
338 }
339
340 if(length>0 && inData!=outData) {
341 uprv_memcpy(outData, inData, length);
342 }
343
344 return length;
345 }
346
347 /* convert EBCDIC to ASCII and verify that all characters are invariant */
348 U_CFUNC int32_t
uprv_asciiFromEbcdic(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)349 uprv_asciiFromEbcdic(const UDataSwapper *ds,
350 const void *inData, int32_t length, void *outData,
351 UErrorCode *pErrorCode) {
352 const uint8_t *s;
353 uint8_t *t;
354 uint8_t c;
355
356 int32_t count;
357
358 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
359 return 0;
360 }
361 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
362 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
363 return 0;
364 }
365
366 /* setup and swapping */
367 s=(const uint8_t *)inData;
368 t=(uint8_t *)outData;
369 count=length;
370 while(count>0) {
371 c=*s++;
372 if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
373 udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
374 length, length-count);
375 *pErrorCode=U_INVALID_CHAR_FOUND;
376 return 0;
377 }
378 *t++=c;
379 --count;
380 }
381
382 return length;
383 }
384
385 /* this function only checks and copies EBCDIC strings without conversion */
386 U_CFUNC int32_t
uprv_copyEbcdic(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)387 uprv_copyEbcdic(const UDataSwapper *ds,
388 const void *inData, int32_t length, void *outData,
389 UErrorCode *pErrorCode) {
390 const uint8_t *s;
391 uint8_t c;
392
393 int32_t count;
394
395 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
396 return 0;
397 }
398 if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
399 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
400 return 0;
401 }
402
403 /* setup and checking */
404 s=(const uint8_t *)inData;
405 count=length;
406 while(count>0) {
407 c=*s++;
408 if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
409 udata_printError(ds, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
410 length, length-count);
411 *pErrorCode=U_INVALID_CHAR_FOUND;
412 return 0;
413 }
414 --count;
415 }
416
417 if(length>0 && inData!=outData) {
418 uprv_memcpy(outData, inData, length);
419 }
420
421 return length;
422 }
423
424 /* compare invariant strings; variant characters compare less than others and unlike each other */
425 U_CFUNC int32_t
uprv_compareInvAscii(const UDataSwapper * ds,const char * outString,int32_t outLength,const UChar * localString,int32_t localLength)426 uprv_compareInvAscii(const UDataSwapper *ds,
427 const char *outString, int32_t outLength,
428 const UChar *localString, int32_t localLength) {
429 int32_t minLength;
430 UChar32 c1, c2;
431 uint8_t c;
432
433 if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
434 return 0;
435 }
436
437 if(outLength<0) {
438 outLength=(int32_t)uprv_strlen(outString);
439 }
440 if(localLength<0) {
441 localLength=u_strlen(localString);
442 }
443
444 minLength= outLength<localLength ? outLength : localLength;
445
446 while(minLength>0) {
447 c=(uint8_t)*outString++;
448 if(UCHAR_IS_INVARIANT(c)) {
449 c1=c;
450 } else {
451 c1=-1;
452 }
453
454 c2=*localString++;
455 if(!UCHAR_IS_INVARIANT(c2)) {
456 c2=-2;
457 }
458
459 if((c1-=c2)!=0) {
460 return c1;
461 }
462
463 --minLength;
464 }
465
466 /* strings start with same prefix, compare lengths */
467 return outLength-localLength;
468 }
469
470 U_CFUNC int32_t
uprv_compareInvEbcdic(const UDataSwapper * ds,const char * outString,int32_t outLength,const UChar * localString,int32_t localLength)471 uprv_compareInvEbcdic(const UDataSwapper *ds,
472 const char *outString, int32_t outLength,
473 const UChar *localString, int32_t localLength) {
474 int32_t minLength;
475 UChar32 c1, c2;
476 uint8_t c;
477
478 if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
479 return 0;
480 }
481
482 if(outLength<0) {
483 outLength=(int32_t)uprv_strlen(outString);
484 }
485 if(localLength<0) {
486 localLength=u_strlen(localString);
487 }
488
489 minLength= outLength<localLength ? outLength : localLength;
490
491 while(minLength>0) {
492 c=(uint8_t)*outString++;
493 if(c==0) {
494 c1=0;
495 } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) {
496 /* c1 is set */
497 } else {
498 c1=-1;
499 }
500
501 c2=*localString++;
502 if(!UCHAR_IS_INVARIANT(c2)) {
503 c2=-2;
504 }
505
506 if((c1-=c2)!=0) {
507 return c1;
508 }
509
510 --minLength;
511 }
512
513 /* strings start with same prefix, compare lengths */
514 return outLength-localLength;
515 }
516
517 U_CAPI int32_t U_EXPORT2
uprv_compareInvEbcdicAsAscii(const char * s1,const char * s2)518 uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) {
519 int32_t c1, c2;
520
521 for(;; ++s1, ++s2) {
522 c1=(uint8_t)*s1;
523 c2=(uint8_t)*s2;
524 if(c1!=c2) {
525 if(c1!=0 && ((c1=asciiFromEbcdic[c1])==0 || !UCHAR_IS_INVARIANT(c1))) {
526 c1=-(int32_t)(uint8_t)*s1;
527 }
528 if(c2!=0 && ((c2=asciiFromEbcdic[c2])==0 || !UCHAR_IS_INVARIANT(c2))) {
529 c2=-(int32_t)(uint8_t)*s2;
530 }
531 return c1-c2;
532 } else if(c1==0) {
533 return 0;
534 }
535 }
536 }
537
538
539 U_INTERNAL uint8_t* U_EXPORT2
uprv_aestrncpy(uint8_t * dst,const uint8_t * src,int32_t n)540 uprv_aestrncpy(uint8_t *dst, const uint8_t *src, int32_t n)
541 {
542 uint8_t *orig_dst = dst;
543
544 if(n==-1) {
545 n = uprv_strlen((const char*)src)+1; /* copy NUL */
546 }
547 /* copy non-null */
548 while(*src && n>0) {
549 *(dst++) = asciiFromEbcdic[*(src++)];
550 n--;
551 }
552 /* pad */
553 while(n>0) {
554 *(dst++) = 0;
555 n--;
556 }
557 return orig_dst;
558 }
559
560 U_INTERNAL uint8_t* U_EXPORT2
uprv_eastrncpy(uint8_t * dst,const uint8_t * src,int32_t n)561 uprv_eastrncpy(uint8_t *dst, const uint8_t *src, int32_t n)
562 {
563 uint8_t *orig_dst = dst;
564
565 if(n==-1) {
566 n = uprv_strlen((const char*)src)+1; /* copy NUL */
567 }
568 /* copy non-null */
569 while(*src && n>0) {
570 char ch = ebcdicFromAscii[*(src++)];
571 if(ch == 0) {
572 ch = ebcdicFromAscii[0x3f]; /* questionmark (subchar) */
573 }
574 *(dst++) = ch;
575 n--;
576 }
577 /* pad */
578 while(n>0) {
579 *(dst++) = 0;
580 n--;
581 }
582 return orig_dst;
583 }
584
585