1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u7.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24
/* UTF-7 -------------------------------------------------------------------- */

/*
 * UTF-7 is a stateful encoding of Unicode.
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
 * It was intended for use in Internet email systems, using in its bytewise
 * encoding only a subset of 7-bit US-ASCII.
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but is still
 * occasionally used.
 *
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters
 * in set O as defined in the RFC (see below) may be encoded directly but are
 * not allowed in, e.g., email headers.
 * By default, the ICU UTF-7 converter encodes set O directly.
 * By choosing the option "version=1", set O will be escaped instead.
 * For example:
 *     utf7Converter=ucnv_open("UTF-7,version=1");
 *
 * For details about email headers see RFC 2047.
 */
46
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
/*
 * Character-class tests for UTF-7.
 * Each range test subtracts the first member of the range and compares the
 * result, truncated to uint8_t, against the range length. Because of that
 * truncation the macros are only meaningful for byte values (0..255);
 * a larger argument wraps around and may match by accident.
 * All macros evaluate their argument more than once.
 */

/* TRUE if c is in set D: letters, digits and '(),-./:? */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )

/* TRUE if c is in set O: !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )

/* whitespace/control characters that may appear directly */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

#define PLUS  43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
/* (the range test covers 32..125, so TILDE and DEL are excluded without a separate comparison) */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
95
96 /* encode directly sets D and O and CR LF SP TAB */
97 static const UBool encodeDirectlyMaximum[128]={
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101
102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
107
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
110 };
111
112 /* encode directly set D and CR LF SP TAB but not set O */
113 static const UBool encodeDirectlyRestricted[128]={
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117
118 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
120
121 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
123
124 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
126 };
127
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit:
 * A-Z, a-z, 0-9, '+', '/'.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
141
/*
 * Maps a US-ASCII code (0..127) to its base64 digit value, or to a sentinel:
 *   0..63  value of a base64 digit
 *   -1     legal in UTF-7 but not a base64 digit
 *   -2     the minus sign
 *   -3     illegal in UTF-7 (most C0 controls, backslash, tilde, DEL)
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
161
/*
 * converter status values:
 *
 * toUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (-1..7)
 * 15..0  bits (up to 14 bits incoming base64)
 *
 * fromUnicodeStatus:
 * 31..28 version (0: set O direct  1: set O escaped)
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (0..2)
 *  7..0  bits (6 bits outgoing base64)
 *
 */
177
178 static void
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
180 if(choice<=UCNV_RESET_TO_UNICODE) {
181 /* reset toUnicode */
182 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
183 cnv->toULength=0;
184 }
185 if(choice!=UCNV_RESET_TO_UNICODE) {
186 /* reset fromUnicode */
187 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
188 }
189 }
190
191 static void
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)192 _UTF7Open(UConverter *cnv,
193 UConverterLoadArgs *pArgs,
194 UErrorCode *pErrorCode) {
195 if(UCNV_GET_VERSION(cnv)<=1) {
196 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
197 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
198 _UTF7Reset(cnv, UCNV_RESET_BOTH);
199 } else {
200 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
201 }
202 }
203
204 static void
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)205 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
206 UErrorCode *pErrorCode) {
207 UConverter *cnv;
208 const uint8_t *source, *sourceLimit;
209 UChar *target;
210 const UChar *targetLimit;
211 int32_t *offsets;
212
213 uint8_t *bytes;
214 uint8_t byteIndex;
215
216 int32_t length, targetCapacity;
217
218 /* UTF-7 state */
219 uint16_t bits;
220 int8_t base64Counter;
221 UBool inDirectMode;
222
223 int8_t base64Value;
224
225 int32_t sourceIndex, nextSourceIndex;
226
227 uint8_t b;
228 /* set up the local pointers */
229 cnv=pArgs->converter;
230
231 source=(const uint8_t *)pArgs->source;
232 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
233 target=pArgs->target;
234 targetLimit=pArgs->targetLimit;
235 offsets=pArgs->offsets;
236 /* get the state machine state */
237 {
238 uint32_t status=cnv->toUnicodeStatus;
239 inDirectMode=(UBool)((status>>24)&1);
240 base64Counter=(int8_t)(status>>16);
241 bits=(uint16_t)status;
242 }
243 bytes=cnv->toUBytes;
244 byteIndex=cnv->toULength;
245
246 /* sourceIndex=-1 if the current character began in the previous buffer */
247 sourceIndex=byteIndex==0 ? 0 : -1;
248 nextSourceIndex=0;
249
250 if(inDirectMode) {
251 directMode:
252 /*
253 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
254 * with their US-ASCII byte values.
255 * Backslash and Tilde and most control characters are not allowed in UTF-7.
256 * A plus sign starts Unicode (or "escape") Mode.
257 *
258 * In Direct Mode, only the sourceIndex is used.
259 */
260 byteIndex=0;
261 length=(int32_t)(sourceLimit-source);
262 targetCapacity=(int32_t)(targetLimit-target);
263 if(length>targetCapacity) {
264 length=targetCapacity;
265 }
266 while(length>0) {
267 b=*source++;
268 if(!isLegalUTF7(b)) {
269 /* illegal */
270 bytes[0]=b;
271 byteIndex=1;
272 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
273 break;
274 } else if(b!=PLUS) {
275 /* write directly encoded character */
276 *target++=b;
277 if(offsets!=NULL) {
278 *offsets++=sourceIndex++;
279 }
280 } else /* PLUS */ {
281 /* switch to Unicode mode */
282 nextSourceIndex=++sourceIndex;
283 inDirectMode=FALSE;
284 byteIndex=0;
285 bits=0;
286 base64Counter=-1;
287 goto unicodeMode;
288 }
289 --length;
290 }
291 if(source<sourceLimit && target>=targetLimit) {
292 /* target is full */
293 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
294 }
295 } else {
296 unicodeMode:
297 /*
298 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
299 * The base64 sequence ends with any character that is not in the base64 alphabet.
300 * A terminating minus sign is consumed.
301 *
302 * In Unicode Mode, the sourceIndex has the index to the start of the current
303 * base64 bytes, while nextSourceIndex is precisely parallel to source,
304 * keeping the index to the following byte.
305 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
306 */
307 while(source<sourceLimit) {
308 if(target<targetLimit) {
309 bytes[byteIndex++]=b=*source++;
310 ++nextSourceIndex;
311 if(b>=126) {
312 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
313 inDirectMode=TRUE;
314 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
315 break;
316 } else if((base64Value=fromBase64[b])>=0) {
317 /* collect base64 bytes into UChars */
318 switch(base64Counter) {
319 case -1: /* -1 is immediately after the + */
320 case 0:
321 bits=base64Value;
322 base64Counter=1;
323 break;
324 case 1:
325 case 3:
326 case 4:
327 case 6:
328 bits=(uint16_t)((bits<<6)|base64Value);
329 ++base64Counter;
330 break;
331 case 2:
332 *target++=(UChar)((bits<<4)|(base64Value>>2));
333 if(offsets!=NULL) {
334 *offsets++=sourceIndex;
335 sourceIndex=nextSourceIndex-1;
336 }
337 bytes[0]=b; /* keep this byte in case an error occurs */
338 byteIndex=1;
339 bits=(uint16_t)(base64Value&3);
340 base64Counter=3;
341 break;
342 case 5:
343 *target++=(UChar)((bits<<2)|(base64Value>>4));
344 if(offsets!=NULL) {
345 *offsets++=sourceIndex;
346 sourceIndex=nextSourceIndex-1;
347 }
348 bytes[0]=b; /* keep this byte in case an error occurs */
349 byteIndex=1;
350 bits=(uint16_t)(base64Value&15);
351 base64Counter=6;
352 break;
353 case 7:
354 *target++=(UChar)((bits<<6)|base64Value);
355 if(offsets!=NULL) {
356 *offsets++=sourceIndex;
357 sourceIndex=nextSourceIndex;
358 }
359 byteIndex=0;
360 bits=0;
361 base64Counter=0;
362 break;
363 default:
364 /* will never occur */
365 break;
366 }
367 } else if(base64Value==-2) {
368 /* minus sign terminates the base64 sequence */
369 inDirectMode=TRUE;
370 if(base64Counter==-1) {
371 /* +- i.e. a minus immediately following a plus */
372 *target++=PLUS;
373 if(offsets!=NULL) {
374 *offsets++=sourceIndex-1;
375 }
376 } else {
377 /* absorb the minus and leave the Unicode Mode */
378 if(bits!=0) {
379 /* bits are illegally left over, a UChar is incomplete */
380 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
381 break;
382 }
383 }
384 sourceIndex=nextSourceIndex;
385 goto directMode;
386 } else if(base64Value==-1) /* for any legal character except base64 and minus sign */ {
387 /* leave the Unicode Mode */
388 inDirectMode=TRUE;
389 if(base64Counter==-1) {
390 /* illegal: + immediately followed by something other than base64 or minus sign */
391 /* include the plus sign in the reported sequence */
392 --sourceIndex;
393 bytes[0]=PLUS;
394 bytes[1]=b;
395 byteIndex=2;
396 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
397 break;
398 } else if(bits==0) {
399 /* un-read the character in case it is a plus sign */
400 --source;
401 sourceIndex=nextSourceIndex-1;
402 goto directMode;
403 } else {
404 /* bits are illegally left over, a UChar is incomplete */
405 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
406 break;
407 }
408 } else /* base64Value==-3 for illegal characters */ {
409 /* illegal */
410 inDirectMode=TRUE;
411 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
412 break;
413 }
414 } else {
415 /* target is full */
416 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
417 break;
418 }
419 }
420 }
421
422 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
423 /*
424 * if we are in Unicode mode, then the byteIndex might not be 0,
425 * but that is ok if bits==0
426 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
427 * (not true for IMAP-mailbox-name where we must end in direct mode)
428 */
429 byteIndex=0;
430 }
431
432 /* set the converter state back into UConverter */
433 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
434 cnv->toULength=byteIndex;
435
436 /* write back the updated pointers */
437 pArgs->source=(const char *)source;
438 pArgs->target=target;
439 pArgs->offsets=offsets;
440 return;
441 }
442
443 static void
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)444 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
445 UErrorCode *pErrorCode) {
446 UConverter *cnv;
447 const UChar *source, *sourceLimit;
448 uint8_t *target, *targetLimit;
449 int32_t *offsets;
450
451 int32_t length, targetCapacity, sourceIndex;
452 UChar c;
453
454 /* UTF-7 state */
455 const UBool *encodeDirectly;
456 uint8_t bits;
457 int8_t base64Counter;
458 UBool inDirectMode;
459
460 /* set up the local pointers */
461 cnv=pArgs->converter;
462
463 /* set up the local pointers */
464 source=pArgs->source;
465 sourceLimit=pArgs->sourceLimit;
466 target=(uint8_t *)pArgs->target;
467 targetLimit=(uint8_t *)pArgs->targetLimit;
468 offsets=pArgs->offsets;
469
470 /* get the state machine state */
471 {
472 uint32_t status=cnv->fromUnicodeStatus;
473 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
474 inDirectMode=(UBool)((status>>24)&1);
475 base64Counter=(int8_t)(status>>16);
476 bits=(uint8_t)status;
477 }
478
479 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
480 sourceIndex=0;
481
482 if(inDirectMode) {
483 directMode:
484 length=(int32_t)(sourceLimit-source);
485 targetCapacity=(int32_t)(targetLimit-target);
486 if(length>targetCapacity) {
487 length=targetCapacity;
488 }
489 while(length>0) {
490 c=*source++;
491 /* currently always encode CR LF SP TAB directly */
492 if(c<=127 && encodeDirectly[c]) {
493 /* encode directly */
494 *target++=(uint8_t)c;
495 if(offsets!=NULL) {
496 *offsets++=sourceIndex++;
497 }
498 } else if(c==PLUS) {
499 /* output +- for + */
500 *target++=PLUS;
501 if(target<targetLimit) {
502 *target++=MINUS;
503 if(offsets!=NULL) {
504 *offsets++=sourceIndex;
505 *offsets++=sourceIndex++;
506 }
507 /* realign length and targetCapacity */
508 goto directMode;
509 } else {
510 if(offsets!=NULL) {
511 *offsets++=sourceIndex++;
512 }
513 cnv->charErrorBuffer[0]=MINUS;
514 cnv->charErrorBufferLength=1;
515 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
516 break;
517 }
518 } else {
519 /* un-read this character and switch to Unicode Mode */
520 --source;
521 *target++=PLUS;
522 if(offsets!=NULL) {
523 *offsets++=sourceIndex;
524 }
525 inDirectMode=FALSE;
526 base64Counter=0;
527 goto unicodeMode;
528 }
529 --length;
530 }
531 if(source<sourceLimit && target>=targetLimit) {
532 /* target is full */
533 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
534 }
535 } else {
536 unicodeMode:
537 while(source<sourceLimit) {
538 if(target<targetLimit) {
539 c=*source++;
540 if(c<=127 && encodeDirectly[c]) {
541 /* encode directly */
542 inDirectMode=TRUE;
543
544 /* trick: back out this character to make this easier */
545 --source;
546
547 /* terminate the base64 sequence */
548 if(base64Counter!=0) {
549 /* write remaining bits for the previous character */
550 *target++=toBase64[bits];
551 if(offsets!=NULL) {
552 *offsets++=sourceIndex-1;
553 }
554 }
555 if(fromBase64[c]!=-1) {
556 /* need to terminate with a minus */
557 if(target<targetLimit) {
558 *target++=MINUS;
559 if(offsets!=NULL) {
560 *offsets++=sourceIndex-1;
561 }
562 } else {
563 cnv->charErrorBuffer[0]=MINUS;
564 cnv->charErrorBufferLength=1;
565 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
566 break;
567 }
568 }
569 goto directMode;
570 } else {
571 /*
572 * base64 this character:
573 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
574 * and the bits of this character, each implicitly in UTF-16BE.
575 *
576 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
577 * character to the next. The actual 2 or 4 bits are shifted to the left edge
578 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
579 */
580 switch(base64Counter) {
581 case 0:
582 *target++=toBase64[c>>10];
583 if(target<targetLimit) {
584 *target++=toBase64[(c>>4)&0x3f];
585 if(offsets!=NULL) {
586 *offsets++=sourceIndex;
587 *offsets++=sourceIndex++;
588 }
589 } else {
590 if(offsets!=NULL) {
591 *offsets++=sourceIndex++;
592 }
593 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
594 cnv->charErrorBufferLength=1;
595 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
596 }
597 bits=(uint8_t)((c&15)<<2);
598 base64Counter=1;
599 break;
600 case 1:
601 *target++=toBase64[bits|(c>>14)];
602 if(target<targetLimit) {
603 *target++=toBase64[(c>>8)&0x3f];
604 if(target<targetLimit) {
605 *target++=toBase64[(c>>2)&0x3f];
606 if(offsets!=NULL) {
607 *offsets++=sourceIndex;
608 *offsets++=sourceIndex;
609 *offsets++=sourceIndex++;
610 }
611 } else {
612 if(offsets!=NULL) {
613 *offsets++=sourceIndex;
614 *offsets++=sourceIndex++;
615 }
616 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
617 cnv->charErrorBufferLength=1;
618 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
619 }
620 } else {
621 if(offsets!=NULL) {
622 *offsets++=sourceIndex++;
623 }
624 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
625 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
626 cnv->charErrorBufferLength=2;
627 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
628 }
629 bits=(uint8_t)((c&3)<<4);
630 base64Counter=2;
631 break;
632 case 2:
633 *target++=toBase64[bits|(c>>12)];
634 if(target<targetLimit) {
635 *target++=toBase64[(c>>6)&0x3f];
636 if(target<targetLimit) {
637 *target++=toBase64[c&0x3f];
638 if(offsets!=NULL) {
639 *offsets++=sourceIndex;
640 *offsets++=sourceIndex;
641 *offsets++=sourceIndex++;
642 }
643 } else {
644 if(offsets!=NULL) {
645 *offsets++=sourceIndex;
646 *offsets++=sourceIndex++;
647 }
648 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
649 cnv->charErrorBufferLength=1;
650 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
651 }
652 } else {
653 if(offsets!=NULL) {
654 *offsets++=sourceIndex++;
655 }
656 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
657 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
658 cnv->charErrorBufferLength=2;
659 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
660 }
661 bits=0;
662 base64Counter=0;
663 break;
664 default:
665 /* will never occur */
666 break;
667 }
668 }
669 } else {
670 /* target is full */
671 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
672 break;
673 }
674 }
675 }
676
677 if(pArgs->flush && source>=sourceLimit) {
678 /* flush remaining bits to the target */
679 if(!inDirectMode && base64Counter!=0) {
680 if(target<targetLimit) {
681 *target++=toBase64[bits];
682 if(offsets!=NULL) {
683 *offsets++=sourceIndex-1;
684 }
685 } else {
686 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
687 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
688 }
689 }
690 /* reset the state for the next conversion */
691 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
692 } else {
693 /* set the converter state back into UConverter */
694 cnv->fromUnicodeStatus=
695 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
696 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
697 }
698
699 /* write back the updated pointers */
700 pArgs->source=source;
701 pArgs->target=(char *)target;
702 pArgs->offsets=offsets;
703 return;
704 }
705
706 static const char *
_UTF7GetName(const UConverter * cnv)707 _UTF7GetName(const UConverter *cnv) {
708 switch(cnv->fromUnicodeStatus>>28) {
709 case 1:
710 return "UTF-7,version=1";
711 default:
712 return "UTF-7";
713 }
714 }
715
716 static const UConverterImpl _UTF7Impl={
717 UCNV_UTF7,
718
719 NULL,
720 NULL,
721
722 _UTF7Open,
723 NULL,
724 _UTF7Reset,
725
726 _UTF7ToUnicodeWithOffsets,
727 _UTF7ToUnicodeWithOffsets,
728 _UTF7FromUnicodeWithOffsets,
729 _UTF7FromUnicodeWithOffsets,
730 NULL,
731
732 NULL,
733 _UTF7GetName,
734 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
735 NULL,
736 ucnv_getCompleteUnicodeSet
737 };
738
739 static const UConverterStaticData _UTF7StaticData={
740 sizeof(UConverterStaticData),
741 "UTF-7",
742 0, /* TODO CCSID for UTF-7 */
743 UCNV_IBM, UCNV_UTF7,
744 1, 4,
745 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
746 FALSE, FALSE,
747 0,
748 0,
749 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
750 };
751
752 const UConverterSharedData _UTF7Data={
753 sizeof(UConverterSharedData), ~((uint32_t)0),
754 NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
755 0
756 };
757
/* IMAP mailbox name encoding ----------------------------------------------- */

/*
 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 * http://www.ietf.org/rfc/rfc2060.txt
 *
 * 5.1.3.  Mailbox International Naming Convention
 *
 * By convention, international mailbox names are specified using a
 * modified version of the UTF-7 encoding described in [UTF-7].  The
 * purpose of these modifications is to correct the following problems
 * with UTF-7:
 *
 *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 *       the common use of "+" in mailbox names, in particular USENET
 *       newsgroup names.
 *
 *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 *       conflicts with the use of "/" as a popular hierarchy delimiter.
 *
 *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 *       the use of "\" as a popular hierarchy delimiter.
 *
 *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 *       the use of "~" in some servers as a home directory indicator.
 *
 *    5) UTF-7 permits multiple alternate forms to represent the same
 *       string; in particular, printable US-ASCII characters can be
 *       represented in encoded form.
 *
 * In modified UTF-7, printable US-ASCII characters except for "&"
 * represent themselves; that is, characters with octet values 0x20-0x25
 * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 * octet sequence "&-".
 *
 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 * Unicode 16-bit octets) are represented in modified BASE64, with a
 * further modification from [UTF-7] that "," is used instead of "/".
 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 * character which can represent itself.
 *
 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 * is, a name that ends with a Unicode 16-bit octet MUST end with a
 * "-").
 *
 * For example, here is a mailbox name which mixes English, Japanese,
 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 */
807
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 * (These are the sets of plain UTF-7 per RFC 2152, repeated here for
 * reference; the modified UTF-7 used for IMAP mailbox names follows the
 * rules quoted from RFC 2060 above.)
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
830
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
/* (evaluates c twice) */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
/* the argument is parenthesized so that an expression such as a|b is compared as a whole */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* modified base64: digit 63 is ',' instead of '/'; digits 0..62 come from the UTF-7 tables */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* ',' decodes to 63 and '/' is treated as a non-base64 character (-1) */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
844
/*
 * converter status values:
 *
 * toUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (-1..7)
 * 15..0  bits (up to 14 bits incoming base64)
 *
 * fromUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (0..2)
 *  7..0  bits (6 bits outgoing base64)
 *
 * ignore bits 31..25
 */
860
861 static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)862 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
863 UErrorCode *pErrorCode) {
864 UConverter *cnv;
865 const uint8_t *source, *sourceLimit;
866 UChar *target;
867 const UChar *targetLimit;
868 int32_t *offsets;
869
870 uint8_t *bytes;
871 uint8_t byteIndex;
872
873 int32_t length, targetCapacity;
874
875 /* UTF-7 state */
876 uint16_t bits;
877 int8_t base64Counter;
878 UBool inDirectMode;
879
880 int8_t base64Value;
881
882 int32_t sourceIndex, nextSourceIndex;
883
884 UChar c;
885 uint8_t b;
886
887 /* set up the local pointers */
888 cnv=pArgs->converter;
889
890 source=(const uint8_t *)pArgs->source;
891 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
892 target=pArgs->target;
893 targetLimit=pArgs->targetLimit;
894 offsets=pArgs->offsets;
895 /* get the state machine state */
896 {
897 uint32_t status=cnv->toUnicodeStatus;
898 inDirectMode=(UBool)((status>>24)&1);
899 base64Counter=(int8_t)(status>>16);
900 bits=(uint16_t)status;
901 }
902 bytes=cnv->toUBytes;
903 byteIndex=cnv->toULength;
904
905 /* sourceIndex=-1 if the current character began in the previous buffer */
906 sourceIndex=byteIndex==0 ? 0 : -1;
907 nextSourceIndex=0;
908
909 if(inDirectMode) {
910 directMode:
911 /*
912 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
913 * with their US-ASCII byte values.
914 * An ampersand starts Unicode (or "escape") Mode.
915 *
916 * In Direct Mode, only the sourceIndex is used.
917 */
918 byteIndex=0;
919 length=(int32_t)(sourceLimit-source);
920 targetCapacity=(int32_t)(targetLimit-target);
921 if(length>targetCapacity) {
922 length=targetCapacity;
923 }
924 while(length>0) {
925 b=*source++;
926 if(!isLegalIMAP(b)) {
927 /* illegal */
928 bytes[0]=b;
929 byteIndex=1;
930 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
931 break;
932 } else if(b!=AMPERSAND) {
933 /* write directly encoded character */
934 *target++=b;
935 if(offsets!=NULL) {
936 *offsets++=sourceIndex++;
937 }
938 } else /* AMPERSAND */ {
939 /* switch to Unicode mode */
940 nextSourceIndex=++sourceIndex;
941 inDirectMode=FALSE;
942 byteIndex=0;
943 bits=0;
944 base64Counter=-1;
945 goto unicodeMode;
946 }
947 --length;
948 }
949 if(source<sourceLimit && target>=targetLimit) {
950 /* target is full */
951 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
952 }
953 } else {
954 unicodeMode:
955 /*
956 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
957 * The base64 sequence ends with any character that is not in the base64 alphabet.
958 * A terminating minus sign is consumed.
959 * US-ASCII must not be base64-ed.
960 *
961 * In Unicode Mode, the sourceIndex has the index to the start of the current
962 * base64 bytes, while nextSourceIndex is precisely parallel to source,
963 * keeping the index to the following byte.
964 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
965 */
966 while(source<sourceLimit) {
967 if(target<targetLimit) {
968 bytes[byteIndex++]=b=*source++;
969 ++nextSourceIndex;
970 if(b>0x7e) {
971 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
972 inDirectMode=TRUE;
973 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
974 break;
975 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
976 /* collect base64 bytes into UChars */
977 switch(base64Counter) {
978 case -1: /* -1 is immediately after the & */
979 case 0:
980 bits=base64Value;
981 base64Counter=1;
982 break;
983 case 1:
984 case 3:
985 case 4:
986 case 6:
987 bits=(uint16_t)((bits<<6)|base64Value);
988 ++base64Counter;
989 break;
990 case 2:
991 c=(UChar)((bits<<4)|(base64Value>>2));
992 if(isLegalIMAP(c)) {
993 /* illegal */
994 inDirectMode=TRUE;
995 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
996 goto endloop;
997 }
998 *target++=c;
999 if(offsets!=NULL) {
1000 *offsets++=sourceIndex;
1001 sourceIndex=nextSourceIndex-1;
1002 }
1003 bytes[0]=b; /* keep this byte in case an error occurs */
1004 byteIndex=1;
1005 bits=(uint16_t)(base64Value&3);
1006 base64Counter=3;
1007 break;
1008 case 5:
1009 c=(UChar)((bits<<2)|(base64Value>>4));
1010 if(isLegalIMAP(c)) {
1011 /* illegal */
1012 inDirectMode=TRUE;
1013 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1014 goto endloop;
1015 }
1016 *target++=c;
1017 if(offsets!=NULL) {
1018 *offsets++=sourceIndex;
1019 sourceIndex=nextSourceIndex-1;
1020 }
1021 bytes[0]=b; /* keep this byte in case an error occurs */
1022 byteIndex=1;
1023 bits=(uint16_t)(base64Value&15);
1024 base64Counter=6;
1025 break;
1026 case 7:
1027 c=(UChar)((bits<<6)|base64Value);
1028 if(isLegalIMAP(c)) {
1029 /* illegal */
1030 inDirectMode=TRUE;
1031 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1032 goto endloop;
1033 }
1034 *target++=c;
1035 if(offsets!=NULL) {
1036 *offsets++=sourceIndex;
1037 sourceIndex=nextSourceIndex;
1038 }
1039 byteIndex=0;
1040 bits=0;
1041 base64Counter=0;
1042 break;
1043 default:
1044 /* will never occur */
1045 break;
1046 }
1047 } else if(base64Value==-2) {
1048 /* minus sign terminates the base64 sequence */
1049 inDirectMode=TRUE;
1050 if(base64Counter==-1) {
1051 /* &- i.e. a minus immediately following an ampersand */
1052 *target++=AMPERSAND;
1053 if(offsets!=NULL) {
1054 *offsets++=sourceIndex-1;
1055 }
1056 } else {
1057 /* absorb the minus and leave the Unicode Mode */
1058 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1059 /* bits are illegally left over, a UChar is incomplete */
1060 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1061 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1062 break;
1063 }
1064 }
1065 sourceIndex=nextSourceIndex;
1066 goto directMode;
1067 } else {
1068 if(base64Counter==-1) {
1069 /* illegal: & immediately followed by something other than base64 or minus sign */
1070 /* include the ampersand in the reported sequence */
1071 --sourceIndex;
1072 bytes[0]=AMPERSAND;
1073 bytes[1]=b;
1074 byteIndex=2;
1075 }
1076 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1077 /* base64Value==-3 for illegal characters */
1078 /* illegal */
1079 inDirectMode=TRUE;
1080 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1081 break;
1082 }
1083 } else {
1084 /* target is full */
1085 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1086 break;
1087 }
1088 }
1089 }
1090 endloop:
1091
1092 /*
1093 * the end of the input stream and detection of truncated input
1094 * are handled by the framework, but here we must check if we are in Unicode
1095 * mode and byteIndex==0 because we must end in direct mode
1096 *
1097 * conditions:
1098 * successful
1099 * in Unicode mode and byteIndex==0
1100 * end of input and no truncated input
1101 */
1102 if( U_SUCCESS(*pErrorCode) &&
1103 !inDirectMode && byteIndex==0 &&
1104 pArgs->flush && source>=sourceLimit
1105 ) {
1106 if(base64Counter==-1) {
1107 /* & at the very end of the input */
1108 /* make the ampersand the reported sequence */
1109 bytes[0]=AMPERSAND;
1110 byteIndex=1;
1111 }
1112 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1113
1114 inDirectMode=TRUE; /* avoid looping */
1115 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1116 }
1117
1118 /* set the converter state back into UConverter */
1119 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1120 cnv->toULength=byteIndex;
1121
1122 /* write back the updated pointers */
1123 pArgs->source=(const char *)source;
1124 pArgs->target=target;
1125 pArgs->offsets=offsets;
1126 return;
1127 }
1128
1129 static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1130 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1131 UErrorCode *pErrorCode) {
1132 UConverter *cnv;
1133 const UChar *source, *sourceLimit;
1134 uint8_t *target, *targetLimit;
1135 int32_t *offsets;
1136
1137 int32_t length, targetCapacity, sourceIndex;
1138 UChar c;
1139 uint8_t b;
1140
1141 /* UTF-7 state */
1142 uint8_t bits;
1143 int8_t base64Counter;
1144 UBool inDirectMode;
1145
1146 /* set up the local pointers */
1147 cnv=pArgs->converter;
1148
1149 /* set up the local pointers */
1150 source=pArgs->source;
1151 sourceLimit=pArgs->sourceLimit;
1152 target=(uint8_t *)pArgs->target;
1153 targetLimit=(uint8_t *)pArgs->targetLimit;
1154 offsets=pArgs->offsets;
1155
1156 /* get the state machine state */
1157 {
1158 uint32_t status=cnv->fromUnicodeStatus;
1159 inDirectMode=(UBool)((status>>24)&1);
1160 base64Counter=(int8_t)(status>>16);
1161 bits=(uint8_t)status;
1162 }
1163
1164 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1165 sourceIndex=0;
1166
1167 if(inDirectMode) {
1168 directMode:
1169 length=(int32_t)(sourceLimit-source);
1170 targetCapacity=(int32_t)(targetLimit-target);
1171 if(length>targetCapacity) {
1172 length=targetCapacity;
1173 }
1174 while(length>0) {
1175 c=*source++;
1176 /* encode 0x20..0x7e except '&' directly */
1177 if(inSetDIMAP(c)) {
1178 /* encode directly */
1179 *target++=(uint8_t)c;
1180 if(offsets!=NULL) {
1181 *offsets++=sourceIndex++;
1182 }
1183 } else if(c==AMPERSAND) {
1184 /* output &- for & */
1185 *target++=AMPERSAND;
1186 if(target<targetLimit) {
1187 *target++=MINUS;
1188 if(offsets!=NULL) {
1189 *offsets++=sourceIndex;
1190 *offsets++=sourceIndex++;
1191 }
1192 /* realign length and targetCapacity */
1193 goto directMode;
1194 } else {
1195 if(offsets!=NULL) {
1196 *offsets++=sourceIndex++;
1197 }
1198 cnv->charErrorBuffer[0]=MINUS;
1199 cnv->charErrorBufferLength=1;
1200 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1201 break;
1202 }
1203 } else {
1204 /* un-read this character and switch to Unicode Mode */
1205 --source;
1206 *target++=AMPERSAND;
1207 if(offsets!=NULL) {
1208 *offsets++=sourceIndex;
1209 }
1210 inDirectMode=FALSE;
1211 base64Counter=0;
1212 goto unicodeMode;
1213 }
1214 --length;
1215 }
1216 if(source<sourceLimit && target>=targetLimit) {
1217 /* target is full */
1218 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1219 }
1220 } else {
1221 unicodeMode:
1222 while(source<sourceLimit) {
1223 if(target<targetLimit) {
1224 c=*source++;
1225 if(isLegalIMAP(c)) {
1226 /* encode directly */
1227 inDirectMode=TRUE;
1228
1229 /* trick: back out this character to make this easier */
1230 --source;
1231
1232 /* terminate the base64 sequence */
1233 if(base64Counter!=0) {
1234 /* write remaining bits for the previous character */
1235 *target++=TO_BASE64_IMAP(bits);
1236 if(offsets!=NULL) {
1237 *offsets++=sourceIndex-1;
1238 }
1239 }
1240 /* need to terminate with a minus */
1241 if(target<targetLimit) {
1242 *target++=MINUS;
1243 if(offsets!=NULL) {
1244 *offsets++=sourceIndex-1;
1245 }
1246 } else {
1247 cnv->charErrorBuffer[0]=MINUS;
1248 cnv->charErrorBufferLength=1;
1249 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1250 break;
1251 }
1252 goto directMode;
1253 } else {
1254 /*
1255 * base64 this character:
1256 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1257 * and the bits of this character, each implicitly in UTF-16BE.
1258 *
1259 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1260 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1261 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1262 */
1263 switch(base64Counter) {
1264 case 0:
1265 b=(uint8_t)(c>>10);
1266 *target++=TO_BASE64_IMAP(b);
1267 if(target<targetLimit) {
1268 b=(uint8_t)((c>>4)&0x3f);
1269 *target++=TO_BASE64_IMAP(b);
1270 if(offsets!=NULL) {
1271 *offsets++=sourceIndex;
1272 *offsets++=sourceIndex++;
1273 }
1274 } else {
1275 if(offsets!=NULL) {
1276 *offsets++=sourceIndex++;
1277 }
1278 b=(uint8_t)((c>>4)&0x3f);
1279 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1280 cnv->charErrorBufferLength=1;
1281 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1282 }
1283 bits=(uint8_t)((c&15)<<2);
1284 base64Counter=1;
1285 break;
1286 case 1:
1287 b=(uint8_t)(bits|(c>>14));
1288 *target++=TO_BASE64_IMAP(b);
1289 if(target<targetLimit) {
1290 b=(uint8_t)((c>>8)&0x3f);
1291 *target++=TO_BASE64_IMAP(b);
1292 if(target<targetLimit) {
1293 b=(uint8_t)((c>>2)&0x3f);
1294 *target++=TO_BASE64_IMAP(b);
1295 if(offsets!=NULL) {
1296 *offsets++=sourceIndex;
1297 *offsets++=sourceIndex;
1298 *offsets++=sourceIndex++;
1299 }
1300 } else {
1301 if(offsets!=NULL) {
1302 *offsets++=sourceIndex;
1303 *offsets++=sourceIndex++;
1304 }
1305 b=(uint8_t)((c>>2)&0x3f);
1306 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1307 cnv->charErrorBufferLength=1;
1308 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1309 }
1310 } else {
1311 if(offsets!=NULL) {
1312 *offsets++=sourceIndex++;
1313 }
1314 b=(uint8_t)((c>>8)&0x3f);
1315 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1316 b=(uint8_t)((c>>2)&0x3f);
1317 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1318 cnv->charErrorBufferLength=2;
1319 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1320 }
1321 bits=(uint8_t)((c&3)<<4);
1322 base64Counter=2;
1323 break;
1324 case 2:
1325 b=(uint8_t)(bits|(c>>12));
1326 *target++=TO_BASE64_IMAP(b);
1327 if(target<targetLimit) {
1328 b=(uint8_t)((c>>6)&0x3f);
1329 *target++=TO_BASE64_IMAP(b);
1330 if(target<targetLimit) {
1331 b=(uint8_t)(c&0x3f);
1332 *target++=TO_BASE64_IMAP(b);
1333 if(offsets!=NULL) {
1334 *offsets++=sourceIndex;
1335 *offsets++=sourceIndex;
1336 *offsets++=sourceIndex++;
1337 }
1338 } else {
1339 if(offsets!=NULL) {
1340 *offsets++=sourceIndex;
1341 *offsets++=sourceIndex++;
1342 }
1343 b=(uint8_t)(c&0x3f);
1344 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1345 cnv->charErrorBufferLength=1;
1346 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1347 }
1348 } else {
1349 if(offsets!=NULL) {
1350 *offsets++=sourceIndex++;
1351 }
1352 b=(uint8_t)((c>>6)&0x3f);
1353 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1354 b=(uint8_t)(c&0x3f);
1355 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1356 cnv->charErrorBufferLength=2;
1357 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1358 }
1359 bits=0;
1360 base64Counter=0;
1361 break;
1362 default:
1363 /* will never occur */
1364 break;
1365 }
1366 }
1367 } else {
1368 /* target is full */
1369 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1370 break;
1371 }
1372 }
1373 }
1374
1375 if(pArgs->flush && source>=sourceLimit) {
1376 /* flush remaining bits to the target */
1377 if(!inDirectMode) {
1378 if(base64Counter!=0) {
1379 if(target<targetLimit) {
1380 *target++=TO_BASE64_IMAP(bits);
1381 if(offsets!=NULL) {
1382 *offsets++=sourceIndex-1;
1383 }
1384 } else {
1385 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1386 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1387 }
1388 }
1389 /* need to terminate with a minus */
1390 if(target<targetLimit) {
1391 *target++=MINUS;
1392 if(offsets!=NULL) {
1393 *offsets++=sourceIndex-1;
1394 }
1395 } else {
1396 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1397 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1398 }
1399 }
1400 /* reset the state for the next conversion */
1401 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1402 } else {
1403 /* set the converter state back into UConverter */
1404 cnv->fromUnicodeStatus=
1405 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1406 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1407 }
1408
1409 /* write back the updated pointers */
1410 pArgs->source=source;
1411 pArgs->target=(char *)target;
1412 pArgs->offsets=offsets;
1413 return;
1414 }
1415
1416 static const UConverterImpl _IMAPImpl={
1417 UCNV_IMAP_MAILBOX,
1418
1419 NULL,
1420 NULL,
1421
1422 _UTF7Open,
1423 NULL,
1424 _UTF7Reset,
1425
1426 _IMAPToUnicodeWithOffsets,
1427 _IMAPToUnicodeWithOffsets,
1428 _IMAPFromUnicodeWithOffsets,
1429 _IMAPFromUnicodeWithOffsets,
1430 NULL,
1431
1432 NULL,
1433 NULL,
1434 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1435 NULL,
1436 ucnv_getCompleteUnicodeSet
1437 };
1438
1439 static const UConverterStaticData _IMAPStaticData={
1440 sizeof(UConverterStaticData),
1441 "IMAP-mailbox-name",
1442 0, /* TODO CCSID for IMAP-mailbox-name */
1443 UCNV_IBM, UCNV_IMAP_MAILBOX,
1444 1, 4,
1445 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1446 FALSE, FALSE,
1447 0,
1448 0,
1449 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1450 };
1451
1452 const UConverterSharedData _IMAPData={
1453 sizeof(UConverterSharedData), ~((uint32_t)0),
1454 NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1455 0
1456 };
1457
1458 #endif
1459