1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u7.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24
25 /* UTF-7 -------------------------------------------------------------------- */
26
27 /*
28 * UTF-7 is a stateful encoding of Unicode.
29 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
30 * It was intended for use in Internet email systems, using in its bytewise
31 * encoding only a subset of 7-bit US-ASCII.
32 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
33 * occasionally used.
34 *
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters in set O
37 * as defined in the RFC (see below) may be encoded directly but are not
38 * allowed in, e.g., email headers.
39 * By default, the ICU UTF-7 converter encodes set O directly.
40 * By choosing the option "version=1", set O will be escaped instead.
41 * For example:
42 * utf7Converter=ucnv_open("UTF-7,version=1");
43 *
44 * For details about email headers see RFC 2047.
45 */
46
47 /*
48 * Tests for US-ASCII characters belonging to character classes
49 * defined in UTF-7.
50 *
51 * Set D (directly encoded characters) consists of the following
52 * characters: the upper and lower case letters A through Z
53 * and a through z, the 10 digits 0-9, and the following nine special
54 * characters (note that "+" and "=" are omitted):
55 * '(),-./:?
56 *
57 * Set O (optional direct characters) consists of the following
58 * characters (note that "\" and "~" are omitted):
59 * !"#$%&*;<=>@[]^_`{|}
60 *
61 * According to the rules in RFC 2152, the byte values for the following
62 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
63 * - all C0 control codes except for CR LF TAB
64 * - BACKSLASH
65 * - TILDE
66 * - DEL
67 * - all codes beyond US-ASCII, i.e. all >127
68 */
/*
 * Character-class tests for single US-ASCII byte values.
 * Each range test subtracts the first member of the range and compares the
 * difference, truncated to uint8_t, against the range length; values below
 * the range wrap around to large unsigned numbers and fail the test.
 * All macros evaluate their argument more than once.
 */

/* set D: letters, digits and the nine specials '(),-./:? */
#define inSetD(c) \
    ((uint8_t)((c)-0x41)<26 ||     /* A..Z */ \
     (uint8_t)((c)-0x61)<26 ||     /* a..z */ \
     (uint8_t)((c)-0x30)<10 ||     /* 0..9 */ \
     (uint8_t)((c)-0x27)<3 ||      /* '() */ \
     (uint8_t)((c)-0x2c)<4 ||      /* ,-./ */ \
     (c)==0x3a || (c)==0x3f        /* :? */ \
    )

/* set O: !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    ((uint8_t)((c)-0x21)<6 ||      /* !"#$%& */ \
     (c)==0x2a ||                  /* * */ \
     (uint8_t)((c)-0x3b)<4 ||      /* ;<=> */ \
     (c)==0x40 || (c)==0x5b ||     /* @[ */ \
     (uint8_t)((c)-0x5d)<4 ||      /* ]^_` */ \
     (uint8_t)((c)-0x7b)<3         /* {|} */ \
    )

/* TAB, LF or CR */
#define isCRLFTAB(c) ((c)==9 || (c)==10 || (c)==13)
/* space, TAB, LF or CR */
#define isCRLFSPTAB(c) ((c)==32 || isCRLFTAB(c))

#define PLUS 43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/* legal byte values: CR LF TAB, and space up to but excluding tilde, minus the backslash */
#define isLegalUTF7(c) (isCRLFTAB(c) || ((c)!=BACKSLASH && (uint8_t)((c)-32)<94))
95
96 /* encode directly sets D and O and CR LF SP TAB */
97 static const UBool encodeDirectlyMaximum[128]={
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101
102 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
107
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
110 };
111
112 /* encode directly set D and CR LF SP TAB but not set O */
113 static const UBool encodeDirectlyRestricted[128]={
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117
118 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
120
121 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
123
124 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
126 };
127
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit.
 * Numeric codes are used rather than character literals so the table
 * does not depend on the compiler's execution character set.
 */
static const uint8_t
toBase64[64]={
    /*  0.. 7  A-H */ 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
    /*  8..15  I-P */ 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
    /* 16..23  Q-X */ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
    /* 24..31  YZ a-f */ 0x59, 0x5a, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
    /* 32..39  g-n */ 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,
    /* 40..47  o-v */ 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,
    /* 48..55  w-z 0-3 */ 0x77, 0x78, 0x79, 0x7a, 0x30, 0x31, 0x32, 0x33,
    /* 56..63  4-9 + / */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2b, 0x2f
};
141
142 static const int8_t
143 fromBase64[128]={
144 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
145 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
146 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
147
148 /* general punctuation with + and / and a special value (-2) for - */
149 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
150 /* digits */
151 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
152
153 /* A-Z */
154 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
155 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
156
157 /* a-z */
158 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
159 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
160 };
161
162 /*
163 * converter status values:
164 *
165 * toUnicodeStatus:
166 * 24 inDirectMode (boolean)
167 * 23..16 base64Counter (-1..7)
168 * 15..0 bits (up to 14 bits incoming base64)
169 *
170 * fromUnicodeStatus:
171 * 31..28 version (0: set O direct 1: set O escaped)
172 * 24 inDirectMode (boolean)
173 * 23..16 base64Counter (0..2)
174 * 7..0 bits (6 bits outgoing base64)
175 *
176 */
177
178 static void
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
180 if(choice<=UCNV_RESET_TO_UNICODE) {
181 /* reset toUnicode */
182 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
183 cnv->toULength=0;
184 }
185 if(choice!=UCNV_RESET_TO_UNICODE) {
186 /* reset fromUnicode */
187 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
188 }
189 }
190
191 static void
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)192 _UTF7Open(UConverter *cnv,
193 UConverterLoadArgs *pArgs,
194 UErrorCode *pErrorCode) {
195 if(UCNV_GET_VERSION(cnv)<=1) {
196 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
197 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
198 _UTF7Reset(cnv, UCNV_RESET_BOTH);
199 } else {
200 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
201 }
202 }
203
/*
 * Converts UTF-7 bytes from pArgs->source into UTF-16 code units in
 * pArgs->target, optionally recording for each output unit the source index
 * it came from in pArgs->offsets.
 *
 * The decoder is a two-state machine (direct mode / Unicode mode) whose state
 * is loaded from cnv->toUnicodeStatus, cnv->toUBytes and cnv->toULength on
 * entry and stored back before returning, so that a stream can be fed in
 * several pieces.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
 * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND for an illegal byte or an
 * incomplete base64 sequence (the offending bytes are left in cnv->toUBytes),
 * or to U_BUFFER_OVERFLOW_ERROR when input remains but the target is full.
 */
static void
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                          UErrorCode *pErrorCode) {
    UConverter *cnv;
    const uint8_t *source, *sourceLimit;
    UChar *target;
    const UChar *targetLimit;
    int32_t *offsets;

    uint8_t *bytes;
    uint8_t byteIndex;

    int32_t length, targetCapacity;

    /* UTF-7 state */
    uint16_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    int8_t base64Value;

    int32_t sourceIndex, nextSourceIndex;

    uint8_t b;
    /* set up the local pointers */
    cnv=pArgs->converter;

    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    target=pArgs->target;
    targetLimit=pArgs->targetLimit;
    offsets=pArgs->offsets;
    /* get the state machine state (see the status layout comment above) */
    {
        uint32_t status=cnv->toUnicodeStatus;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint16_t)status;
    }
    /* bytes[0..byteIndex-1] holds the input bytes of the not-yet-finished sequence */
    bytes=cnv->toUBytes;
    byteIndex=cnv->toULength;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex=byteIndex==0 ? 0 : -1;
    nextSourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
         * with their US-ASCII byte values.
         * Backslash and Tilde and most control characters are not allowed in UTF-7.
         * A plus sign starts Unicode (or "escape") Mode.
         *
         * In Direct Mode, only the sourceIndex is used.
         */
        byteIndex=0;
        /*
         * Each direct byte produces exactly one UChar, so the loop may run for
         * the smaller of the remaining input and the remaining output capacity
         * without checking the limits on every iteration.
         */
        length=(int32_t)(sourceLimit-source);
        targetCapacity=(int32_t)(targetLimit-target);
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            b=*source++;
            if(!isLegalUTF7(b)) {
                /* illegal */
                bytes[0]=b;
                byteIndex=1;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                break;
            } else if(b!=PLUS) {
                /* write directly encoded character */
                *target++=b;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else /* PLUS */ {
                /* switch to Unicode mode */
                nextSourceIndex=++sourceIndex;
                inDirectMode=FALSE;
                byteIndex=0;
                bits=0;
                base64Counter=-1;
                goto unicodeMode;
            }
            --length;
        }
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        /*
         * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
         * The base64 sequence ends with any character that is not in the base64 alphabet.
         * A terminating minus sign is consumed.
         *
         * In Unicode Mode, the sourceIndex has the index to the start of the current
         * base64 bytes, while nextSourceIndex is precisely parallel to source,
         * keeping the index to the following byte.
         * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
         */
        while(source<sourceLimit) {
            if(target<targetLimit) {
                bytes[byteIndex++]=b=*source++;
                ++nextSourceIndex;
                base64Value = -3; /* initialize as illegal */
                /* b>=126 short-circuits before the table lookup and keeps base64Value at -3 */
                if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
                    /* either
                     * base64Value==-1 for any legal character except base64 and minus sign, or
                     * base64Value==-3 for illegal characters:
                     * 1. In either case, leave Unicode mode.
                     * 2.1. If we ended with an incomplete UChar or none after the +, then
                     * generate an error for the preceding erroneous sequence and deal with
                     * the current (possibly illegal) character next time through.
                     * 2.2. Else the current char comes after a complete UChar, which was already
                     * pushed to the output buf, so:
                     * 2.2.1. If the current char is legal, just save it for processing next time.
                     * It may be for example, a plus which we need to deal with in direct mode.
                     * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
                     */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* illegal: + immediately followed by something other than base64 or minus sign */
                        /* include the plus sign in the reported sequence, but not the subsequent char */
                        --source;
                        bytes[0]=PLUS;
                        byteIndex=1;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else if(bits!=0) {
                        /* bits are illegally left over, a UChar is incomplete */
                        /* don't include current char (legal or illegal) in error seq */
                        --source;
                        --byteIndex;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        break;
                    } else {
                        /* previous UChar was complete */
                        if(base64Value==-3) {
                            /* current character is illegal, deal with it here */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        } else {
                            /* un-read the current character in case it is a plus sign */
                            --source;
                            sourceIndex=nextSourceIndex-1;
                            goto directMode;
                        }
                    }
                } else if(base64Value>=0) {
                    /*
                     * collect base64 bytes into UChars
                     *
                     * base64Counter is the position within a group of 8 base64
                     * digits (48 bits, i.e. 3 UChars). A UChar is completed by
                     * the digits at positions 2, 5 and 7; the digits at
                     * positions 2 and 5 also contribute their low 2 resp. 4
                     * bits to the following UChar.
                     */
                    switch(base64Counter) {
                    case -1: /* -1 is immediately after the + */
                    case 0:
                        bits=base64Value;
                        base64Counter=1;
                        break;
                    case 1:
                    case 3:
                    case 4:
                    case 6:
                        bits=(uint16_t)((bits<<6)|base64Value);
                        ++base64Counter;
                        break;
                    case 2:
                        /* 12 collected bits + top 4 bits of this digit */
                        *target++=(UChar)((bits<<4)|(base64Value>>2));
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&3);
                        base64Counter=3;
                        break;
                    case 5:
                        /* 14 collected bits + top 2 bits of this digit */
                        *target++=(UChar)((bits<<2)|(base64Value>>4));
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex-1;
                        }
                        bytes[0]=b; /* keep this byte in case an error occurs */
                        byteIndex=1;
                        bits=(uint16_t)(base64Value&15);
                        base64Counter=6;
                        break;
                    case 7:
                        /* 10 collected bits + all 6 bits of this digit; the group is complete */
                        *target++=(UChar)((bits<<6)|base64Value);
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex;
                            sourceIndex=nextSourceIndex;
                        }
                        byteIndex=0;
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                } else /*base64Value==-2*/ {
                    /* minus sign terminates the base64 sequence */
                    inDirectMode=TRUE;
                    if(base64Counter==-1) {
                        /* +- i.e. a minus immediately following a plus */
                        *target++=PLUS;
                        if(offsets!=NULL) {
                            /* sourceIndex was advanced past the plus sign when Unicode mode was entered */
                            *offsets++=sourceIndex-1;
                        }
                    } else {
                        /* absorb the minus and leave the Unicode Mode */
                        if(bits!=0) {
                            /* bits are illegally left over, a UChar is incomplete */
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                            break;
                        }
                    }
                    sourceIndex=nextSourceIndex;
                    goto directMode;
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }

    if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
        /*
         * if we are in Unicode mode, then the byteIndex might not be 0,
         * but that is ok if bits==0
         * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
         * (not true for IMAP-mailbox-name where we must end in direct mode)
         */
        byteIndex=0;
    }

    /* set the converter state back into UConverter */
    /* base64Counter goes through uint8_t so that -1 does not sign-extend into bit 24 and above */
    cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
    cnv->toULength=byteIndex;

    /* write back the updated pointers */
    pArgs->source=(const char *)source;
    pArgs->target=target;
    pArgs->offsets=offsets;
    return;
}
454
/*
 * Converts UTF-16 code units from pArgs->source into UTF-7 bytes in
 * pArgs->target, optionally recording for each output byte the index of the
 * source code unit it came from in pArgs->offsets.
 *
 * The encoder is a two-state machine (direct mode / Unicode mode). Its state
 * is loaded from cnv->fromUnicodeStatus on entry; the version in the top
 * bits selects which encodeDirectly table is used. Unless the stream is being
 * flushed at its end, the state is stored back before returning.
 *
 * Output bytes that belong to an already consumed code unit but do not fit
 * into the target are placed in cnv->charErrorBuffer and *pErrorCode is set
 * to U_BUFFER_OVERFLOW_ERROR. The same error is set when input remains and
 * the target is full.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
 */
static void
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                            UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target, *targetLimit;
    int32_t *offsets;

    int32_t length, targetCapacity, sourceIndex;
    UChar c;

    /* UTF-7 state */
    const UBool *encodeDirectly;
    uint8_t bits;
    int8_t base64Counter;
    UBool inDirectMode;

    /* get the converter */
    cnv=pArgs->converter;

    /* set up the local pointers */
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetLimit=(uint8_t *)pArgs->targetLimit;
    offsets=pArgs->offsets;

    /* get the state machine state */
    {
        uint32_t status=cnv->fromUnicodeStatus;
        /* a zero version nibble (bits 31..28) selects the table that encodes set O directly */
        encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
        inDirectMode=(UBool)((status>>24)&1);
        base64Counter=(int8_t)(status>>16);
        bits=(uint8_t)status;
    }

    /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
    sourceIndex=0;

    if(inDirectMode) {
directMode:
        /*
         * Bound the loop by the smaller of the remaining input and the
         * remaining output capacity; every iteration writes at least one byte.
         */
        length=(int32_t)(sourceLimit-source);
        targetCapacity=(int32_t)(targetLimit-target);
        if(length>targetCapacity) {
            length=targetCapacity;
        }
        while(length>0) {
            c=*source++;
            /* currently always encode CR LF SP TAB directly */
            if(c<=127 && encodeDirectly[c]) {
                /* encode directly */
                *target++=(uint8_t)c;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex++;
                }
            } else if(c==PLUS) {
                /* output +- for + */
                *target++=PLUS;
                if(target<targetLimit) {
                    *target++=MINUS;
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex;
                        *offsets++=sourceIndex++;
                    }
                    /* realign length and targetCapacity */
                    goto directMode;
                } else {
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex++;
                    }
                    /* no room for the minus sign: hand it over via the overflow buffer */
                    cnv->charErrorBuffer[0]=MINUS;
                    cnv->charErrorBufferLength=1;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            } else {
                /* un-read this character and switch to Unicode Mode */
                --source;
                *target++=PLUS;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex;
                }
                inDirectMode=FALSE;
                base64Counter=0;
                goto unicodeMode;
            }
            --length;
        }
        if(source<sourceLimit && target>=targetLimit) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }
    } else {
unicodeMode:
        while(source<sourceLimit) {
            if(target<targetLimit) {
                c=*source++;
                if(c<=127 && encodeDirectly[c]) {
                    /* encode directly */
                    inDirectMode=TRUE;

                    /* trick: back out this character to make this easier */
                    --source;

                    /* terminate the base64 sequence */
                    if(base64Counter!=0) {
                        /* write remaining bits for the previous character */
                        *target++=toBase64[bits];
                        if(offsets!=NULL) {
                            *offsets++=sourceIndex-1;
                        }
                    }
                    /*
                     * fromBase64[c]!=-1 here means that c is a base64 digit or
                     * the minus sign, either of which would otherwise be read
                     * as part of the base64 sequence
                     */
                    if(fromBase64[c]!=-1) {
                        /* need to terminate with a minus */
                        if(target<targetLimit) {
                            *target++=MINUS;
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex-1;
                            }
                        } else {
                            cnv->charErrorBuffer[0]=MINUS;
                            cnv->charErrorBufferLength=1;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                            break;
                        }
                    }
                    goto directMode;
                } else {
                    /*
                     * base64 this character:
                     * Output 2 or 3 base64 bytes for the remaining bits of the previous character
                     * and the bits of this character, each implicitly in UTF-16BE.
                     *
                     * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
                     * character to the next. The actual 2 or 4 bits are shifted to the left edge
                     * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
                     *
                     * base64Counter selects the alignment:
                     * 0: nothing pending; emit 2 digits, keep the low 4 bits of c
                     * 1: 4 bits pending; emit 3 digits, keep the low 2 bits of c
                     * 2: 2 bits pending; emit 3 digits, nothing is kept
                     * The first digit always fits because target<targetLimit was checked above;
                     * digits that do not fit go to cnv->charErrorBuffer.
                     */
                    switch(base64Counter) {
                    case 0:
                        *target++=toBase64[c>>10];
                        if(target<targetLimit) {
                            *target++=toBase64[(c>>4)&0x3f];
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex;
                                *offsets++=sourceIndex++;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
                            cnv->charErrorBufferLength=1;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=(uint8_t)((c&15)<<2);
                        base64Counter=1;
                        break;
                    case 1:
                        *target++=toBase64[bits|(c>>14)];
                        if(target<targetLimit) {
                            *target++=toBase64[(c>>8)&0x3f];
                            if(target<targetLimit) {
                                *target++=toBase64[(c>>2)&0x3f];
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                            } else {
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                                cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
                                cnv->charErrorBufferLength=1;
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
                            cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
                            cnv->charErrorBufferLength=2;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=(uint8_t)((c&3)<<4);
                        base64Counter=2;
                        break;
                    case 2:
                        *target++=toBase64[bits|(c>>12)];
                        if(target<targetLimit) {
                            *target++=toBase64[(c>>6)&0x3f];
                            if(target<targetLimit) {
                                *target++=toBase64[c&0x3f];
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                            } else {
                                if(offsets!=NULL) {
                                    *offsets++=sourceIndex;
                                    *offsets++=sourceIndex++;
                                }
                                cnv->charErrorBuffer[0]=toBase64[c&0x3f];
                                cnv->charErrorBufferLength=1;
                                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                            }
                        } else {
                            if(offsets!=NULL) {
                                *offsets++=sourceIndex++;
                            }
                            cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
                            cnv->charErrorBuffer[1]=toBase64[c&0x3f];
                            cnv->charErrorBufferLength=2;
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                        }
                        bits=0;
                        base64Counter=0;
                        break;
                    default:
                        /* will never occur */
                        break;
                    }
                }
            } else {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
    }

    if(pArgs->flush && source>=sourceLimit) {
        /* flush remaining bits to the target */
        if(!inDirectMode) {
            if (base64Counter!=0) {
                if(target<targetLimit) {
                    *target++=toBase64[bits];
                    if(offsets!=NULL) {
                        *offsets++=sourceIndex-1;
                    }
                } else {
                    /* append: the buffer may already hold digits from an overflow above */
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                }
            }
            /* Add final MINUS to terminate unicodeMode */
            if(target<targetLimit) {
                *target++=MINUS;
                if(offsets!=NULL) {
                    *offsets++=sourceIndex-1;
                }
            } else {
                cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
        }
        /* reset the state for the next conversion */
        cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
    } else {
        /* set the converter state back into UConverter */
        cnv->fromUnicodeStatus=
            (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
            ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
    }

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
    return;
}
729
730 static const char *
_UTF7GetName(const UConverter * cnv)731 _UTF7GetName(const UConverter *cnv) {
732 switch(cnv->fromUnicodeStatus>>28) {
733 case 1:
734 return "UTF-7,version=1";
735 default:
736 return "UTF-7";
737 }
738 }
739
/*
 * Implementation table for the UTF-7 converter.
 * The initializer is positional; the meaning of each slot is defined by
 * UConverterImpl, which is declared outside this file (presumably in
 * ucnv_cnv.h -- confirm the slot order there before editing).
 * Each WithOffsets function is listed twice in a row, so the same function
 * fills two adjacent slots; NULL leaves a slot without a UTF-7-specific
 * function.
 */
static const UConverterImpl _UTF7Impl={
    UCNV_UTF7,

    NULL,
    NULL,

    _UTF7Open,
    NULL,
    _UTF7Reset,

    _UTF7ToUnicodeWithOffsets,
    _UTF7ToUnicodeWithOffsets,
    _UTF7FromUnicodeWithOffsets,
    _UTF7FromUnicodeWithOffsets,
    NULL,

    NULL,
    _UTF7GetName,
    NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
    NULL,
    ucnv_getCompleteUnicodeSet
};
762
/*
 * Static description of the UTF-7 converter (name "UTF-7", type UCNV_UTF7).
 * The initializer is positional; field meanings come from
 * UConverterStaticData, declared outside this file.
 * NOTE(review): "1, 4" looks like the minimum/maximum bytes per character --
 * confirm against the struct declaration.
 */
static const UConverterStaticData _UTF7StaticData={
    sizeof(UConverterStaticData),
    "UTF-7",
    0, /* TODO CCSID for UTF-7 */
    UCNV_IBM, UCNV_UTF7,
    1, 4,
    { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
775
/*
 * Shared data object for the UTF-7 converter; the only symbol in this
 * section with external linkage. It ties together _UTF7StaticData and
 * _UTF7Impl. The remaining positional fields are defined by
 * UConverterSharedData, declared outside this file.
 */
const UConverterSharedData _UTF7Data={
    sizeof(UConverterSharedData), ~((uint32_t)0),
    NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
    0
};
781
782 /* IMAP mailbox name encoding ----------------------------------------------- */
783
784 /*
785 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
786 * http://www.ietf.org/rfc/rfc2060.txt
787 *
788 * 5.1.3. Mailbox International Naming Convention
789 *
790 * By convention, international mailbox names are specified using a
791 * modified version of the UTF-7 encoding described in [UTF-7]. The
792 * purpose of these modifications is to correct the following problems
793 * with UTF-7:
794 *
795 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
796 * the common use of "+" in mailbox names, in particular USENET
797 * newsgroup names.
798 *
799 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
800 * conflicts with the use of "/" as a popular hierarchy delimiter.
801 *
802 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
803 * the use of "\" as a popular hierarchy delimiter.
804 *
805 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
806 * the use of "~" in some servers as a home directory indicator.
807 *
808 * 5) UTF-7 permits multiple alternate forms to represent the same
 * string; in particular, printable US-ASCII characters can be
810 * represented in encoded form.
811 *
812 * In modified UTF-7, printable US-ASCII characters except for "&"
813 * represent themselves; that is, characters with octet values 0x20-0x25
814 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
815 * octet sequence "&-".
816 *
817 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
818 * Unicode 16-bit octets) are represented in modified BASE64, with a
819 * further modification from [UTF-7] that "," is used instead of "/".
820 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
821 * character which can represent itself.
822 *
823 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
824 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
825 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
826 * ").
827 *
828 * For example, here is a mailbox name which mixes English, Japanese,
829 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
830 */
831
832 /*
833 * Tests for US-ASCII characters belonging to character classes
834 * defined in UTF-7.
835 *
836 * Set D (directly encoded characters) consists of the following
837 * characters: the upper and lower case letters A through Z
838 * and a through z, the 10 digits 0-9, and the following nine special
839 * characters (note that "+" and "=" are omitted):
840 * '(),-./:?
841 *
842 * Set O (optional direct characters) consists of the following
843 * characters (note that "\" and "~" are omitted):
844 * !"#$%&*;<=>@[]^_`{|}
845 *
846 * According to the rules in RFC 2152, the byte values for the following
847 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
848 * - all C0 control codes except for CR LF TAB
849 * - BACKSLASH
850 * - TILDE
851 * - DEL
852 * - all codes beyond US-ASCII, i.e. all >127
853 */
854
/*
 * Byte values and character-class tests for the IMAP mailbox name encoding.
 * All function-like macros here evaluate their argument more than once, so
 * do not pass expressions with side effects.
 */

/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
/* the comma replaces the slash as the base64 digit with value 63 */
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized so that expression arguments such as
 * x|y or a ternary are compared as a whole against AMPERSAND)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* 6-bit value to base64 digit: values 0..62 come from toBase64, 63 maps to the comma */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* byte to base64 class: comma is digit 63, slash is not a digit (-1), else as in fromBase64 */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
868
869 /*
870 * converter status values:
871 *
872 * toUnicodeStatus:
873 * 24 inDirectMode (boolean)
874 * 23..16 base64Counter (-1..7)
875 * 15..0 bits (up to 14 bits incoming base64)
876 *
877 * fromUnicodeStatus:
878 * 24 inDirectMode (boolean)
879 * 23..16 base64Counter (0..2)
880 * 7..0 bits (6 bits outgoing base64)
881 *
882 * ignore bits 31..25
883 */
884
885 static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)886 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
887 UErrorCode *pErrorCode) {
888 UConverter *cnv;
889 const uint8_t *source, *sourceLimit;
890 UChar *target;
891 const UChar *targetLimit;
892 int32_t *offsets;
893
894 uint8_t *bytes;
895 uint8_t byteIndex;
896
897 int32_t length, targetCapacity;
898
899 /* UTF-7 state */
900 uint16_t bits;
901 int8_t base64Counter;
902 UBool inDirectMode;
903
904 int8_t base64Value;
905
906 int32_t sourceIndex, nextSourceIndex;
907
908 UChar c;
909 uint8_t b;
910
911 /* set up the local pointers */
912 cnv=pArgs->converter;
913
914 source=(const uint8_t *)pArgs->source;
915 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
916 target=pArgs->target;
917 targetLimit=pArgs->targetLimit;
918 offsets=pArgs->offsets;
919 /* get the state machine state */
920 {
921 uint32_t status=cnv->toUnicodeStatus;
922 inDirectMode=(UBool)((status>>24)&1);
923 base64Counter=(int8_t)(status>>16);
924 bits=(uint16_t)status;
925 }
926 bytes=cnv->toUBytes;
927 byteIndex=cnv->toULength;
928
929 /* sourceIndex=-1 if the current character began in the previous buffer */
930 sourceIndex=byteIndex==0 ? 0 : -1;
931 nextSourceIndex=0;
932
933 if(inDirectMode) {
934 directMode:
935 /*
936 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
937 * with their US-ASCII byte values.
938 * An ampersand starts Unicode (or "escape") Mode.
939 *
940 * In Direct Mode, only the sourceIndex is used.
941 */
942 byteIndex=0;
943 length=(int32_t)(sourceLimit-source);
944 targetCapacity=(int32_t)(targetLimit-target);
945 if(length>targetCapacity) {
946 length=targetCapacity;
947 }
948 while(length>0) {
949 b=*source++;
950 if(!isLegalIMAP(b)) {
951 /* illegal */
952 bytes[0]=b;
953 byteIndex=1;
954 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
955 break;
956 } else if(b!=AMPERSAND) {
957 /* write directly encoded character */
958 *target++=b;
959 if(offsets!=NULL) {
960 *offsets++=sourceIndex++;
961 }
962 } else /* AMPERSAND */ {
963 /* switch to Unicode mode */
964 nextSourceIndex=++sourceIndex;
965 inDirectMode=FALSE;
966 byteIndex=0;
967 bits=0;
968 base64Counter=-1;
969 goto unicodeMode;
970 }
971 --length;
972 }
973 if(source<sourceLimit && target>=targetLimit) {
974 /* target is full */
975 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
976 }
977 } else {
978 unicodeMode:
979 /*
980 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
981 * The base64 sequence ends with any character that is not in the base64 alphabet.
982 * A terminating minus sign is consumed.
983 * US-ASCII must not be base64-ed.
984 *
985 * In Unicode Mode, the sourceIndex has the index to the start of the current
986 * base64 bytes, while nextSourceIndex is precisely parallel to source,
987 * keeping the index to the following byte.
988 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
989 */
990 while(source<sourceLimit) {
991 if(target<targetLimit) {
992 bytes[byteIndex++]=b=*source++;
993 ++nextSourceIndex;
994 if(b>0x7e) {
995 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
996 inDirectMode=TRUE;
997 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
998 break;
999 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1000 /* collect base64 bytes into UChars */
1001 switch(base64Counter) {
1002 case -1: /* -1 is immediately after the & */
1003 case 0:
1004 bits=base64Value;
1005 base64Counter=1;
1006 break;
1007 case 1:
1008 case 3:
1009 case 4:
1010 case 6:
1011 bits=(uint16_t)((bits<<6)|base64Value);
1012 ++base64Counter;
1013 break;
1014 case 2:
1015 c=(UChar)((bits<<4)|(base64Value>>2));
1016 if(isLegalIMAP(c)) {
1017 /* illegal */
1018 inDirectMode=TRUE;
1019 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1020 goto endloop;
1021 }
1022 *target++=c;
1023 if(offsets!=NULL) {
1024 *offsets++=sourceIndex;
1025 sourceIndex=nextSourceIndex-1;
1026 }
1027 bytes[0]=b; /* keep this byte in case an error occurs */
1028 byteIndex=1;
1029 bits=(uint16_t)(base64Value&3);
1030 base64Counter=3;
1031 break;
1032 case 5:
1033 c=(UChar)((bits<<2)|(base64Value>>4));
1034 if(isLegalIMAP(c)) {
1035 /* illegal */
1036 inDirectMode=TRUE;
1037 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1038 goto endloop;
1039 }
1040 *target++=c;
1041 if(offsets!=NULL) {
1042 *offsets++=sourceIndex;
1043 sourceIndex=nextSourceIndex-1;
1044 }
1045 bytes[0]=b; /* keep this byte in case an error occurs */
1046 byteIndex=1;
1047 bits=(uint16_t)(base64Value&15);
1048 base64Counter=6;
1049 break;
1050 case 7:
1051 c=(UChar)((bits<<6)|base64Value);
1052 if(isLegalIMAP(c)) {
1053 /* illegal */
1054 inDirectMode=TRUE;
1055 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1056 goto endloop;
1057 }
1058 *target++=c;
1059 if(offsets!=NULL) {
1060 *offsets++=sourceIndex;
1061 sourceIndex=nextSourceIndex;
1062 }
1063 byteIndex=0;
1064 bits=0;
1065 base64Counter=0;
1066 break;
1067 default:
1068 /* will never occur */
1069 break;
1070 }
1071 } else if(base64Value==-2) {
1072 /* minus sign terminates the base64 sequence */
1073 inDirectMode=TRUE;
1074 if(base64Counter==-1) {
1075 /* &- i.e. a minus immediately following an ampersand */
1076 *target++=AMPERSAND;
1077 if(offsets!=NULL) {
1078 *offsets++=sourceIndex-1;
1079 }
1080 } else {
1081 /* absorb the minus and leave the Unicode Mode */
1082 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1083 /* bits are illegally left over, a UChar is incomplete */
1084 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1085 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1086 break;
1087 }
1088 }
1089 sourceIndex=nextSourceIndex;
1090 goto directMode;
1091 } else {
1092 if(base64Counter==-1) {
1093 /* illegal: & immediately followed by something other than base64 or minus sign */
1094 /* include the ampersand in the reported sequence */
1095 --sourceIndex;
1096 bytes[0]=AMPERSAND;
1097 bytes[1]=b;
1098 byteIndex=2;
1099 }
1100 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1101 /* base64Value==-3 for illegal characters */
1102 /* illegal */
1103 inDirectMode=TRUE;
1104 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1105 break;
1106 }
1107 } else {
1108 /* target is full */
1109 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1110 break;
1111 }
1112 }
1113 }
1114 endloop:
1115
1116 /*
1117 * the end of the input stream and detection of truncated input
1118 * are handled by the framework, but here we must check if we are in Unicode
1119 * mode and byteIndex==0 because we must end in direct mode
1120 *
1121 * conditions:
1122 * successful
1123 * in Unicode mode and byteIndex==0
1124 * end of input and no truncated input
1125 */
1126 if( U_SUCCESS(*pErrorCode) &&
1127 !inDirectMode && byteIndex==0 &&
1128 pArgs->flush && source>=sourceLimit
1129 ) {
1130 if(base64Counter==-1) {
1131 /* & at the very end of the input */
1132 /* make the ampersand the reported sequence */
1133 bytes[0]=AMPERSAND;
1134 byteIndex=1;
1135 }
1136 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1137
1138 inDirectMode=TRUE; /* avoid looping */
1139 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1140 }
1141
1142 /* set the converter state back into UConverter */
1143 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1144 cnv->toULength=byteIndex;
1145
1146 /* write back the updated pointers */
1147 pArgs->source=(const char *)source;
1148 pArgs->target=target;
1149 pArgs->offsets=offsets;
1150 return;
1151 }
1152
1153 static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1154 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1155 UErrorCode *pErrorCode) {
1156 UConverter *cnv;
1157 const UChar *source, *sourceLimit;
1158 uint8_t *target, *targetLimit;
1159 int32_t *offsets;
1160
1161 int32_t length, targetCapacity, sourceIndex;
1162 UChar c;
1163 uint8_t b;
1164
1165 /* UTF-7 state */
1166 uint8_t bits;
1167 int8_t base64Counter;
1168 UBool inDirectMode;
1169
1170 /* set up the local pointers */
1171 cnv=pArgs->converter;
1172
1173 /* set up the local pointers */
1174 source=pArgs->source;
1175 sourceLimit=pArgs->sourceLimit;
1176 target=(uint8_t *)pArgs->target;
1177 targetLimit=(uint8_t *)pArgs->targetLimit;
1178 offsets=pArgs->offsets;
1179
1180 /* get the state machine state */
1181 {
1182 uint32_t status=cnv->fromUnicodeStatus;
1183 inDirectMode=(UBool)((status>>24)&1);
1184 base64Counter=(int8_t)(status>>16);
1185 bits=(uint8_t)status;
1186 }
1187
1188 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1189 sourceIndex=0;
1190
1191 if(inDirectMode) {
1192 directMode:
1193 length=(int32_t)(sourceLimit-source);
1194 targetCapacity=(int32_t)(targetLimit-target);
1195 if(length>targetCapacity) {
1196 length=targetCapacity;
1197 }
1198 while(length>0) {
1199 c=*source++;
1200 /* encode 0x20..0x7e except '&' directly */
1201 if(inSetDIMAP(c)) {
1202 /* encode directly */
1203 *target++=(uint8_t)c;
1204 if(offsets!=NULL) {
1205 *offsets++=sourceIndex++;
1206 }
1207 } else if(c==AMPERSAND) {
1208 /* output &- for & */
1209 *target++=AMPERSAND;
1210 if(target<targetLimit) {
1211 *target++=MINUS;
1212 if(offsets!=NULL) {
1213 *offsets++=sourceIndex;
1214 *offsets++=sourceIndex++;
1215 }
1216 /* realign length and targetCapacity */
1217 goto directMode;
1218 } else {
1219 if(offsets!=NULL) {
1220 *offsets++=sourceIndex++;
1221 }
1222 cnv->charErrorBuffer[0]=MINUS;
1223 cnv->charErrorBufferLength=1;
1224 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1225 break;
1226 }
1227 } else {
1228 /* un-read this character and switch to Unicode Mode */
1229 --source;
1230 *target++=AMPERSAND;
1231 if(offsets!=NULL) {
1232 *offsets++=sourceIndex;
1233 }
1234 inDirectMode=FALSE;
1235 base64Counter=0;
1236 goto unicodeMode;
1237 }
1238 --length;
1239 }
1240 if(source<sourceLimit && target>=targetLimit) {
1241 /* target is full */
1242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1243 }
1244 } else {
1245 unicodeMode:
1246 while(source<sourceLimit) {
1247 if(target<targetLimit) {
1248 c=*source++;
1249 if(isLegalIMAP(c)) {
1250 /* encode directly */
1251 inDirectMode=TRUE;
1252
1253 /* trick: back out this character to make this easier */
1254 --source;
1255
1256 /* terminate the base64 sequence */
1257 if(base64Counter!=0) {
1258 /* write remaining bits for the previous character */
1259 *target++=TO_BASE64_IMAP(bits);
1260 if(offsets!=NULL) {
1261 *offsets++=sourceIndex-1;
1262 }
1263 }
1264 /* need to terminate with a minus */
1265 if(target<targetLimit) {
1266 *target++=MINUS;
1267 if(offsets!=NULL) {
1268 *offsets++=sourceIndex-1;
1269 }
1270 } else {
1271 cnv->charErrorBuffer[0]=MINUS;
1272 cnv->charErrorBufferLength=1;
1273 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1274 break;
1275 }
1276 goto directMode;
1277 } else {
1278 /*
1279 * base64 this character:
1280 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1281 * and the bits of this character, each implicitly in UTF-16BE.
1282 *
1283 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1284 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1285 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1286 */
1287 switch(base64Counter) {
1288 case 0:
1289 b=(uint8_t)(c>>10);
1290 *target++=TO_BASE64_IMAP(b);
1291 if(target<targetLimit) {
1292 b=(uint8_t)((c>>4)&0x3f);
1293 *target++=TO_BASE64_IMAP(b);
1294 if(offsets!=NULL) {
1295 *offsets++=sourceIndex;
1296 *offsets++=sourceIndex++;
1297 }
1298 } else {
1299 if(offsets!=NULL) {
1300 *offsets++=sourceIndex++;
1301 }
1302 b=(uint8_t)((c>>4)&0x3f);
1303 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1304 cnv->charErrorBufferLength=1;
1305 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1306 }
1307 bits=(uint8_t)((c&15)<<2);
1308 base64Counter=1;
1309 break;
1310 case 1:
1311 b=(uint8_t)(bits|(c>>14));
1312 *target++=TO_BASE64_IMAP(b);
1313 if(target<targetLimit) {
1314 b=(uint8_t)((c>>8)&0x3f);
1315 *target++=TO_BASE64_IMAP(b);
1316 if(target<targetLimit) {
1317 b=(uint8_t)((c>>2)&0x3f);
1318 *target++=TO_BASE64_IMAP(b);
1319 if(offsets!=NULL) {
1320 *offsets++=sourceIndex;
1321 *offsets++=sourceIndex;
1322 *offsets++=sourceIndex++;
1323 }
1324 } else {
1325 if(offsets!=NULL) {
1326 *offsets++=sourceIndex;
1327 *offsets++=sourceIndex++;
1328 }
1329 b=(uint8_t)((c>>2)&0x3f);
1330 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1331 cnv->charErrorBufferLength=1;
1332 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1333 }
1334 } else {
1335 if(offsets!=NULL) {
1336 *offsets++=sourceIndex++;
1337 }
1338 b=(uint8_t)((c>>8)&0x3f);
1339 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340 b=(uint8_t)((c>>2)&0x3f);
1341 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1342 cnv->charErrorBufferLength=2;
1343 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1344 }
1345 bits=(uint8_t)((c&3)<<4);
1346 base64Counter=2;
1347 break;
1348 case 2:
1349 b=(uint8_t)(bits|(c>>12));
1350 *target++=TO_BASE64_IMAP(b);
1351 if(target<targetLimit) {
1352 b=(uint8_t)((c>>6)&0x3f);
1353 *target++=TO_BASE64_IMAP(b);
1354 if(target<targetLimit) {
1355 b=(uint8_t)(c&0x3f);
1356 *target++=TO_BASE64_IMAP(b);
1357 if(offsets!=NULL) {
1358 *offsets++=sourceIndex;
1359 *offsets++=sourceIndex;
1360 *offsets++=sourceIndex++;
1361 }
1362 } else {
1363 if(offsets!=NULL) {
1364 *offsets++=sourceIndex;
1365 *offsets++=sourceIndex++;
1366 }
1367 b=(uint8_t)(c&0x3f);
1368 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1369 cnv->charErrorBufferLength=1;
1370 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1371 }
1372 } else {
1373 if(offsets!=NULL) {
1374 *offsets++=sourceIndex++;
1375 }
1376 b=(uint8_t)((c>>6)&0x3f);
1377 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378 b=(uint8_t)(c&0x3f);
1379 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1380 cnv->charErrorBufferLength=2;
1381 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1382 }
1383 bits=0;
1384 base64Counter=0;
1385 break;
1386 default:
1387 /* will never occur */
1388 break;
1389 }
1390 }
1391 } else {
1392 /* target is full */
1393 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1394 break;
1395 }
1396 }
1397 }
1398
1399 if(pArgs->flush && source>=sourceLimit) {
1400 /* flush remaining bits to the target */
1401 if(!inDirectMode) {
1402 if(base64Counter!=0) {
1403 if(target<targetLimit) {
1404 *target++=TO_BASE64_IMAP(bits);
1405 if(offsets!=NULL) {
1406 *offsets++=sourceIndex-1;
1407 }
1408 } else {
1409 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1410 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1411 }
1412 }
1413 /* need to terminate with a minus */
1414 if(target<targetLimit) {
1415 *target++=MINUS;
1416 if(offsets!=NULL) {
1417 *offsets++=sourceIndex-1;
1418 }
1419 } else {
1420 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1421 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1422 }
1423 }
1424 /* reset the state for the next conversion */
1425 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1426 } else {
1427 /* set the converter state back into UConverter */
1428 cnv->fromUnicodeStatus=
1429 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1430 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1431 }
1432
1433 /* write back the updated pointers */
1434 pArgs->source=source;
1435 pArgs->target=(char *)target;
1436 pArgs->offsets=offsets;
1437 return;
1438 }
1439
1440 static const UConverterImpl _IMAPImpl={
1441 UCNV_IMAP_MAILBOX,
1442
1443 NULL,
1444 NULL,
1445
1446 _UTF7Open,
1447 NULL,
1448 _UTF7Reset,
1449
1450 _IMAPToUnicodeWithOffsets,
1451 _IMAPToUnicodeWithOffsets,
1452 _IMAPFromUnicodeWithOffsets,
1453 _IMAPFromUnicodeWithOffsets,
1454 NULL,
1455
1456 NULL,
1457 NULL,
1458 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1459 NULL,
1460 ucnv_getCompleteUnicodeSet
1461 };
1462
1463 static const UConverterStaticData _IMAPStaticData={
1464 sizeof(UConverterStaticData),
1465 "IMAP-mailbox-name",
1466 0, /* TODO CCSID for IMAP-mailbox-name */
1467 UCNV_IBM, UCNV_IMAP_MAILBOX,
1468 1, 4,
1469 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1470 FALSE, FALSE,
1471 0,
1472 0,
1473 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1474 };
1475
1476 const UConverterSharedData _IMAPData={
1477 sizeof(UConverterSharedData), ~((uint32_t)0),
1478 NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1479 0
1480 };
1481
1482 #endif
1483