• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  ucnv_u7.c
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2002jul01
12 *   created by: Markus W. Scherer
13 *
14 *   UTF-7 converter implementation. Used to be in ucnv_utf.c.
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_CONVERSION
20 
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 
25 /* UTF-7 -------------------------------------------------------------------- */
26 
27 /*
28  * UTF-7 is a stateful encoding of Unicode.
29  * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
30  * It was intended for use in Internet email systems, using in its bytewise
31  * encoding only a subset of 7-bit US-ASCII.
32  * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
33  * occasionally used.
34  *
35  * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
36  * to be encoded either directly or in base64. In particular, the characters
37  * in set O as defined in the RFC (see below) may be encoded directly but are
38  * not allowed in, e.g., email headers.
39  * By default, the ICU UTF-7 converter encodes set O directly.
40  * By choosing the option "version=1", set O will be escaped instead.
41  * For example:
42  *     utf7Converter=ucnv_open("UTF-7,version=1");
43  *
44  * For details about email headers see RFC 2047.
45  */
46 
47 /*
48  * Tests for US-ASCII characters belonging to character classes
49  * defined in UTF-7.
50  *
51  * Set D (directly encoded characters) consists of the following
52  * characters: the upper and lower case letters A through Z
53  * and a through z, the 10 digits 0-9, and the following nine special
54  * characters (note that "+" and "=" are omitted):
55  *     '(),-./:?
56  *
57  * Set O (optional direct characters) consists of the following
58  * characters (note that "\" and "~" are omitted):
59  *     !"#$%&*;<=>@[]^_`{|}
60  *
61  * According to the rules in RFC 2152, the byte values for the following
62  * US-ASCII characters are not used in UTF-7 and are therefore illegal:
63  * - all C0 control codes except for CR LF TAB
64  * - BACKSLASH
65  * - TILDE
66  * - DEL
67  * - all codes beyond US-ASCII, i.e. all >127
68  */
/*
 * Helper: true if (c)-(first), truncated to 8 bits, is less than (count).
 * For byte-sized c this is a single-comparison test for first<=c<first+count.
 * Numeric US-ASCII code points are used on purpose (the file is charset-neutral).
 */
#define utf7InByteRange(c, first, count) ((uint8_t)((c)-(first))<(count))

/* Set D: letters, digits, and the nine special characters '(),-./:? */
#define inSetD(c) \
    (utf7InByteRange(c, 97, 26) || utf7InByteRange(c, 65, 26) || /* a-z, A-Z */ \
     utf7InByteRange(c, 48, 10) ||                               /* 0-9 */ \
     utf7InByteRange(c, 39, 3) ||                                /* '() */ \
     utf7InByteRange(c, 44, 4) ||                                /* ,-./ */ \
     (c)==58 || (c)==63                                          /* :? */ \
    )

/* Set O: the optional direct characters !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    (utf7InByteRange(c, 33, 6) ||       /* !"#$%& */ \
     utf7InByteRange(c, 59, 4) ||       /* ;<=> */ \
     utf7InByteRange(c, 93, 4) ||       /* ]^_` */ \
     utf7InByteRange(c, 123, 3) ||      /* {|} */ \
     (c)==42 || (c)==64 || (c)==91      /* *@[ */ \
    )

/* CR (13), LF (10), TAB (9) -- the only C0 controls that are legal in UTF-7 */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
/* same as isCRLFTAB() plus SPACE (32) */
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code points with a special role in UTF-7 */
#define PLUS  43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/*
 * Legal byte values: the US-ASCII characters from space (32) up to but not
 * including tilde (126), minus the backslash, plus CR LF TAB.
 */
#define isLegalUTF7(c) ((utf7InByteRange(c, 32, 94) && (c)!=BACKSLASH) || isCRLFTAB(c))
95 
/*
 * Lookup table indexed by a US-ASCII code (0..127): nonzero if the character
 * is written as itself by the fromUnicode function, zero if it has to be
 * escaped. This is the table selected when the version field of
 * fromUnicodeStatus is 0.
 */
96 /* encode directly sets D and O and CR LF SP TAB */
97 static const UBool encodeDirectlyMaximum[128]={
98  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
99     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x00-0x0f: only TAB, LF, CR */
100     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1f: no C0 control is direct */
101 
102     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, /* 0x20-0x2f: all except '+' (0x2b) */
103     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3f */
104 
105     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4f */
106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* 0x50-0x5f: all except backslash (0x5c) */
107 
108     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6f */
109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0  /* 0x70-0x7f: all except '~' (0x7e) and DEL (0x7f) */
110 };
111 
/*
 * Lookup table indexed by a US-ASCII code (0..127): nonzero if the character
 * is written as itself by the fromUnicode function, zero if it has to be
 * escaped. This is the table selected when the version field of
 * fromUnicodeStatus is not 0 (the "version=1" option), so that set O is escaped.
 */
112 /* encode directly set D and CR LF SP TAB but not set O */
113 static const UBool encodeDirectlyRestricted[128]={
114  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
115     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x00-0x0f: only TAB, LF, CR */
116     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1f: no C0 control is direct */
117 
118     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, /* 0x20-0x2f: SP ' ( ) , - . / */
119     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x30-0x3f: digits : ? */
120 
121     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4f: A-O (not '@') */
122     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x50-0x5f: P-Z only */
123 
124     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6f: a-o (not '`') */
125     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0  /* 0x70-0x7f: p-z only */
126 };
127 
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit.
 * The digits are given as numeric code points rather than character literals.
 */
128 static const uint8_t
129 toBase64[64]={
130     /* A-Z */
131     65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
132     78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
133     /* a-z */
134     97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
135     110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
136     /* 0-9 */
137     48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
138     /* +/ */
139     43, 47
140 };
141 
/*
 * Reverse lookup indexed by a US-ASCII byte value (0..127):
 *   0..63  the value of a base64 digit
 *   -1     a legal UTF-7 byte that is not a base64 digit (ends a base64 run)
 *   -2     the minus sign, which terminates a base64 run and is absorbed
 *   -3     a byte that is illegal in UTF-7 (most C0 controls, backslash,
 *          tilde, DEL)
 * The table has only 128 entries, so callers must range-check the byte
 * before indexing.
 */
142 static const int8_t
143 fromBase64[128]={
144     /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
145     -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
146     -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
147 
148     /* general punctuation with + and / and a special value (-2) for - */
149     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
150     /* digits */
151     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
152 
153     /* A-Z; backslash (0x5c) is illegal (-3) */
154     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
155     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,
156 
157     /* a-z; tilde (0x7e) and DEL (0x7f) are illegal (-3) */
158     -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
159     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
160 };
161 
162 /*
163  * converter status values:
164  *
165  * toUnicodeStatus:
166  *     24 inDirectMode (boolean)
167  * 23..16 base64Counter (-1..7)
168  * 15..0  bits (up to 14 bits incoming base64)
169  *
170  * fromUnicodeStatus:
171  * 31..28 version (0: set O direct  1: set O escaped)
172  *     24 inDirectMode (boolean)
173  * 23..16 base64Counter (0..2)
174  *  7..0  bits (6 bits outgoing base64)
175  *
176  */
177 
178 static void
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)179 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
180     if(choice<=UCNV_RESET_TO_UNICODE) {
181         /* reset toUnicode */
182         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
183         cnv->toULength=0;
184     }
185     if(choice!=UCNV_RESET_TO_UNICODE) {
186         /* reset fromUnicode */
187         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
188     }
189 }
190 
191 static void
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)192 _UTF7Open(UConverter *cnv,
193           UConverterLoadArgs *pArgs,
194           UErrorCode *pErrorCode) {
195     if(UCNV_GET_VERSION(cnv)<=1) {
196         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
197         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
198         _UTF7Reset(cnv, UCNV_RESET_BOTH);
199     } else {
200         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
201     }
202 }
203 
/*
 * Converts UTF-7 bytes from pArgs->source into UTF-16 code units written to
 * pArgs->target. If pArgs->offsets is not NULL, one source index is recorded
 * per output code unit.
 *
 * The state (inDirectMode, base64Counter, bits) is unpacked from
 * cnv->toUnicodeStatus on entry and packed back on exit. Bytes of a pending
 * or erroneous sequence are kept in cnv->toUBytes, with their count in
 * cnv->toULength.
 *
 * The function has two loops that jump into each other via goto:
 * - directMode: legal US-ASCII bytes are copied 1:1; a '+' switches to
 *   unicodeMode.
 * - unicodeMode: base64 digits are accumulated into 16-bit code units;
 *   any non-base64 byte ends the run and switches back to directMode.
 *
 * Errors reported through *pErrorCode:
 * - U_ILLEGAL_CHAR_FOUND for an illegal byte, for a '+' that is followed by
 *   neither a base64 digit nor '-', and for a base64 run that ends with
 *   leftover bits.
 * - U_BUFFER_OVERFLOW_ERROR when the target is full while source bytes remain.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced
 * past what was consumed/produced.
 */
204 static void
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)205 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
206                           UErrorCode *pErrorCode) {
207     UConverter *cnv;
208     const uint8_t *source, *sourceLimit;
209     UChar *target;
210     const UChar *targetLimit;
211     int32_t *offsets;
212 
213     uint8_t *bytes;
214     uint8_t byteIndex;
215 
216     int32_t length, targetCapacity;
217 
218     /* UTF-7 state */
219     uint16_t bits;
220     int8_t base64Counter;
221     UBool inDirectMode;
222 
223     int8_t base64Value;
224 
225     int32_t sourceIndex, nextSourceIndex;
226 
227     uint8_t b;
228     /* set up the local pointers */
229     cnv=pArgs->converter;
230 
231     source=(const uint8_t *)pArgs->source;
232     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
233     target=pArgs->target;
234     targetLimit=pArgs->targetLimit;
235     offsets=pArgs->offsets;
236     /* get the state machine state */
237     {
238         uint32_t status=cnv->toUnicodeStatus;
239         inDirectMode=(UBool)((status>>24)&1);
240         base64Counter=(int8_t)(status>>16);
241         bits=(uint16_t)status;
242     }
243     bytes=cnv->toUBytes;
244     byteIndex=cnv->toULength;
245 
246     /* sourceIndex=-1 if the current character began in the previous buffer */
247     sourceIndex=byteIndex==0 ? 0 : -1;
248     nextSourceIndex=0;
249 
250     if(inDirectMode) {
251 directMode:
252         /*
253          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
254          * with their US-ASCII byte values.
255          * Backslash and Tilde and most control characters are not allowed in UTF-7.
256          * A plus sign starts Unicode (or "escape") Mode.
257          *
258          * In Direct Mode, only the sourceIndex is used.
259          */
260         byteIndex=0;
            /*
             * Each direct-mode byte produces at most one UChar, so the loop can
             * run on a single counter: the smaller of the remaining input and
             * the remaining output capacity.
             */
261         length=(int32_t)(sourceLimit-source);
262         targetCapacity=(int32_t)(targetLimit-target);
263         if(length>targetCapacity) {
264             length=targetCapacity;
265         }
266         while(length>0) {
267             b=*source++;
268             if(!isLegalUTF7(b)) {
269                 /* illegal */
                    /* store the offending byte so that it can be reported with the error */
270                 bytes[0]=b;
271                 byteIndex=1;
272                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
273                 break;
274             } else if(b!=PLUS) {
275                 /* write directly encoded character */
276                 *target++=b;
277                 if(offsets!=NULL) {
278                     *offsets++=sourceIndex++;
279                 }
280             } else /* PLUS */ {
281                 /* switch to Unicode mode */
282                 nextSourceIndex=++sourceIndex;
283                 inDirectMode=FALSE;
284                 byteIndex=0;
285                 bits=0;
                    /* -1 marks "immediately after the +", before any base64 digit */
286                 base64Counter=-1;
287                 goto unicodeMode;
288             }
289             --length;
290         }
291         if(source<sourceLimit && target>=targetLimit) {
292             /* target is full */
293             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
294         }
295     } else {
296 unicodeMode:
297         /*
298          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
299          * The base64 sequence ends with any character that is not in the base64 alphabet.
300          * A terminating minus sign is consumed.
301          *
302          * In Unicode Mode, the sourceIndex has the index to the start of the current
303          * base64 bytes, while nextSourceIndex is precisely parallel to source,
304          * keeping the index to the following byte.
305          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
306          */
307         while(source<sourceLimit) {
308             if(target<targetLimit) {
309                 bytes[byteIndex++]=b=*source++;
310                 ++nextSourceIndex;
311                 base64Value = -3; /* initialize as illegal */
                    /*
                     * The b>=126 test comes first so that the 128-entry fromBase64[]
                     * table is only indexed with b<126; for larger b, base64Value
                     * keeps its initial -3 (illegal).
                     */
312                 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
313                     /* either
314                      * base64Value==-1 for any legal character except base64 and minus sign, or
315                      * base64Value==-3 for illegal characters:
316                      * 1. In either case, leave Unicode mode.
317                      * 2.1. If we ended with an incomplete UChar or none after the +, then
318                      *      generate an error for the preceding erroneous sequence and deal with
319                      *      the current (possibly illegal) character next time through.
320                      * 2.2. Else the current char comes after a complete UChar, which was already
321                      *      pushed to the output buf, so:
322                      * 2.2.1. If the current char is legal, just save it for processing next time.
323                      *        It may be for example, a plus which we need to deal with in direct mode.
324                      * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
325                      */
326                     inDirectMode=TRUE;
327                     if(base64Counter==-1) {
328                         /* illegal: + immediately followed by something other than base64 or minus sign */
329                         /* include the plus sign in the reported sequence, but not the subsequent char */
330                         --source;
331                         bytes[0]=PLUS;
332                         byteIndex=1;
333                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
334                         break;
335                     } else if(bits!=0) {
336                         /* bits are illegally left over, a UChar is incomplete */
337                         /* don't include current char (legal or illegal) in error seq */
338                         --source;
339                         --byteIndex;
340                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
341                         break;
342                     } else {
343                         /* previous UChar was complete */
344                         if(base64Value==-3) {
345                             /* current character is illegal, deal with it here */
346                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347                             break;
348                         } else {
349                             /* un-read the current character in case it is a plus sign */
350                             --source;
351                             sourceIndex=nextSourceIndex-1;
352                             goto directMode;
353                         }
354                     }
355                 } else if(base64Value>=0) {
356                     /* collect base64 bytes into UChars */
                        /*
                         * base64Counter counts the digits of the current 8-digit group
                         * (8 digits * 6 bits = 3 UChars * 16 bits). A UChar is completed
                         * at the 3rd, 6th and 8th digit (cases 2, 5 and 7).
                         */
357                     switch(base64Counter) {
358                     case -1: /* -1 is immediately after the + */
359                     case 0:
360                         bits=base64Value;
361                         base64Counter=1;
362                         break;
363                     case 1:
364                     case 3:
365                     case 4:
366                     case 6:
367                         bits=(uint16_t)((bits<<6)|base64Value);
368                         ++base64Counter;
369                         break;
370                     case 2:
                            /* 12 collected bits + top 4 bits of this digit; the low 2 bits carry over */
371                         *target++=(UChar)((bits<<4)|(base64Value>>2));
372                         if(offsets!=NULL) {
373                             *offsets++=sourceIndex;
374                             sourceIndex=nextSourceIndex-1;
375                         }
376                         bytes[0]=b; /* keep this byte in case an error occurs */
377                         byteIndex=1;
378                         bits=(uint16_t)(base64Value&3);
379                         base64Counter=3;
380                         break;
381                     case 5:
                            /* 14 collected bits + top 2 bits of this digit; the low 4 bits carry over */
382                         *target++=(UChar)((bits<<2)|(base64Value>>4));
383                         if(offsets!=NULL) {
384                             *offsets++=sourceIndex;
385                             sourceIndex=nextSourceIndex-1;
386                         }
387                         bytes[0]=b; /* keep this byte in case an error occurs */
388                         byteIndex=1;
389                         bits=(uint16_t)(base64Value&15);
390                         base64Counter=6;
391                         break;
392                     case 7:
                            /* 10 collected bits + all 6 bits of this digit; nothing carries over */
393                         *target++=(UChar)((bits<<6)|base64Value);
394                         if(offsets!=NULL) {
395                             *offsets++=sourceIndex;
396                             sourceIndex=nextSourceIndex;
397                         }
398                         byteIndex=0;
399                         bits=0;
400                         base64Counter=0;
401                         break;
402                     default:
403                         /* will never occur */
404                         break;
405                     }
406                 } else /*base64Value==-2*/ {
407                     /* minus sign terminates the base64 sequence */
408                     inDirectMode=TRUE;
409                     if(base64Counter==-1) {
410                         /* +- i.e. a minus immediately following a plus */
411                         *target++=PLUS;
412                         if(offsets!=NULL) {
413                             *offsets++=sourceIndex-1;
414                         }
415                     } else {
416                         /* absorb the minus and leave the Unicode Mode */
417                         if(bits!=0) {
418                             /* bits are illegally left over, a UChar is incomplete */
419                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
420                             break;
421                         }
422                     }
423                     sourceIndex=nextSourceIndex;
424                     goto directMode;
425                 }
426             } else {
427                 /* target is full */
428                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
429                 break;
430             }
431         }
432     }
433 
434     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
435         /*
436          * if we are in Unicode mode, then the byteIndex might not be 0,
437          * but that is ok if bits==0
438          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
439          * (not true for IMAP-mailbox-name where we must end in direct mode)
440          */
441         byteIndex=0;
442     }
443 
444     /* set the converter state back into UConverter */
        /* same layout as unpacked above: bit 24 inDirectMode, bits 23..16 base64Counter, bits 15..0 bits */
445     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
446     cnv->toULength=byteIndex;
447 
448     /* write back the updated pointers */
449     pArgs->source=(const char *)source;
450     pArgs->target=target;
451     pArgs->offsets=offsets;
452     return;
453 }
454 
/*
 * Converts UTF-16 code units from pArgs->source into UTF-7 bytes written to
 * pArgs->target. If pArgs->offsets is not NULL, one source index is recorded
 * per output byte written to the target.
 *
 * The state (version, inDirectMode, base64Counter, bits) is unpacked from
 * cnv->fromUnicodeStatus on entry. The version selects which encodeDirectly
 * table is used: version 0 uses encodeDirectlyMaximum, any other version uses
 * encodeDirectlyRestricted.
 *
 * The function has two loops that jump into each other via goto:
 * - directMode: code units <=127 that are flagged in the encodeDirectly table
 *   are written as single bytes; '+' is written as "+-"; anything else
 *   writes a '+' and switches to unicodeMode.
 * - unicodeMode: each code unit is base64-encoded, carrying 2 or 4 leftover
 *   bits in "bits" between code units. A directly encodable code unit ends
 *   the base64 run and switches back to directMode.
 *
 * When a multi-byte output does not fit completely into the target, the
 * bytes that did not fit are stored in cnv->charErrorBuffer and *pErrorCode
 * is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * If pArgs->flush is set and all source was consumed, an open base64 run is
 * terminated and the state is reset to direct mode (keeping the version);
 * otherwise the current state is saved back into cnv->fromUnicodeStatus.
 *
 * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced
 * past what was consumed/produced.
 */
455 static void
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)456 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
457                             UErrorCode *pErrorCode) {
458     UConverter *cnv;
459     const UChar *source, *sourceLimit;
460     uint8_t *target, *targetLimit;
461     int32_t *offsets;
462 
463     int32_t length, targetCapacity, sourceIndex;
464     UChar c;
465 
466     /* UTF-7 state */
467     const UBool *encodeDirectly;
468     uint8_t bits;
469     int8_t base64Counter;
470     UBool inDirectMode;
471 
472     /* set up the local pointers */
473     cnv=pArgs->converter;
474 
475     /* set up the local pointers */
476     source=pArgs->source;
477     sourceLimit=pArgs->sourceLimit;
478     target=(uint8_t *)pArgs->target;
479     targetLimit=(uint8_t *)pArgs->targetLimit;
480     offsets=pArgs->offsets;
481 
482     /* get the state machine state */
483     {
484         uint32_t status=cnv->fromUnicodeStatus;
            /* status<0x10000000 means that the version field in bits 31..28 is 0 */
485         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
486         inDirectMode=(UBool)((status>>24)&1);
487         base64Counter=(int8_t)(status>>16);
488         bits=(uint8_t)status;
489     }
490 
491     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
492     sourceIndex=0;
493 
494     if(inDirectMode) {
495 directMode:
            /*
             * The loop runs on a single counter: the smaller of the remaining
             * input and the remaining output capacity. The '+' branch writes two
             * bytes for one code unit and therefore re-enters at directMode to
             * recompute the counter.
             */
496         length=(int32_t)(sourceLimit-source);
497         targetCapacity=(int32_t)(targetLimit-target);
498         if(length>targetCapacity) {
499             length=targetCapacity;
500         }
501         while(length>0) {
502             c=*source++;
503             /* currently always encode CR LF SP TAB directly */
504             if(c<=127 && encodeDirectly[c]) {
505                 /* encode directly */
506                 *target++=(uint8_t)c;
507                 if(offsets!=NULL) {
508                     *offsets++=sourceIndex++;
509                 }
510             } else if(c==PLUS) {
511                 /* output +- for + */
512                 *target++=PLUS;
513                 if(target<targetLimit) {
514                     *target++=MINUS;
515                     if(offsets!=NULL) {
516                         *offsets++=sourceIndex;
517                         *offsets++=sourceIndex++;
518                     }
519                     /* realign length and targetCapacity */
520                     goto directMode;
521                 } else {
522                     if(offsets!=NULL) {
523                         *offsets++=sourceIndex++;
524                     }
                        /* no room for the '-': hand it to the overflow buffer */
525                     cnv->charErrorBuffer[0]=MINUS;
526                     cnv->charErrorBufferLength=1;
527                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
528                     break;
529                 }
530             } else {
531                 /* un-read this character and switch to Unicode Mode */
532                 --source;
533                 *target++=PLUS;
534                 if(offsets!=NULL) {
535                     *offsets++=sourceIndex;
536                 }
537                 inDirectMode=FALSE;
538                 base64Counter=0;
539                 goto unicodeMode;
540             }
541             --length;
542         }
543         if(source<sourceLimit && target>=targetLimit) {
544             /* target is full */
545             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
546         }
547     } else {
548 unicodeMode:
549         while(source<sourceLimit) {
550             if(target<targetLimit) {
551                 c=*source++;
552                 if(c<=127 && encodeDirectly[c]) {
553                     /* encode directly */
554                     inDirectMode=TRUE;
555 
556                     /* trick: back out this character to make this easier */
557                     --source;
558 
559                     /* terminate the base64 sequence */
560                     if(base64Counter!=0) {
561                         /* write remaining bits for the previous character */
562                         *target++=toBase64[bits];
563                         if(offsets!=NULL) {
564                             *offsets++=sourceIndex-1;
565                         }
566                     }
                        /*
                         * fromBase64[c]==-1 means that c itself ends a base64 run when
                         * decoding, so no explicit terminator is needed; for any other
                         * value (base64 digit or minus sign) a '-' has to be written.
                         */
567                     if(fromBase64[c]!=-1) {
568                         /* need to terminate with a minus */
569                         if(target<targetLimit) {
570                             *target++=MINUS;
571                             if(offsets!=NULL) {
572                                 *offsets++=sourceIndex-1;
573                             }
574                         } else {
575                             cnv->charErrorBuffer[0]=MINUS;
576                             cnv->charErrorBufferLength=1;
577                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
578                             break;
579                         }
580                     }
581                     goto directMode;
582                 } else {
583                     /*
584                      * base64 this character:
585                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
586                      * and the bits of this character, each implicitly in UTF-16BE.
587                      *
588                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
589                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
590                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
591                      */
592                     switch(base64Counter) {
593                     case 0:
                            /* no carry: emit bits 15..10 and 9..4, keep bits 3..0 */
594                         *target++=toBase64[c>>10];
595                         if(target<targetLimit) {
596                             *target++=toBase64[(c>>4)&0x3f];
597                             if(offsets!=NULL) {
598                                 *offsets++=sourceIndex;
599                                 *offsets++=sourceIndex++;
600                             }
601                         } else {
602                             if(offsets!=NULL) {
603                                 *offsets++=sourceIndex++;
604                             }
605                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
606                             cnv->charErrorBufferLength=1;
607                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
608                         }
609                         bits=(uint8_t)((c&15)<<2);
610                         base64Counter=1;
611                         break;
612                     case 1:
                            /* 4 carried bits + bits 15..14, then bits 13..8 and 7..2, keep bits 1..0 */
613                         *target++=toBase64[bits|(c>>14)];
614                         if(target<targetLimit) {
615                             *target++=toBase64[(c>>8)&0x3f];
616                             if(target<targetLimit) {
617                                 *target++=toBase64[(c>>2)&0x3f];
618                                 if(offsets!=NULL) {
619                                     *offsets++=sourceIndex;
620                                     *offsets++=sourceIndex;
621                                     *offsets++=sourceIndex++;
622                                 }
623                             } else {
624                                 if(offsets!=NULL) {
625                                     *offsets++=sourceIndex;
626                                     *offsets++=sourceIndex++;
627                                 }
628                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
629                                 cnv->charErrorBufferLength=1;
630                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631                             }
632                         } else {
633                             if(offsets!=NULL) {
634                                 *offsets++=sourceIndex++;
635                             }
636                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
637                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
638                             cnv->charErrorBufferLength=2;
639                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
640                         }
641                         bits=(uint8_t)((c&3)<<4);
642                         base64Counter=2;
643                         break;
644                     case 2:
                            /* 2 carried bits + bits 15..12, then bits 11..6 and 5..0, nothing left to carry */
645                         *target++=toBase64[bits|(c>>12)];
646                         if(target<targetLimit) {
647                             *target++=toBase64[(c>>6)&0x3f];
648                             if(target<targetLimit) {
649                                 *target++=toBase64[c&0x3f];
650                                 if(offsets!=NULL) {
651                                     *offsets++=sourceIndex;
652                                     *offsets++=sourceIndex;
653                                     *offsets++=sourceIndex++;
654                                 }
655                             } else {
656                                 if(offsets!=NULL) {
657                                     *offsets++=sourceIndex;
658                                     *offsets++=sourceIndex++;
659                                 }
660                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
661                                 cnv->charErrorBufferLength=1;
662                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
663                             }
664                         } else {
665                             if(offsets!=NULL) {
666                                 *offsets++=sourceIndex++;
667                             }
668                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
669                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
670                             cnv->charErrorBufferLength=2;
671                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
672                         }
673                         bits=0;
674                         base64Counter=0;
675                         break;
676                     default:
677                         /* will never occur */
678                         break;
679                     }
680                 }
681             } else {
682                 /* target is full */
683                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
684                 break;
685             }
686         }
687     }
688 
689     if(pArgs->flush && source>=sourceLimit) {
690         /* flush remaining bits to the target */
691         if(!inDirectMode) {
692             if (base64Counter!=0) {
693                 if(target<targetLimit) {
694                     *target++=toBase64[bits];
695                     if(offsets!=NULL) {
696                         *offsets++=sourceIndex-1;
697                     }
698                 } else {
                        /* appended, because the loop above may already have stored overflow bytes */
699                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
700                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
701                 }
702             }
703             /* Add final MINUS to terminate unicodeMode */
704             if(target<targetLimit) {
705                 *target++=MINUS;
706                 if(offsets!=NULL) {
707                     *offsets++=sourceIndex-1;
708                 }
709             } else {
710                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
711                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
712             }
713         }
714         /* reset the state for the next conversion */
715         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
716     } else {
717         /* set the converter state back into UConverter */
718         cnv->fromUnicodeStatus=
719             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
720             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
721     }
722 
723     /* write back the updated pointers */
724     pArgs->source=source;
725     pArgs->target=(char *)target;
726     pArgs->offsets=offsets;
727     return;
728 }
729 
730 static const char *
_UTF7GetName(const UConverter * cnv)731 _UTF7GetName(const UConverter *cnv) {
732     switch(cnv->fromUnicodeStatus>>28) {
733     case 1:
734         return "UTF-7,version=1";
735     default:
736         return "UTF-7";
737     }
738 }
739 
/*
 * Function table for the UTF-7 converter. It wires in the open, reset,
 * conversion and get-name functions defined above; the NULL entries are
 * slots this converter does not fill in.
 * This is a positional initializer: the meaning of each slot is fixed by the
 * declaration of UConverterImpl, which is not part of this file
 * (NOTE(review): presumably in ucnv_cnv.h -- confirm slot order there before
 * editing).
 */
740 static const UConverterImpl _UTF7Impl={
741     UCNV_UTF7,
742 
743     NULL,
744     NULL,
745 
746     _UTF7Open,
747     NULL,
748     _UTF7Reset,
749 
        /* the same "WithOffsets" function is used for two adjacent slots in each direction */
750     _UTF7ToUnicodeWithOffsets,
751     _UTF7ToUnicodeWithOffsets,
752     _UTF7FromUnicodeWithOffsets,
753     _UTF7FromUnicodeWithOffsets,
754     NULL,
755 
756     NULL,
757     _UTF7GetName,
758     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
759     NULL,
760     ucnv_getCompleteUnicodeSet
761 };
762 
/*
 * Static description of the UTF-7 converter (name "UTF-7", type UCNV_UTF7).
 * This is a positional initializer: the meaning of each field is fixed by the
 * declaration of UConverterStaticData, which is not part of this file.
 * NOTE(review): "1, 4" look like the minimum/maximum number of bytes per
 * character -- confirm against the struct declaration.
 */
763 static const UConverterStaticData _UTF7StaticData={
764     sizeof(UConverterStaticData),
765     "UTF-7",
766     0, /* TODO CCSID for UTF-7 */
767     UCNV_IBM, UCNV_UTF7,
768     1, 4,
769     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
770     FALSE, FALSE,
771     0,
772     0,
773     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
774 };
775 
776 const UConverterSharedData _UTF7Data={
777     sizeof(UConverterSharedData), ~((uint32_t)0),
778     NULL, NULL, &_UTF7StaticData, FALSE, &_UTF7Impl,
779     0
780 };
781 
782 /* IMAP mailbox name encoding ----------------------------------------------- */
783 
784 /*
785  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
786  * http://www.ietf.org/rfc/rfc2060.txt
787  *
788  * 5.1.3.  Mailbox International Naming Convention
789  *
790  * By convention, international mailbox names are specified using a
791  * modified version of the UTF-7 encoding described in [UTF-7].  The
792  * purpose of these modifications is to correct the following problems
793  * with UTF-7:
794  *
795  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
796  *       the common use of "+" in mailbox names, in particular USENET
797  *       newsgroup names.
798  *
799  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
800  *       conflicts with the use of "/" as a popular hierarchy delimiter.
801  *
802  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
803  *       the use of "\" as a popular hierarchy delimiter.
804  *
805  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
806  *       the use of "~" in some servers as a home directory indicator.
807  *
808  *    5) UTF-7 permits multiple alternate forms to represent the same
 *       string; in particular, printable US-ASCII characters can be
810  *       represented in encoded form.
811  *
812  * In modified UTF-7, printable US-ASCII characters except for "&"
813  * represent themselves; that is, characters with octet values 0x20-0x25
814  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
815  * octet sequence "&-".
816  *
817  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
818  * Unicode 16-bit octets) are represented in modified BASE64, with a
819  * further modification from [UTF-7] that "," is used instead of "/".
820  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
821  * character which can represent itself.
822  *
823  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
824  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
825  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
826  * ").
827  *
828  * For example, here is a mailbox name which mixes English, Japanese,
829  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
830  */
831 
832 /*
833  * Tests for US-ASCII characters belonging to character classes
834  * defined in UTF-7.
835  *
836  * Set D (directly encoded characters) consists of the following
837  * characters: the upper and lower case letters A through Z
838  * and a through z, the 10 digits 0-9, and the following nine special
839  * characters (note that "+" and "=" are omitted):
840  *     '(),-./:?
841  *
842  * Set O (optional direct characters) consists of the following
843  * characters (note that "\" and "~" are omitted):
844  *     !"#$%&*;<=>@[]^_`{|}
845  *
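 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 *
 * Note: the sets above describe plain UTF-7. The IMAP mailbox name variant
 * implemented below uses different rules: as encoded bytes, every value
 * 0x20..0x7e is legal (including BACKSLASH and TILDE) and every other value
 * (including CR LF TAB) is illegal; all legal bytes except AMPERSAND are
 * encoded directly. See isLegalIMAP() and inSetDIMAP().
 */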
854 
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/*
 * Note: all function-like macros below evaluate their argument more than
 * once; pass only expressions without side effects.
 */

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * The argument is parenthesized so that compound expressions such as x|y
 * are compared against AMPERSAND as a whole.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/*
 * Map a 6-bit value to its modified-base64 byte: values below 63 go through
 * the toBase64[] table (defined outside this section), 63 becomes ','
 * instead of '/'.
 */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)

/*
 * Map a byte to its modified-base64 value: ',' yields 63, '/' yields -1
 * (not part of the modified alphabet), everything else is looked up in the
 * fromBase64[] table (defined outside this section). The caller in this
 * file only passes bytes already checked to be <=0x7e.
 */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
868 
869 /*
870  * converter status values:
871  *
872  * toUnicodeStatus:
873  *     24 inDirectMode (boolean)
874  * 23..16 base64Counter (-1..7)
875  * 15..0  bits (up to 14 bits incoming base64)
876  *
877  * fromUnicodeStatus:
878  *     24 inDirectMode (boolean)
879  * 23..16 base64Counter (0..2)
880  *  7..0  bits (6 bits outgoing base64)
881  *
882  * ignore bits 31..25
883  */
884 
885 static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)886 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
887                           UErrorCode *pErrorCode) {
888     UConverter *cnv;
889     const uint8_t *source, *sourceLimit;
890     UChar *target;
891     const UChar *targetLimit;
892     int32_t *offsets;
893 
894     uint8_t *bytes;
895     uint8_t byteIndex;
896 
897     int32_t length, targetCapacity;
898 
899     /* UTF-7 state */
900     uint16_t bits;
901     int8_t base64Counter;
902     UBool inDirectMode;
903 
904     int8_t base64Value;
905 
906     int32_t sourceIndex, nextSourceIndex;
907 
908     UChar c;
909     uint8_t b;
910 
911     /* set up the local pointers */
912     cnv=pArgs->converter;
913 
914     source=(const uint8_t *)pArgs->source;
915     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
916     target=pArgs->target;
917     targetLimit=pArgs->targetLimit;
918     offsets=pArgs->offsets;
919     /* get the state machine state */
920     {
921         uint32_t status=cnv->toUnicodeStatus;
922         inDirectMode=(UBool)((status>>24)&1);
923         base64Counter=(int8_t)(status>>16);
924         bits=(uint16_t)status;
925     }
926     bytes=cnv->toUBytes;
927     byteIndex=cnv->toULength;
928 
929     /* sourceIndex=-1 if the current character began in the previous buffer */
930     sourceIndex=byteIndex==0 ? 0 : -1;
931     nextSourceIndex=0;
932 
933     if(inDirectMode) {
934 directMode:
935         /*
936          * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
937          * with their US-ASCII byte values.
938          * An ampersand starts Unicode (or "escape") Mode.
939          *
940          * In Direct Mode, only the sourceIndex is used.
941          */
942         byteIndex=0;
943         length=(int32_t)(sourceLimit-source);
944         targetCapacity=(int32_t)(targetLimit-target);
945         if(length>targetCapacity) {
946             length=targetCapacity;
947         }
948         while(length>0) {
949             b=*source++;
950             if(!isLegalIMAP(b)) {
951                 /* illegal */
952                 bytes[0]=b;
953                 byteIndex=1;
954                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
955                 break;
956             } else if(b!=AMPERSAND) {
957                 /* write directly encoded character */
958                 *target++=b;
959                 if(offsets!=NULL) {
960                     *offsets++=sourceIndex++;
961                 }
962             } else /* AMPERSAND */ {
963                 /* switch to Unicode mode */
964                 nextSourceIndex=++sourceIndex;
965                 inDirectMode=FALSE;
966                 byteIndex=0;
967                 bits=0;
968                 base64Counter=-1;
969                 goto unicodeMode;
970             }
971             --length;
972         }
973         if(source<sourceLimit && target>=targetLimit) {
974             /* target is full */
975             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
976         }
977     } else {
978 unicodeMode:
979         /*
980          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
981          * The base64 sequence ends with any character that is not in the base64 alphabet.
982          * A terminating minus sign is consumed.
983          * US-ASCII must not be base64-ed.
984          *
985          * In Unicode Mode, the sourceIndex has the index to the start of the current
986          * base64 bytes, while nextSourceIndex is precisely parallel to source,
987          * keeping the index to the following byte.
988          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
989          */
990         while(source<sourceLimit) {
991             if(target<targetLimit) {
992                 bytes[byteIndex++]=b=*source++;
993                 ++nextSourceIndex;
994                 if(b>0x7e) {
995                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
996                     inDirectMode=TRUE;
997                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
998                     break;
999                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1000                     /* collect base64 bytes into UChars */
1001                     switch(base64Counter) {
1002                     case -1: /* -1 is immediately after the & */
1003                     case 0:
1004                         bits=base64Value;
1005                         base64Counter=1;
1006                         break;
1007                     case 1:
1008                     case 3:
1009                     case 4:
1010                     case 6:
1011                         bits=(uint16_t)((bits<<6)|base64Value);
1012                         ++base64Counter;
1013                         break;
1014                     case 2:
1015                         c=(UChar)((bits<<4)|(base64Value>>2));
1016                         if(isLegalIMAP(c)) {
1017                             /* illegal */
1018                             inDirectMode=TRUE;
1019                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1020                             goto endloop;
1021                         }
1022                         *target++=c;
1023                         if(offsets!=NULL) {
1024                             *offsets++=sourceIndex;
1025                             sourceIndex=nextSourceIndex-1;
1026                         }
1027                         bytes[0]=b; /* keep this byte in case an error occurs */
1028                         byteIndex=1;
1029                         bits=(uint16_t)(base64Value&3);
1030                         base64Counter=3;
1031                         break;
1032                     case 5:
1033                         c=(UChar)((bits<<2)|(base64Value>>4));
1034                         if(isLegalIMAP(c)) {
1035                             /* illegal */
1036                             inDirectMode=TRUE;
1037                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1038                             goto endloop;
1039                         }
1040                         *target++=c;
1041                         if(offsets!=NULL) {
1042                             *offsets++=sourceIndex;
1043                             sourceIndex=nextSourceIndex-1;
1044                         }
1045                         bytes[0]=b; /* keep this byte in case an error occurs */
1046                         byteIndex=1;
1047                         bits=(uint16_t)(base64Value&15);
1048                         base64Counter=6;
1049                         break;
1050                     case 7:
1051                         c=(UChar)((bits<<6)|base64Value);
1052                         if(isLegalIMAP(c)) {
1053                             /* illegal */
1054                             inDirectMode=TRUE;
1055                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1056                             goto endloop;
1057                         }
1058                         *target++=c;
1059                         if(offsets!=NULL) {
1060                             *offsets++=sourceIndex;
1061                             sourceIndex=nextSourceIndex;
1062                         }
1063                         byteIndex=0;
1064                         bits=0;
1065                         base64Counter=0;
1066                         break;
1067                     default:
1068                         /* will never occur */
1069                         break;
1070                     }
1071                 } else if(base64Value==-2) {
1072                     /* minus sign terminates the base64 sequence */
1073                     inDirectMode=TRUE;
1074                     if(base64Counter==-1) {
1075                         /* &- i.e. a minus immediately following an ampersand */
1076                         *target++=AMPERSAND;
1077                         if(offsets!=NULL) {
1078                             *offsets++=sourceIndex-1;
1079                         }
1080                     } else {
1081                         /* absorb the minus and leave the Unicode Mode */
1082                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1083                             /* bits are illegally left over, a UChar is incomplete */
1084                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1085                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1086                             break;
1087                         }
1088                     }
1089                     sourceIndex=nextSourceIndex;
1090                     goto directMode;
1091                 } else {
1092                     if(base64Counter==-1) {
1093                         /* illegal: & immediately followed by something other than base64 or minus sign */
1094                         /* include the ampersand in the reported sequence */
1095                         --sourceIndex;
1096                         bytes[0]=AMPERSAND;
1097                         bytes[1]=b;
1098                         byteIndex=2;
1099                     }
1100                     /* base64Value==-1 for characters that are illegal only in Unicode mode */
1101                     /* base64Value==-3 for illegal characters */
1102                     /* illegal */
1103                     inDirectMode=TRUE;
1104                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1105                     break;
1106                 }
1107             } else {
1108                 /* target is full */
1109                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1110                 break;
1111             }
1112         }
1113     }
1114 endloop:
1115 
1116     /*
1117      * the end of the input stream and detection of truncated input
1118      * are handled by the framework, but here we must check if we are in Unicode
1119      * mode and byteIndex==0 because we must end in direct mode
1120      *
1121      * conditions:
1122      *   successful
1123      *   in Unicode mode and byteIndex==0
1124      *   end of input and no truncated input
1125      */
1126     if( U_SUCCESS(*pErrorCode) &&
1127         !inDirectMode && byteIndex==0 &&
1128         pArgs->flush && source>=sourceLimit
1129     ) {
1130         if(base64Counter==-1) {
1131             /* & at the very end of the input */
1132             /* make the ampersand the reported sequence */
1133             bytes[0]=AMPERSAND;
1134             byteIndex=1;
1135         }
1136         /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1137 
1138         inDirectMode=TRUE; /* avoid looping */
1139         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1140     }
1141 
1142     /* set the converter state back into UConverter */
1143     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1144     cnv->toULength=byteIndex;
1145 
1146     /* write back the updated pointers */
1147     pArgs->source=(const char *)source;
1148     pArgs->target=target;
1149     pArgs->offsets=offsets;
1150     return;
1151 }
1152 
1153 static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1154 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1155                             UErrorCode *pErrorCode) {
1156     UConverter *cnv;
1157     const UChar *source, *sourceLimit;
1158     uint8_t *target, *targetLimit;
1159     int32_t *offsets;
1160 
1161     int32_t length, targetCapacity, sourceIndex;
1162     UChar c;
1163     uint8_t b;
1164 
1165     /* UTF-7 state */
1166     uint8_t bits;
1167     int8_t base64Counter;
1168     UBool inDirectMode;
1169 
1170     /* set up the local pointers */
1171     cnv=pArgs->converter;
1172 
1173     /* set up the local pointers */
1174     source=pArgs->source;
1175     sourceLimit=pArgs->sourceLimit;
1176     target=(uint8_t *)pArgs->target;
1177     targetLimit=(uint8_t *)pArgs->targetLimit;
1178     offsets=pArgs->offsets;
1179 
1180     /* get the state machine state */
1181     {
1182         uint32_t status=cnv->fromUnicodeStatus;
1183         inDirectMode=(UBool)((status>>24)&1);
1184         base64Counter=(int8_t)(status>>16);
1185         bits=(uint8_t)status;
1186     }
1187 
1188     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1189     sourceIndex=0;
1190 
1191     if(inDirectMode) {
1192 directMode:
1193         length=(int32_t)(sourceLimit-source);
1194         targetCapacity=(int32_t)(targetLimit-target);
1195         if(length>targetCapacity) {
1196             length=targetCapacity;
1197         }
1198         while(length>0) {
1199             c=*source++;
1200             /* encode 0x20..0x7e except '&' directly */
1201             if(inSetDIMAP(c)) {
1202                 /* encode directly */
1203                 *target++=(uint8_t)c;
1204                 if(offsets!=NULL) {
1205                     *offsets++=sourceIndex++;
1206                 }
1207             } else if(c==AMPERSAND) {
1208                 /* output &- for & */
1209                 *target++=AMPERSAND;
1210                 if(target<targetLimit) {
1211                     *target++=MINUS;
1212                     if(offsets!=NULL) {
1213                         *offsets++=sourceIndex;
1214                         *offsets++=sourceIndex++;
1215                     }
1216                     /* realign length and targetCapacity */
1217                     goto directMode;
1218                 } else {
1219                     if(offsets!=NULL) {
1220                         *offsets++=sourceIndex++;
1221                     }
1222                     cnv->charErrorBuffer[0]=MINUS;
1223                     cnv->charErrorBufferLength=1;
1224                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1225                     break;
1226                 }
1227             } else {
1228                 /* un-read this character and switch to Unicode Mode */
1229                 --source;
1230                 *target++=AMPERSAND;
1231                 if(offsets!=NULL) {
1232                     *offsets++=sourceIndex;
1233                 }
1234                 inDirectMode=FALSE;
1235                 base64Counter=0;
1236                 goto unicodeMode;
1237             }
1238             --length;
1239         }
1240         if(source<sourceLimit && target>=targetLimit) {
1241             /* target is full */
1242             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1243         }
1244     } else {
1245 unicodeMode:
1246         while(source<sourceLimit) {
1247             if(target<targetLimit) {
1248                 c=*source++;
1249                 if(isLegalIMAP(c)) {
1250                     /* encode directly */
1251                     inDirectMode=TRUE;
1252 
1253                     /* trick: back out this character to make this easier */
1254                     --source;
1255 
1256                     /* terminate the base64 sequence */
1257                     if(base64Counter!=0) {
1258                         /* write remaining bits for the previous character */
1259                         *target++=TO_BASE64_IMAP(bits);
1260                         if(offsets!=NULL) {
1261                             *offsets++=sourceIndex-1;
1262                         }
1263                     }
1264                     /* need to terminate with a minus */
1265                     if(target<targetLimit) {
1266                         *target++=MINUS;
1267                         if(offsets!=NULL) {
1268                             *offsets++=sourceIndex-1;
1269                         }
1270                     } else {
1271                         cnv->charErrorBuffer[0]=MINUS;
1272                         cnv->charErrorBufferLength=1;
1273                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1274                         break;
1275                     }
1276                     goto directMode;
1277                 } else {
1278                     /*
1279                      * base64 this character:
1280                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1281                      * and the bits of this character, each implicitly in UTF-16BE.
1282                      *
1283                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1284                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
1285                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1286                      */
1287                     switch(base64Counter) {
1288                     case 0:
1289                         b=(uint8_t)(c>>10);
1290                         *target++=TO_BASE64_IMAP(b);
1291                         if(target<targetLimit) {
1292                             b=(uint8_t)((c>>4)&0x3f);
1293                             *target++=TO_BASE64_IMAP(b);
1294                             if(offsets!=NULL) {
1295                                 *offsets++=sourceIndex;
1296                                 *offsets++=sourceIndex++;
1297                             }
1298                         } else {
1299                             if(offsets!=NULL) {
1300                                 *offsets++=sourceIndex++;
1301                             }
1302                             b=(uint8_t)((c>>4)&0x3f);
1303                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1304                             cnv->charErrorBufferLength=1;
1305                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1306                         }
1307                         bits=(uint8_t)((c&15)<<2);
1308                         base64Counter=1;
1309                         break;
1310                     case 1:
1311                         b=(uint8_t)(bits|(c>>14));
1312                         *target++=TO_BASE64_IMAP(b);
1313                         if(target<targetLimit) {
1314                             b=(uint8_t)((c>>8)&0x3f);
1315                             *target++=TO_BASE64_IMAP(b);
1316                             if(target<targetLimit) {
1317                                 b=(uint8_t)((c>>2)&0x3f);
1318                                 *target++=TO_BASE64_IMAP(b);
1319                                 if(offsets!=NULL) {
1320                                     *offsets++=sourceIndex;
1321                                     *offsets++=sourceIndex;
1322                                     *offsets++=sourceIndex++;
1323                                 }
1324                             } else {
1325                                 if(offsets!=NULL) {
1326                                     *offsets++=sourceIndex;
1327                                     *offsets++=sourceIndex++;
1328                                 }
1329                                 b=(uint8_t)((c>>2)&0x3f);
1330                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1331                                 cnv->charErrorBufferLength=1;
1332                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1333                             }
1334                         } else {
1335                             if(offsets!=NULL) {
1336                                 *offsets++=sourceIndex++;
1337                             }
1338                             b=(uint8_t)((c>>8)&0x3f);
1339                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340                             b=(uint8_t)((c>>2)&0x3f);
1341                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1342                             cnv->charErrorBufferLength=2;
1343                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1344                         }
1345                         bits=(uint8_t)((c&3)<<4);
1346                         base64Counter=2;
1347                         break;
1348                     case 2:
1349                         b=(uint8_t)(bits|(c>>12));
1350                         *target++=TO_BASE64_IMAP(b);
1351                         if(target<targetLimit) {
1352                             b=(uint8_t)((c>>6)&0x3f);
1353                             *target++=TO_BASE64_IMAP(b);
1354                             if(target<targetLimit) {
1355                                 b=(uint8_t)(c&0x3f);
1356                                 *target++=TO_BASE64_IMAP(b);
1357                                 if(offsets!=NULL) {
1358                                     *offsets++=sourceIndex;
1359                                     *offsets++=sourceIndex;
1360                                     *offsets++=sourceIndex++;
1361                                 }
1362                             } else {
1363                                 if(offsets!=NULL) {
1364                                     *offsets++=sourceIndex;
1365                                     *offsets++=sourceIndex++;
1366                                 }
1367                                 b=(uint8_t)(c&0x3f);
1368                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1369                                 cnv->charErrorBufferLength=1;
1370                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1371                             }
1372                         } else {
1373                             if(offsets!=NULL) {
1374                                 *offsets++=sourceIndex++;
1375                             }
1376                             b=(uint8_t)((c>>6)&0x3f);
1377                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378                             b=(uint8_t)(c&0x3f);
1379                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1380                             cnv->charErrorBufferLength=2;
1381                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1382                         }
1383                         bits=0;
1384                         base64Counter=0;
1385                         break;
1386                     default:
1387                         /* will never occur */
1388                         break;
1389                     }
1390                 }
1391             } else {
1392                 /* target is full */
1393                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1394                 break;
1395             }
1396         }
1397     }
1398 
1399     if(pArgs->flush && source>=sourceLimit) {
1400         /* flush remaining bits to the target */
1401         if(!inDirectMode) {
1402             if(base64Counter!=0) {
1403                 if(target<targetLimit) {
1404                     *target++=TO_BASE64_IMAP(bits);
1405                     if(offsets!=NULL) {
1406                         *offsets++=sourceIndex-1;
1407                     }
1408                 } else {
1409                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1410                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1411                 }
1412             }
1413             /* need to terminate with a minus */
1414             if(target<targetLimit) {
1415                 *target++=MINUS;
1416                 if(offsets!=NULL) {
1417                     *offsets++=sourceIndex-1;
1418                 }
1419             } else {
1420                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1421                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1422             }
1423         }
1424         /* reset the state for the next conversion */
1425         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1426     } else {
1427         /* set the converter state back into UConverter */
1428         cnv->fromUnicodeStatus=
1429             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1430             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1431     }
1432 
1433     /* write back the updated pointers */
1434     pArgs->source=source;
1435     pArgs->target=(char *)target;
1436     pArgs->offsets=offsets;
1437     return;
1438 }
1439 
1440 static const UConverterImpl _IMAPImpl={
1441     UCNV_IMAP_MAILBOX,
1442 
1443     NULL,
1444     NULL,
1445 
1446     _UTF7Open,
1447     NULL,
1448     _UTF7Reset,
1449 
1450     _IMAPToUnicodeWithOffsets,
1451     _IMAPToUnicodeWithOffsets,
1452     _IMAPFromUnicodeWithOffsets,
1453     _IMAPFromUnicodeWithOffsets,
1454     NULL,
1455 
1456     NULL,
1457     NULL,
1458     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1459     NULL,
1460     ucnv_getCompleteUnicodeSet
1461 };
1462 
1463 static const UConverterStaticData _IMAPStaticData={
1464     sizeof(UConverterStaticData),
1465     "IMAP-mailbox-name",
1466     0, /* TODO CCSID for IMAP-mailbox-name */
1467     UCNV_IBM, UCNV_IMAP_MAILBOX,
1468     1, 4,
1469     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1470     FALSE, FALSE,
1471     0,
1472     0,
1473     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1474 };
1475 
1476 const UConverterSharedData _IMAPData={
1477     sizeof(UConverterSharedData), ~((uint32_t)0),
1478     NULL, NULL, &_IMAPStaticData, FALSE, &_IMAPImpl,
1479     0
1480 };
1481 
1482 #endif
1483