/*
**********************************************************************
*   Copyright (C) 2002-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnv_u7.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jul01
*   created by: Markus W. Scherer
*
*   UTF-7 converter implementation. Used to be in ucnv_utf.c.
*/
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
20 
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "uassert.h"
25 
/* UTF-7 -------------------------------------------------------------------- */

/*
 * UTF-7 is a stateful encoding of Unicode.
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
 * It was intended for use in Internet email systems, using in its bytewise
 * encoding only a subset of 7-bit US-ASCII.
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
 * occasionally used.
 *
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII
 * characters to be encoded either directly or in base64. In particular,
 * the characters in set O as defined in the RFC (see below) may be encoded
 * directly but are not allowed in, e.g., email headers.
 * By default, the ICU UTF-7 converter encodes set O directly.
 * By choosing the option "version=1", set O will be escaped instead.
 * For example:
 *     utf7Converter=ucnv_open("UTF-7,version=1");
 *
 * For details about email headers see RFC 2047.
 */
47 
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
/*
 * Non-zero if c is in UTF-7 set D: ASCII letters, digits, and '(),-./:?
 * Each range test subtracts the range start and compares as uint8_t, so
 * values below the range wrap around to a large number and fail the "<".
 * Note: c is evaluated several times; do not pass an expression with
 * side effects.
 */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )
77 
/*
 * Non-zero if c is in UTF-7 set O: !"#$%&*;<=>@[]^_`{|}
 * Uses the same subtract-and-compare-as-uint8_t range test as set D.
 * Note: c is evaluated several times; do not pass an expression with
 * side effects.
 */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )
85 
/* whitespace tests: CR (13), LF (10), TAB (9), and optionally SP (32) */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code points with a special role in UTF-7 */
#define PLUS  43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/*
 * legal byte values: all US-ASCII graphic characters from space to before tilde
 * (32..125) except backslash, and CR LF TAB
 */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
96 
97 /* encode directly sets D and O and CR LF SP TAB */
98 static const UBool encodeDirectlyMaximum[128]={
99  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
100     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
101     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 
103     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
104     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
105 
106     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
108 
109     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
111 };
112 
113 /* encode directly set D and CR LF SP TAB but not set O */
114 static const UBool encodeDirectlyRestricted[128]={
115  /* 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f */
116     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
117     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 
119     1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
120     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
121 
122     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
124 
125     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
127 };
128 
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit:
 * A-Z, a-z, 0-9, '+', '/'.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
142 
/*
 * Classifies a US-ASCII byte (0..127) for the base64 decoder:
 *   0..63  the 6-bit value of a base64 digit
 *   -1     a legal UTF-7 byte that is not a base64 digit
 *   -2     the minus sign
 *   -3     a byte that is illegal in UTF-7
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
162 
/*
 * converter status values:
 *
 * toUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (-1..7)
 * 15..0  bits (up to 14 bits incoming base64)
 *
 * fromUnicodeStatus:
 * 31..28 version (0: set O direct  1: set O escaped)
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (0..2)
 *  7..0  bits (6 bits outgoing base64)
 *
 */
178 
179 static void
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)180 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
181     if(choice<=UCNV_RESET_TO_UNICODE) {
182         /* reset toUnicode */
183         cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
184         cnv->toULength=0;
185     }
186     if(choice!=UCNV_RESET_TO_UNICODE) {
187         /* reset fromUnicode */
188         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
189     }
190 }
191 
192 static void
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)193 _UTF7Open(UConverter *cnv,
194           UConverterLoadArgs *pArgs,
195           UErrorCode *pErrorCode) {
196     if(UCNV_GET_VERSION(cnv)<=1) {
197         /* TODO(markus): Should just use cnv->options rather than copying the version number. */
198         cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
199         _UTF7Reset(cnv, UCNV_RESET_BOTH);
200     } else {
201         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
202     }
203 }
204 
205 static void
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)206 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
207                           UErrorCode *pErrorCode) {
208     UConverter *cnv;
209     const uint8_t *source, *sourceLimit;
210     UChar *target;
211     const UChar *targetLimit;
212     int32_t *offsets;
213 
214     uint8_t *bytes;
215     uint8_t byteIndex;
216 
217     int32_t length, targetCapacity;
218 
219     /* UTF-7 state */
220     uint16_t bits;
221     int8_t base64Counter;
222     UBool inDirectMode;
223 
224     int8_t base64Value;
225 
226     int32_t sourceIndex, nextSourceIndex;
227 
228     uint8_t b;
229     /* set up the local pointers */
230     cnv=pArgs->converter;
231 
232     source=(const uint8_t *)pArgs->source;
233     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
234     target=pArgs->target;
235     targetLimit=pArgs->targetLimit;
236     offsets=pArgs->offsets;
237     /* get the state machine state */
238     {
239         uint32_t status=cnv->toUnicodeStatus;
240         inDirectMode=(UBool)((status>>24)&1);
241         base64Counter=(int8_t)(status>>16);
242         bits=(uint16_t)status;
243     }
244     bytes=cnv->toUBytes;
245     byteIndex=cnv->toULength;
246 
247     /* sourceIndex=-1 if the current character began in the previous buffer */
248     sourceIndex=byteIndex==0 ? 0 : -1;
249     nextSourceIndex=0;
250 
251     if(inDirectMode) {
252 directMode:
253         /*
254          * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
255          * with their US-ASCII byte values.
256          * Backslash and Tilde and most control characters are not allowed in UTF-7.
257          * A plus sign starts Unicode (or "escape") Mode.
258          *
259          * In Direct Mode, only the sourceIndex is used.
260          */
261         byteIndex=0;
262         length=(int32_t)(sourceLimit-source);
263         targetCapacity=(int32_t)(targetLimit-target);
264         if(length>targetCapacity) {
265             length=targetCapacity;
266         }
267         while(length>0) {
268             b=*source++;
269             if(!isLegalUTF7(b)) {
270                 /* illegal */
271                 bytes[0]=b;
272                 byteIndex=1;
273                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
274                 break;
275             } else if(b!=PLUS) {
276                 /* write directly encoded character */
277                 *target++=b;
278                 if(offsets!=NULL) {
279                     *offsets++=sourceIndex++;
280                 }
281             } else /* PLUS */ {
282                 /* switch to Unicode mode */
283                 nextSourceIndex=++sourceIndex;
284                 inDirectMode=FALSE;
285                 byteIndex=0;
286                 bits=0;
287                 base64Counter=-1;
288                 goto unicodeMode;
289             }
290             --length;
291         }
292         if(source<sourceLimit && target>=targetLimit) {
293             /* target is full */
294             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
295         }
296     } else {
297 unicodeMode:
298         /*
299          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
300          * The base64 sequence ends with any character that is not in the base64 alphabet.
301          * A terminating minus sign is consumed.
302          *
303          * In Unicode Mode, the sourceIndex has the index to the start of the current
304          * base64 bytes, while nextSourceIndex is precisely parallel to source,
305          * keeping the index to the following byte.
306          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
307          */
308         while(source<sourceLimit) {
309             if(target<targetLimit) {
310                 bytes[byteIndex++]=b=*source++;
311                 ++nextSourceIndex;
312                 base64Value = -3; /* initialize as illegal */
313                 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
314                     /* either
315                      * base64Value==-1 for any legal character except base64 and minus sign, or
316                      * base64Value==-3 for illegal characters:
317                      * 1. In either case, leave Unicode mode.
318                      * 2.1. If we ended with an incomplete UChar or none after the +, then
319                      *      generate an error for the preceding erroneous sequence and deal with
320                      *      the current (possibly illegal) character next time through.
321                      * 2.2. Else the current char comes after a complete UChar, which was already
322                      *      pushed to the output buf, so:
323                      * 2.2.1. If the current char is legal, just save it for processing next time.
324                      *        It may be for example, a plus which we need to deal with in direct mode.
325                      * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
326                      */
327                     inDirectMode=TRUE;
328                     if(base64Counter==-1) {
329                         /* illegal: + immediately followed by something other than base64 or minus sign */
330                         /* include the plus sign in the reported sequence, but not the subsequent char */
331                         --source;
332                         bytes[0]=PLUS;
333                         byteIndex=1;
334                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
335                         break;
336                     } else if(bits!=0) {
337                         /* bits are illegally left over, a UChar is incomplete */
338                         /* don't include current char (legal or illegal) in error seq */
339                         --source;
340                         --byteIndex;
341                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
342                         break;
343                     } else {
344                         /* previous UChar was complete */
345                         if(base64Value==-3) {
346                             /* current character is illegal, deal with it here */
347                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
348                             break;
349                         } else {
350                             /* un-read the current character in case it is a plus sign */
351                             --source;
352                             sourceIndex=nextSourceIndex-1;
353                             goto directMode;
354                         }
355                     }
356                 } else if(base64Value>=0) {
357                     /* collect base64 bytes into UChars */
358                     switch(base64Counter) {
359                     case -1: /* -1 is immediately after the + */
360                     case 0:
361                         bits=base64Value;
362                         base64Counter=1;
363                         break;
364                     case 1:
365                     case 3:
366                     case 4:
367                     case 6:
368                         bits=(uint16_t)((bits<<6)|base64Value);
369                         ++base64Counter;
370                         break;
371                     case 2:
372                         *target++=(UChar)((bits<<4)|(base64Value>>2));
373                         if(offsets!=NULL) {
374                             *offsets++=sourceIndex;
375                             sourceIndex=nextSourceIndex-1;
376                         }
377                         bytes[0]=b; /* keep this byte in case an error occurs */
378                         byteIndex=1;
379                         bits=(uint16_t)(base64Value&3);
380                         base64Counter=3;
381                         break;
382                     case 5:
383                         *target++=(UChar)((bits<<2)|(base64Value>>4));
384                         if(offsets!=NULL) {
385                             *offsets++=sourceIndex;
386                             sourceIndex=nextSourceIndex-1;
387                         }
388                         bytes[0]=b; /* keep this byte in case an error occurs */
389                         byteIndex=1;
390                         bits=(uint16_t)(base64Value&15);
391                         base64Counter=6;
392                         break;
393                     case 7:
394                         *target++=(UChar)((bits<<6)|base64Value);
395                         if(offsets!=NULL) {
396                             *offsets++=sourceIndex;
397                             sourceIndex=nextSourceIndex;
398                         }
399                         byteIndex=0;
400                         bits=0;
401                         base64Counter=0;
402                         break;
403                     default:
404                         /* will never occur */
405                         break;
406                     }
407                 } else /*base64Value==-2*/ {
408                     /* minus sign terminates the base64 sequence */
409                     inDirectMode=TRUE;
410                     if(base64Counter==-1) {
411                         /* +- i.e. a minus immediately following a plus */
412                         *target++=PLUS;
413                         if(offsets!=NULL) {
414                             *offsets++=sourceIndex-1;
415                         }
416                     } else {
417                         /* absorb the minus and leave the Unicode Mode */
418                         if(bits!=0) {
419                             /* bits are illegally left over, a UChar is incomplete */
420                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
421                             break;
422                         }
423                     }
424                     sourceIndex=nextSourceIndex;
425                     goto directMode;
426                 }
427             } else {
428                 /* target is full */
429                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
430                 break;
431             }
432         }
433     }
434 
435     if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
436         /*
437          * if we are in Unicode mode, then the byteIndex might not be 0,
438          * but that is ok if bits==0
439          * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
440          * (not true for IMAP-mailbox-name where we must end in direct mode)
441          */
442         byteIndex=0;
443     }
444 
445     /* set the converter state back into UConverter */
446     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
447     cnv->toULength=byteIndex;
448 
449     /* write back the updated pointers */
450     pArgs->source=(const char *)source;
451     pArgs->target=target;
452     pArgs->offsets=offsets;
453     return;
454 }
455 
456 static void
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)457 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
458                             UErrorCode *pErrorCode) {
459     UConverter *cnv;
460     const UChar *source, *sourceLimit;
461     uint8_t *target, *targetLimit;
462     int32_t *offsets;
463 
464     int32_t length, targetCapacity, sourceIndex;
465     UChar c;
466 
467     /* UTF-7 state */
468     const UBool *encodeDirectly;
469     uint8_t bits;
470     int8_t base64Counter;
471     UBool inDirectMode;
472 
473     /* set up the local pointers */
474     cnv=pArgs->converter;
475 
476     /* set up the local pointers */
477     source=pArgs->source;
478     sourceLimit=pArgs->sourceLimit;
479     target=(uint8_t *)pArgs->target;
480     targetLimit=(uint8_t *)pArgs->targetLimit;
481     offsets=pArgs->offsets;
482 
483     /* get the state machine state */
484     {
485         uint32_t status=cnv->fromUnicodeStatus;
486         encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
487         inDirectMode=(UBool)((status>>24)&1);
488         base64Counter=(int8_t)(status>>16);
489         bits=(uint8_t)status;
490         U_ASSERT(bits<=sizeof(toBase64)/sizeof(toBase64[0]));
491     }
492 
493     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
494     sourceIndex=0;
495 
496     if(inDirectMode) {
497 directMode:
498         length=(int32_t)(sourceLimit-source);
499         targetCapacity=(int32_t)(targetLimit-target);
500         if(length>targetCapacity) {
501             length=targetCapacity;
502         }
503         while(length>0) {
504             c=*source++;
505             /* currently always encode CR LF SP TAB directly */
506             if(c<=127 && encodeDirectly[c]) {
507                 /* encode directly */
508                 *target++=(uint8_t)c;
509                 if(offsets!=NULL) {
510                     *offsets++=sourceIndex++;
511                 }
512             } else if(c==PLUS) {
513                 /* output +- for + */
514                 *target++=PLUS;
515                 if(target<targetLimit) {
516                     *target++=MINUS;
517                     if(offsets!=NULL) {
518                         *offsets++=sourceIndex;
519                         *offsets++=sourceIndex++;
520                     }
521                     /* realign length and targetCapacity */
522                     goto directMode;
523                 } else {
524                     if(offsets!=NULL) {
525                         *offsets++=sourceIndex++;
526                     }
527                     cnv->charErrorBuffer[0]=MINUS;
528                     cnv->charErrorBufferLength=1;
529                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
530                     break;
531                 }
532             } else {
533                 /* un-read this character and switch to Unicode Mode */
534                 --source;
535                 *target++=PLUS;
536                 if(offsets!=NULL) {
537                     *offsets++=sourceIndex;
538                 }
539                 inDirectMode=FALSE;
540                 base64Counter=0;
541                 goto unicodeMode;
542             }
543             --length;
544         }
545         if(source<sourceLimit && target>=targetLimit) {
546             /* target is full */
547             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
548         }
549     } else {
550 unicodeMode:
551         while(source<sourceLimit) {
552             if(target<targetLimit) {
553                 c=*source++;
554                 if(c<=127 && encodeDirectly[c]) {
555                     /* encode directly */
556                     inDirectMode=TRUE;
557 
558                     /* trick: back out this character to make this easier */
559                     --source;
560 
561                     /* terminate the base64 sequence */
562                     if(base64Counter!=0) {
563                         /* write remaining bits for the previous character */
564                         *target++=toBase64[bits];
565                         if(offsets!=NULL) {
566                             *offsets++=sourceIndex-1;
567                         }
568                     }
569                     if(fromBase64[c]!=-1) {
570                         /* need to terminate with a minus */
571                         if(target<targetLimit) {
572                             *target++=MINUS;
573                             if(offsets!=NULL) {
574                                 *offsets++=sourceIndex-1;
575                             }
576                         } else {
577                             cnv->charErrorBuffer[0]=MINUS;
578                             cnv->charErrorBufferLength=1;
579                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
580                             break;
581                         }
582                     }
583                     goto directMode;
584                 } else {
585                     /*
586                      * base64 this character:
587                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
588                      * and the bits of this character, each implicitly in UTF-16BE.
589                      *
590                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
591                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
592                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
593                      */
594                     switch(base64Counter) {
595                     case 0:
596                         *target++=toBase64[c>>10];
597                         if(target<targetLimit) {
598                             *target++=toBase64[(c>>4)&0x3f];
599                             if(offsets!=NULL) {
600                                 *offsets++=sourceIndex;
601                                 *offsets++=sourceIndex++;
602                             }
603                         } else {
604                             if(offsets!=NULL) {
605                                 *offsets++=sourceIndex++;
606                             }
607                             cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
608                             cnv->charErrorBufferLength=1;
609                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
610                         }
611                         bits=(uint8_t)((c&15)<<2);
612                         base64Counter=1;
613                         break;
614                     case 1:
615                         *target++=toBase64[bits|(c>>14)];
616                         if(target<targetLimit) {
617                             *target++=toBase64[(c>>8)&0x3f];
618                             if(target<targetLimit) {
619                                 *target++=toBase64[(c>>2)&0x3f];
620                                 if(offsets!=NULL) {
621                                     *offsets++=sourceIndex;
622                                     *offsets++=sourceIndex;
623                                     *offsets++=sourceIndex++;
624                                 }
625                             } else {
626                                 if(offsets!=NULL) {
627                                     *offsets++=sourceIndex;
628                                     *offsets++=sourceIndex++;
629                                 }
630                                 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
631                                 cnv->charErrorBufferLength=1;
632                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
633                             }
634                         } else {
635                             if(offsets!=NULL) {
636                                 *offsets++=sourceIndex++;
637                             }
638                             cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
639                             cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
640                             cnv->charErrorBufferLength=2;
641                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
642                         }
643                         bits=(uint8_t)((c&3)<<4);
644                         base64Counter=2;
645                         break;
646                     case 2:
647                         *target++=toBase64[bits|(c>>12)];
648                         if(target<targetLimit) {
649                             *target++=toBase64[(c>>6)&0x3f];
650                             if(target<targetLimit) {
651                                 *target++=toBase64[c&0x3f];
652                                 if(offsets!=NULL) {
653                                     *offsets++=sourceIndex;
654                                     *offsets++=sourceIndex;
655                                     *offsets++=sourceIndex++;
656                                 }
657                             } else {
658                                 if(offsets!=NULL) {
659                                     *offsets++=sourceIndex;
660                                     *offsets++=sourceIndex++;
661                                 }
662                                 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
663                                 cnv->charErrorBufferLength=1;
664                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
665                             }
666                         } else {
667                             if(offsets!=NULL) {
668                                 *offsets++=sourceIndex++;
669                             }
670                             cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
671                             cnv->charErrorBuffer[1]=toBase64[c&0x3f];
672                             cnv->charErrorBufferLength=2;
673                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
674                         }
675                         bits=0;
676                         base64Counter=0;
677                         break;
678                     default:
679                         /* will never occur */
680                         break;
681                     }
682                 }
683             } else {
684                 /* target is full */
685                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
686                 break;
687             }
688         }
689     }
690 
691     if(pArgs->flush && source>=sourceLimit) {
692         /* flush remaining bits to the target */
693         if(!inDirectMode) {
694             if (base64Counter!=0) {
695                 if(target<targetLimit) {
696                     *target++=toBase64[bits];
697                     if(offsets!=NULL) {
698                         *offsets++=sourceIndex-1;
699                     }
700                 } else {
701                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
702                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
703                 }
704             }
705             /* Add final MINUS to terminate unicodeMode */
706             if(target<targetLimit) {
707                 *target++=MINUS;
708                 if(offsets!=NULL) {
709                     *offsets++=sourceIndex-1;
710                 }
711             } else {
712                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
713                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
714             }
715         }
716         /* reset the state for the next conversion */
717         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
718     } else {
719         /* set the converter state back into UConverter */
720         cnv->fromUnicodeStatus=
721             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
722             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
723     }
724 
725     /* write back the updated pointers */
726     pArgs->source=source;
727     pArgs->target=(char *)target;
728     pArgs->offsets=offsets;
729     return;
730 }
731 
732 static const char *
_UTF7GetName(const UConverter * cnv)733 _UTF7GetName(const UConverter *cnv) {
734     switch(cnv->fromUnicodeStatus>>28) {
735     case 1:
736         return "UTF-7,version=1";
737     default:
738         return "UTF-7";
739     }
740 }
741 
742 static const UConverterImpl _UTF7Impl={
743     UCNV_UTF7,
744 
745     NULL,
746     NULL,
747 
748     _UTF7Open,
749     NULL,
750     _UTF7Reset,
751 
752     _UTF7ToUnicodeWithOffsets,
753     _UTF7ToUnicodeWithOffsets,
754     _UTF7FromUnicodeWithOffsets,
755     _UTF7FromUnicodeWithOffsets,
756     NULL,
757 
758     NULL,
759     _UTF7GetName,
760     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
761     NULL,
762     ucnv_getCompleteUnicodeSet
763 };
764 
765 static const UConverterStaticData _UTF7StaticData={
766     sizeof(UConverterStaticData),
767     "UTF-7",
768     0, /* TODO CCSID for UTF-7 */
769     UCNV_IBM, UCNV_UTF7,
770     1, 4,
771     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
772     FALSE, FALSE,
773     0,
774     0,
775     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
776 };
777 
778 const UConverterSharedData _UTF7Data=
779         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
780 
781 /* IMAP mailbox name encoding ----------------------------------------------- */
782 
783 /*
784  * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
785  * http://www.ietf.org/rfc/rfc2060.txt
786  *
787  * 5.1.3.  Mailbox International Naming Convention
788  *
789  * By convention, international mailbox names are specified using a
790  * modified version of the UTF-7 encoding described in [UTF-7].  The
791  * purpose of these modifications is to correct the following problems
792  * with UTF-7:
793  *
794  *    1) UTF-7 uses the "+" character for shifting; this conflicts with
795  *       the common use of "+" in mailbox names, in particular USENET
796  *       newsgroup names.
797  *
798  *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
799  *       conflicts with the use of "/" as a popular hierarchy delimiter.
800  *
801  *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
802  *       the use of "\" as a popular hierarchy delimiter.
803  *
804  *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
805  *       the use of "~" in some servers as a home directory indicator.
806  *
807  *    5) UTF-7 permits multiple alternate forms to represent the same
808  *       string; in particular, printable US-ASCII chararacters can be
809  *       represented in encoded form.
810  *
811  * In modified UTF-7, printable US-ASCII characters except for "&"
812  * represent themselves; that is, characters with octet values 0x20-0x25
813  * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
814  * octet sequence "&-".
815  *
816  * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
817  * Unicode 16-bit octets) are represented in modified BASE64, with a
818  * further modification from [UTF-7] that "," is used instead of "/".
819  * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
820  * character which can represent itself.
821  *
822  * "&" is used to shift to modified BASE64 and "-" to shift back to US-
823  * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
824  * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
825  * ").
826  *
827  * For example, here is a mailbox name which mixes English, Japanese,
828  * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
829  */
830 
/*
 * Tests for US-ASCII characters belonging to character classes.
 *
 * For reference, standard UTF-7 (RFC 2152) defines the following classes:
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 *
 * The IMAP mailbox name variant implemented below does not use these sets:
 * every byte value 0x20..0x7e (including BACKSLASH and TILDE) is legal and
 * is encoded directly, except for "&" (0x26), which starts a base64
 * sequence. Byte values below 0x20 (including CR LF TAB) and above 0x7e
 * are not legal bytes in this encoding.
 */
853 
/*
 * Character constants and classification macros for the IMAP variant.
 *
 * All function-like macros below may evaluate their argument more than once;
 * pass only expressions without side effects.
 */

/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized so that an expression argument such as
 * "x ? a : b" is compared as a whole)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* modified base64: value 63 maps to ',' instead of '/'; '/' is not a base64 digit */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
867 
868 /*
869  * converter status values:
870  *
871  * toUnicodeStatus:
872  *     24 inDirectMode (boolean)
873  * 23..16 base64Counter (-1..7)
874  * 15..0  bits (up to 14 bits incoming base64)
875  *
876  * fromUnicodeStatus:
877  *     24 inDirectMode (boolean)
878  * 23..16 base64Counter (0..2)
879  *  7..0  bits (6 bits outgoing base64)
880  *
881  * ignore bits 31..25
882  */
883 
884 static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)885 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
886                           UErrorCode *pErrorCode) {
887     UConverter *cnv;
888     const uint8_t *source, *sourceLimit;
889     UChar *target;
890     const UChar *targetLimit;
891     int32_t *offsets;
892 
893     uint8_t *bytes;
894     uint8_t byteIndex;
895 
896     int32_t length, targetCapacity;
897 
898     /* UTF-7 state */
899     uint16_t bits;
900     int8_t base64Counter;
901     UBool inDirectMode;
902 
903     int8_t base64Value;
904 
905     int32_t sourceIndex, nextSourceIndex;
906 
907     UChar c;
908     uint8_t b;
909 
910     /* set up the local pointers */
911     cnv=pArgs->converter;
912 
913     source=(const uint8_t *)pArgs->source;
914     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
915     target=pArgs->target;
916     targetLimit=pArgs->targetLimit;
917     offsets=pArgs->offsets;
918     /* get the state machine state */
919     {
920         uint32_t status=cnv->toUnicodeStatus;
921         inDirectMode=(UBool)((status>>24)&1);
922         base64Counter=(int8_t)(status>>16);
923         bits=(uint16_t)status;
924     }
925     bytes=cnv->toUBytes;
926     byteIndex=cnv->toULength;
927 
928     /* sourceIndex=-1 if the current character began in the previous buffer */
929     sourceIndex=byteIndex==0 ? 0 : -1;
930     nextSourceIndex=0;
931 
932     if(inDirectMode) {
933 directMode:
934         /*
935          * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
936          * with their US-ASCII byte values.
937          * An ampersand starts Unicode (or "escape") Mode.
938          *
939          * In Direct Mode, only the sourceIndex is used.
940          */
941         byteIndex=0;
942         length=(int32_t)(sourceLimit-source);
943         targetCapacity=(int32_t)(targetLimit-target);
944         if(length>targetCapacity) {
945             length=targetCapacity;
946         }
947         while(length>0) {
948             b=*source++;
949             if(!isLegalIMAP(b)) {
950                 /* illegal */
951                 bytes[0]=b;
952                 byteIndex=1;
953                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
954                 break;
955             } else if(b!=AMPERSAND) {
956                 /* write directly encoded character */
957                 *target++=b;
958                 if(offsets!=NULL) {
959                     *offsets++=sourceIndex++;
960                 }
961             } else /* AMPERSAND */ {
962                 /* switch to Unicode mode */
963                 nextSourceIndex=++sourceIndex;
964                 inDirectMode=FALSE;
965                 byteIndex=0;
966                 bits=0;
967                 base64Counter=-1;
968                 goto unicodeMode;
969             }
970             --length;
971         }
972         if(source<sourceLimit && target>=targetLimit) {
973             /* target is full */
974             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
975         }
976     } else {
977 unicodeMode:
978         /*
979          * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
980          * The base64 sequence ends with any character that is not in the base64 alphabet.
981          * A terminating minus sign is consumed.
982          * US-ASCII must not be base64-ed.
983          *
984          * In Unicode Mode, the sourceIndex has the index to the start of the current
985          * base64 bytes, while nextSourceIndex is precisely parallel to source,
986          * keeping the index to the following byte.
987          * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
988          */
989         while(source<sourceLimit) {
990             if(target<targetLimit) {
991                 bytes[byteIndex++]=b=*source++;
992                 ++nextSourceIndex;
993                 if(b>0x7e) {
994                     /* illegal - test other illegal US-ASCII values by base64Value==-3 */
995                     inDirectMode=TRUE;
996                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
997                     break;
998                 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
999                     /* collect base64 bytes into UChars */
1000                     switch(base64Counter) {
1001                     case -1: /* -1 is immediately after the & */
1002                     case 0:
1003                         bits=base64Value;
1004                         base64Counter=1;
1005                         break;
1006                     case 1:
1007                     case 3:
1008                     case 4:
1009                     case 6:
1010                         bits=(uint16_t)((bits<<6)|base64Value);
1011                         ++base64Counter;
1012                         break;
1013                     case 2:
1014                         c=(UChar)((bits<<4)|(base64Value>>2));
1015                         if(isLegalIMAP(c)) {
1016                             /* illegal */
1017                             inDirectMode=TRUE;
1018                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1019                             goto endloop;
1020                         }
1021                         *target++=c;
1022                         if(offsets!=NULL) {
1023                             *offsets++=sourceIndex;
1024                             sourceIndex=nextSourceIndex-1;
1025                         }
1026                         bytes[0]=b; /* keep this byte in case an error occurs */
1027                         byteIndex=1;
1028                         bits=(uint16_t)(base64Value&3);
1029                         base64Counter=3;
1030                         break;
1031                     case 5:
1032                         c=(UChar)((bits<<2)|(base64Value>>4));
1033                         if(isLegalIMAP(c)) {
1034                             /* illegal */
1035                             inDirectMode=TRUE;
1036                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1037                             goto endloop;
1038                         }
1039                         *target++=c;
1040                         if(offsets!=NULL) {
1041                             *offsets++=sourceIndex;
1042                             sourceIndex=nextSourceIndex-1;
1043                         }
1044                         bytes[0]=b; /* keep this byte in case an error occurs */
1045                         byteIndex=1;
1046                         bits=(uint16_t)(base64Value&15);
1047                         base64Counter=6;
1048                         break;
1049                     case 7:
1050                         c=(UChar)((bits<<6)|base64Value);
1051                         if(isLegalIMAP(c)) {
1052                             /* illegal */
1053                             inDirectMode=TRUE;
1054                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1055                             goto endloop;
1056                         }
1057                         *target++=c;
1058                         if(offsets!=NULL) {
1059                             *offsets++=sourceIndex;
1060                             sourceIndex=nextSourceIndex;
1061                         }
1062                         byteIndex=0;
1063                         bits=0;
1064                         base64Counter=0;
1065                         break;
1066                     default:
1067                         /* will never occur */
1068                         break;
1069                     }
1070                 } else if(base64Value==-2) {
1071                     /* minus sign terminates the base64 sequence */
1072                     inDirectMode=TRUE;
1073                     if(base64Counter==-1) {
1074                         /* &- i.e. a minus immediately following an ampersand */
1075                         *target++=AMPERSAND;
1076                         if(offsets!=NULL) {
1077                             *offsets++=sourceIndex-1;
1078                         }
1079                     } else {
1080                         /* absorb the minus and leave the Unicode Mode */
1081                         if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1082                             /* bits are illegally left over, a UChar is incomplete */
1083                             /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1084                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1085                             break;
1086                         }
1087                     }
1088                     sourceIndex=nextSourceIndex;
1089                     goto directMode;
1090                 } else {
1091                     if(base64Counter==-1) {
1092                         /* illegal: & immediately followed by something other than base64 or minus sign */
1093                         /* include the ampersand in the reported sequence */
1094                         --sourceIndex;
1095                         bytes[0]=AMPERSAND;
1096                         bytes[1]=b;
1097                         byteIndex=2;
1098                     }
1099                     /* base64Value==-1 for characters that are illegal only in Unicode mode */
1100                     /* base64Value==-3 for illegal characters */
1101                     /* illegal */
1102                     inDirectMode=TRUE;
1103                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1104                     break;
1105                 }
1106             } else {
1107                 /* target is full */
1108                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1109                 break;
1110             }
1111         }
1112     }
1113 endloop:
1114 
1115     /*
1116      * the end of the input stream and detection of truncated input
1117      * are handled by the framework, but here we must check if we are in Unicode
1118      * mode and byteIndex==0 because we must end in direct mode
1119      *
1120      * conditions:
1121      *   successful
1122      *   in Unicode mode and byteIndex==0
1123      *   end of input and no truncated input
1124      */
1125     if( U_SUCCESS(*pErrorCode) &&
1126         !inDirectMode && byteIndex==0 &&
1127         pArgs->flush && source>=sourceLimit
1128     ) {
1129         if(base64Counter==-1) {
1130             /* & at the very end of the input */
1131             /* make the ampersand the reported sequence */
1132             bytes[0]=AMPERSAND;
1133             byteIndex=1;
1134         }
1135         /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1136 
1137         inDirectMode=TRUE; /* avoid looping */
1138         *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1139     }
1140 
1141     /* set the converter state back into UConverter */
1142     cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1143     cnv->toULength=byteIndex;
1144 
1145     /* write back the updated pointers */
1146     pArgs->source=(const char *)source;
1147     pArgs->target=target;
1148     pArgs->offsets=offsets;
1149     return;
1150 }
1151 
1152 static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1153 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1154                             UErrorCode *pErrorCode) {
1155     UConverter *cnv;
1156     const UChar *source, *sourceLimit;
1157     uint8_t *target, *targetLimit;
1158     int32_t *offsets;
1159 
1160     int32_t length, targetCapacity, sourceIndex;
1161     UChar c;
1162     uint8_t b;
1163 
1164     /* UTF-7 state */
1165     uint8_t bits;
1166     int8_t base64Counter;
1167     UBool inDirectMode;
1168 
1169     /* set up the local pointers */
1170     cnv=pArgs->converter;
1171 
1172     /* set up the local pointers */
1173     source=pArgs->source;
1174     sourceLimit=pArgs->sourceLimit;
1175     target=(uint8_t *)pArgs->target;
1176     targetLimit=(uint8_t *)pArgs->targetLimit;
1177     offsets=pArgs->offsets;
1178 
1179     /* get the state machine state */
1180     {
1181         uint32_t status=cnv->fromUnicodeStatus;
1182         inDirectMode=(UBool)((status>>24)&1);
1183         base64Counter=(int8_t)(status>>16);
1184         bits=(uint8_t)status;
1185     }
1186 
1187     /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1188     sourceIndex=0;
1189 
1190     if(inDirectMode) {
1191 directMode:
1192         length=(int32_t)(sourceLimit-source);
1193         targetCapacity=(int32_t)(targetLimit-target);
1194         if(length>targetCapacity) {
1195             length=targetCapacity;
1196         }
1197         while(length>0) {
1198             c=*source++;
1199             /* encode 0x20..0x7e except '&' directly */
1200             if(inSetDIMAP(c)) {
1201                 /* encode directly */
1202                 *target++=(uint8_t)c;
1203                 if(offsets!=NULL) {
1204                     *offsets++=sourceIndex++;
1205                 }
1206             } else if(c==AMPERSAND) {
1207                 /* output &- for & */
1208                 *target++=AMPERSAND;
1209                 if(target<targetLimit) {
1210                     *target++=MINUS;
1211                     if(offsets!=NULL) {
1212                         *offsets++=sourceIndex;
1213                         *offsets++=sourceIndex++;
1214                     }
1215                     /* realign length and targetCapacity */
1216                     goto directMode;
1217                 } else {
1218                     if(offsets!=NULL) {
1219                         *offsets++=sourceIndex++;
1220                     }
1221                     cnv->charErrorBuffer[0]=MINUS;
1222                     cnv->charErrorBufferLength=1;
1223                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1224                     break;
1225                 }
1226             } else {
1227                 /* un-read this character and switch to Unicode Mode */
1228                 --source;
1229                 *target++=AMPERSAND;
1230                 if(offsets!=NULL) {
1231                     *offsets++=sourceIndex;
1232                 }
1233                 inDirectMode=FALSE;
1234                 base64Counter=0;
1235                 goto unicodeMode;
1236             }
1237             --length;
1238         }
1239         if(source<sourceLimit && target>=targetLimit) {
1240             /* target is full */
1241             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1242         }
1243     } else {
1244 unicodeMode:
1245         while(source<sourceLimit) {
1246             if(target<targetLimit) {
1247                 c=*source++;
1248                 if(isLegalIMAP(c)) {
1249                     /* encode directly */
1250                     inDirectMode=TRUE;
1251 
1252                     /* trick: back out this character to make this easier */
1253                     --source;
1254 
1255                     /* terminate the base64 sequence */
1256                     if(base64Counter!=0) {
1257                         /* write remaining bits for the previous character */
1258                         *target++=TO_BASE64_IMAP(bits);
1259                         if(offsets!=NULL) {
1260                             *offsets++=sourceIndex-1;
1261                         }
1262                     }
1263                     /* need to terminate with a minus */
1264                     if(target<targetLimit) {
1265                         *target++=MINUS;
1266                         if(offsets!=NULL) {
1267                             *offsets++=sourceIndex-1;
1268                         }
1269                     } else {
1270                         cnv->charErrorBuffer[0]=MINUS;
1271                         cnv->charErrorBufferLength=1;
1272                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1273                         break;
1274                     }
1275                     goto directMode;
1276                 } else {
1277                     /*
1278                      * base64 this character:
1279                      * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1280                      * and the bits of this character, each implicitly in UTF-16BE.
1281                      *
1282                      * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1283                      * character to the next. The actual 2 or 4 bits are shifted to the left edge
1284                      * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1285                      */
1286                     switch(base64Counter) {
1287                     case 0:
1288                         b=(uint8_t)(c>>10);
1289                         *target++=TO_BASE64_IMAP(b);
1290                         if(target<targetLimit) {
1291                             b=(uint8_t)((c>>4)&0x3f);
1292                             *target++=TO_BASE64_IMAP(b);
1293                             if(offsets!=NULL) {
1294                                 *offsets++=sourceIndex;
1295                                 *offsets++=sourceIndex++;
1296                             }
1297                         } else {
1298                             if(offsets!=NULL) {
1299                                 *offsets++=sourceIndex++;
1300                             }
1301                             b=(uint8_t)((c>>4)&0x3f);
1302                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1303                             cnv->charErrorBufferLength=1;
1304                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1305                         }
1306                         bits=(uint8_t)((c&15)<<2);
1307                         base64Counter=1;
1308                         break;
1309                     case 1:
1310                         b=(uint8_t)(bits|(c>>14));
1311                         *target++=TO_BASE64_IMAP(b);
1312                         if(target<targetLimit) {
1313                             b=(uint8_t)((c>>8)&0x3f);
1314                             *target++=TO_BASE64_IMAP(b);
1315                             if(target<targetLimit) {
1316                                 b=(uint8_t)((c>>2)&0x3f);
1317                                 *target++=TO_BASE64_IMAP(b);
1318                                 if(offsets!=NULL) {
1319                                     *offsets++=sourceIndex;
1320                                     *offsets++=sourceIndex;
1321                                     *offsets++=sourceIndex++;
1322                                 }
1323                             } else {
1324                                 if(offsets!=NULL) {
1325                                     *offsets++=sourceIndex;
1326                                     *offsets++=sourceIndex++;
1327                                 }
1328                                 b=(uint8_t)((c>>2)&0x3f);
1329                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1330                                 cnv->charErrorBufferLength=1;
1331                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1332                             }
1333                         } else {
1334                             if(offsets!=NULL) {
1335                                 *offsets++=sourceIndex++;
1336                             }
1337                             b=(uint8_t)((c>>8)&0x3f);
1338                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1339                             b=(uint8_t)((c>>2)&0x3f);
1340                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1341                             cnv->charErrorBufferLength=2;
1342                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1343                         }
1344                         bits=(uint8_t)((c&3)<<4);
1345                         base64Counter=2;
1346                         break;
1347                     case 2:
1348                         b=(uint8_t)(bits|(c>>12));
1349                         *target++=TO_BASE64_IMAP(b);
1350                         if(target<targetLimit) {
1351                             b=(uint8_t)((c>>6)&0x3f);
1352                             *target++=TO_BASE64_IMAP(b);
1353                             if(target<targetLimit) {
1354                                 b=(uint8_t)(c&0x3f);
1355                                 *target++=TO_BASE64_IMAP(b);
1356                                 if(offsets!=NULL) {
1357                                     *offsets++=sourceIndex;
1358                                     *offsets++=sourceIndex;
1359                                     *offsets++=sourceIndex++;
1360                                 }
1361                             } else {
1362                                 if(offsets!=NULL) {
1363                                     *offsets++=sourceIndex;
1364                                     *offsets++=sourceIndex++;
1365                                 }
1366                                 b=(uint8_t)(c&0x3f);
1367                                 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1368                                 cnv->charErrorBufferLength=1;
1369                                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1370                             }
1371                         } else {
1372                             if(offsets!=NULL) {
1373                                 *offsets++=sourceIndex++;
1374                             }
1375                             b=(uint8_t)((c>>6)&0x3f);
1376                             cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1377                             b=(uint8_t)(c&0x3f);
1378                             cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1379                             cnv->charErrorBufferLength=2;
1380                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1381                         }
1382                         bits=0;
1383                         base64Counter=0;
1384                         break;
1385                     default:
1386                         /* will never occur */
1387                         break;
1388                     }
1389                 }
1390             } else {
1391                 /* target is full */
1392                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1393                 break;
1394             }
1395         }
1396     }
1397 
1398     if(pArgs->flush && source>=sourceLimit) {
1399         /* flush remaining bits to the target */
1400         if(!inDirectMode) {
1401             if(base64Counter!=0) {
1402                 if(target<targetLimit) {
1403                     *target++=TO_BASE64_IMAP(bits);
1404                     if(offsets!=NULL) {
1405                         *offsets++=sourceIndex-1;
1406                     }
1407                 } else {
1408                     cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1409                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1410                 }
1411             }
1412             /* need to terminate with a minus */
1413             if(target<targetLimit) {
1414                 *target++=MINUS;
1415                 if(offsets!=NULL) {
1416                     *offsets++=sourceIndex-1;
1417                 }
1418             } else {
1419                 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1420                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1421             }
1422         }
1423         /* reset the state for the next conversion */
1424         cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1425     } else {
1426         /* set the converter state back into UConverter */
1427         cnv->fromUnicodeStatus=
1428             (cnv->fromUnicodeStatus&0xf0000000)|    /* keep version*/
1429             ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1430     }
1431 
1432     /* write back the updated pointers */
1433     pArgs->source=source;
1434     pArgs->target=(char *)target;
1435     pArgs->offsets=offsets;
1436     return;
1437 }
1438 
1439 static const UConverterImpl _IMAPImpl={
1440     UCNV_IMAP_MAILBOX,
1441 
1442     NULL,
1443     NULL,
1444 
1445     _UTF7Open,
1446     NULL,
1447     _UTF7Reset,
1448 
1449     _IMAPToUnicodeWithOffsets,
1450     _IMAPToUnicodeWithOffsets,
1451     _IMAPFromUnicodeWithOffsets,
1452     _IMAPFromUnicodeWithOffsets,
1453     NULL,
1454 
1455     NULL,
1456     NULL,
1457     NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1458     NULL,
1459     ucnv_getCompleteUnicodeSet
1460 };
1461 
1462 static const UConverterStaticData _IMAPStaticData={
1463     sizeof(UConverterStaticData),
1464     "IMAP-mailbox-name",
1465     0, /* TODO CCSID for IMAP-mailbox-name */
1466     UCNV_IBM, UCNV_IMAP_MAILBOX,
1467     1, 4,
1468     { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1469     FALSE, FALSE,
1470     0,
1471     0,
1472     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1473 };
1474 
1475 const UConverterSharedData _IMAPData=
1476         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1477 
1478 #endif
1479