1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnv_u7.c
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2002jul01
12 * created by: Markus W. Scherer
13 *
14 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
20
21 #include "unicode/ucnv.h"
22 #include "ucnv_bld.h"
23 #include "ucnv_cnv.h"
24 #include "uassert.h"
25
26 /* UTF-7 -------------------------------------------------------------------- */
27
/*
 * UTF-7 is a stateful encoding of Unicode.
 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
 * It was intended for use in Internet email systems, using in its bytewise
 * encoding only a subset of 7-bit US-ASCII.
 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
 * occasionally used.
 *
 * For converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters
 * in set O as defined in the RFC (see below) may be encoded directly but are
 * not allowed in, e.g., email headers.
 * By default, the ICU UTF-7 converter encodes set O directly.
 * By choosing the option "version=1", set O will be escaped instead.
 * For example:
 *     utf7Converter=ucnv_open("UTF-7,version=1");
 *
 * For details about email headers see RFC 2047.
 */
47
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
/*
 * Nonzero if c is one of the RFC 2152 set D characters:
 * letters, digits, and the nine specials '(),-./:?
 * Each range test subtracts the first code of the range and compares the
 * uint8_t-truncated difference against the range length.
 * Note: the argument is evaluated more than once.
 */
#define inSetD(c) ( \
    (uint8_t)((c)-0x61)<26 ||           /* a-z */ \
    (uint8_t)((c)-0x41)<26 ||           /* A-Z */ \
    (uint8_t)((c)-0x30)<10 ||           /* 0-9 */ \
    (uint8_t)((c)-0x27)<3 ||            /* '() */ \
    (uint8_t)((c)-0x2c)<4 ||            /* ,-./ */ \
    (c)==0x3a || (c)==0x3f              /* :? */ \
)
77
/*
 * Nonzero if c is one of the RFC 2152 set O characters:
 *     !"#$%&*;<=>@[]^_`{|}
 * Note: the argument is evaluated more than once.
 */
#define inSetO(c) ( \
    (uint8_t)((c)-0x21)<6 ||            /* !"#$%& */ \
    (uint8_t)((c)-0x3b)<4 ||            /* ;<=> */ \
    (uint8_t)((c)-0x5d)<4 ||            /* ]^_` */ \
    (uint8_t)((c)-0x7b)<3 ||            /* {|} */ \
    (c)==0x2a || (c)==0x40 || (c)==0x5b /* *@[ */ \
)
85
/* the three C0 controls that UTF-7 accepts: CR, LF, TAB */
#define isCRLFTAB(c) ((c)==0x0d || (c)==0x0a || (c)==0x09)
/* same as isCRLFTAB, plus SPACE */
#define isCRLFSPTAB(c) ((c)==0x20 || isCRLFTAB(c))

/* US-ASCII code points with a special role in UTF-7 */
#define PLUS 0x2b
#define MINUS 0x2d
#define BACKSLASH 0x5c
#define TILDE 0x7e

/*
 * legal byte values: CR LF TAB, and the US-ASCII range from space up to
 * (but not including) tilde, with backslash excluded as well
 */
#define isLegalUTF7(c) (isCRLFTAB(c) || ((c)!=BACKSLASH && (uint8_t)((c)-0x20)<(TILDE-0x20)))
96
97 /* encode directly sets D and O and CR LF SP TAB */
98 static const UBool encodeDirectlyMaximum[128]={
99 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
104 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
105
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
108
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
111 };
112
113 /* encode directly set D and CR LF SP TAB but not set O */
114 static const UBool encodeDirectlyRestricted[128]={
115 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118
119 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
120 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
121
122 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
124
125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
127 };
128
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit.
 * Byte values are spelled numerically rather than as character literals.
 */
static const uint8_t toBase64[64]={
    /* 0..25: A-Z */
    0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d,
    0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
    /* 26..51: a-z */
    0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
    0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
    /* 52..61: 0-9 */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
    /* 62, 63: + and / */
    0x2b, 0x2f
};
142
/*
 * Classifies each US-ASCII byte 0..127 for base64 decoding:
 *   0..63  the 6-bit value of a base64 digit
 *   -1     legal byte that is not a base64 digit
 *   -2     the minus sign
 *   -3     illegal byte
 */
static const int8_t fromBase64[128]={
    /* 0x00..0x1f: C0 controls; TAB, LF, CR are -1, all others -3 */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* 0x20..0x2f: punctuation; + is 62, - is -2, / is 63 */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* 0x30..0x3f: digits 0-9 are 52..61 */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* 0x40..0x5f: A-Z are 0..25; backslash (5c) is -3 */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* 0x60..0x7f: a-z are 26..51; tilde (7e) and DEL (7f) are -3 */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
162
/*
 * converter status values:
 *
 * toUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (-1..7)
 * 15..0  bits (up to 14 bits incoming base64)
 *
 * fromUnicodeStatus:
 * 31..28 version (0: set O direct  1: set O escaped)
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (0..2)
 *  7..0  bits (6 bits outgoing base64)
 *
 */
178
179 static void
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)180 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
181 if(choice<=UCNV_RESET_TO_UNICODE) {
182 /* reset toUnicode */
183 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
184 cnv->toULength=0;
185 }
186 if(choice!=UCNV_RESET_TO_UNICODE) {
187 /* reset fromUnicode */
188 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
189 }
190 }
191
192 static void
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)193 _UTF7Open(UConverter *cnv,
194 UConverterLoadArgs *pArgs,
195 UErrorCode *pErrorCode) {
196 if(UCNV_GET_VERSION(cnv)<=1) {
197 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
198 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
199 _UTF7Reset(cnv, UCNV_RESET_BOTH);
200 } else {
201 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
202 }
203 }
204
205 static void
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)206 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
207 UErrorCode *pErrorCode) {
208 UConverter *cnv;
209 const uint8_t *source, *sourceLimit;
210 UChar *target;
211 const UChar *targetLimit;
212 int32_t *offsets;
213
214 uint8_t *bytes;
215 uint8_t byteIndex;
216
217 int32_t length, targetCapacity;
218
219 /* UTF-7 state */
220 uint16_t bits;
221 int8_t base64Counter;
222 UBool inDirectMode;
223
224 int8_t base64Value;
225
226 int32_t sourceIndex, nextSourceIndex;
227
228 uint8_t b;
229 /* set up the local pointers */
230 cnv=pArgs->converter;
231
232 source=(const uint8_t *)pArgs->source;
233 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
234 target=pArgs->target;
235 targetLimit=pArgs->targetLimit;
236 offsets=pArgs->offsets;
237 /* get the state machine state */
238 {
239 uint32_t status=cnv->toUnicodeStatus;
240 inDirectMode=(UBool)((status>>24)&1);
241 base64Counter=(int8_t)(status>>16);
242 bits=(uint16_t)status;
243 }
244 bytes=cnv->toUBytes;
245 byteIndex=cnv->toULength;
246
247 /* sourceIndex=-1 if the current character began in the previous buffer */
248 sourceIndex=byteIndex==0 ? 0 : -1;
249 nextSourceIndex=0;
250
251 if(inDirectMode) {
252 directMode:
253 /*
254 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
255 * with their US-ASCII byte values.
256 * Backslash and Tilde and most control characters are not allowed in UTF-7.
257 * A plus sign starts Unicode (or "escape") Mode.
258 *
259 * In Direct Mode, only the sourceIndex is used.
260 */
261 byteIndex=0;
262 length=(int32_t)(sourceLimit-source);
263 targetCapacity=(int32_t)(targetLimit-target);
264 if(length>targetCapacity) {
265 length=targetCapacity;
266 }
267 while(length>0) {
268 b=*source++;
269 if(!isLegalUTF7(b)) {
270 /* illegal */
271 bytes[0]=b;
272 byteIndex=1;
273 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
274 break;
275 } else if(b!=PLUS) {
276 /* write directly encoded character */
277 *target++=b;
278 if(offsets!=NULL) {
279 *offsets++=sourceIndex++;
280 }
281 } else /* PLUS */ {
282 /* switch to Unicode mode */
283 nextSourceIndex=++sourceIndex;
284 inDirectMode=FALSE;
285 byteIndex=0;
286 bits=0;
287 base64Counter=-1;
288 goto unicodeMode;
289 }
290 --length;
291 }
292 if(source<sourceLimit && target>=targetLimit) {
293 /* target is full */
294 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
295 }
296 } else {
297 unicodeMode:
298 /*
299 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
300 * The base64 sequence ends with any character that is not in the base64 alphabet.
301 * A terminating minus sign is consumed.
302 *
303 * In Unicode Mode, the sourceIndex has the index to the start of the current
304 * base64 bytes, while nextSourceIndex is precisely parallel to source,
305 * keeping the index to the following byte.
306 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
307 */
308 while(source<sourceLimit) {
309 if(target<targetLimit) {
310 bytes[byteIndex++]=b=*source++;
311 ++nextSourceIndex;
312 base64Value = -3; /* initialize as illegal */
313 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
314 /* either
315 * base64Value==-1 for any legal character except base64 and minus sign, or
316 * base64Value==-3 for illegal characters:
317 * 1. In either case, leave Unicode mode.
318 * 2.1. If we ended with an incomplete UChar or none after the +, then
319 * generate an error for the preceding erroneous sequence and deal with
320 * the current (possibly illegal) character next time through.
321 * 2.2. Else the current char comes after a complete UChar, which was already
322 * pushed to the output buf, so:
323 * 2.2.1. If the current char is legal, just save it for processing next time.
324 * It may be for example, a plus which we need to deal with in direct mode.
325 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
326 */
327 inDirectMode=TRUE;
328 if(base64Counter==-1) {
329 /* illegal: + immediately followed by something other than base64 or minus sign */
330 /* include the plus sign in the reported sequence, but not the subsequent char */
331 --source;
332 bytes[0]=PLUS;
333 byteIndex=1;
334 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
335 break;
336 } else if(bits!=0) {
337 /* bits are illegally left over, a UChar is incomplete */
338 /* don't include current char (legal or illegal) in error seq */
339 --source;
340 --byteIndex;
341 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
342 break;
343 } else {
344 /* previous UChar was complete */
345 if(base64Value==-3) {
346 /* current character is illegal, deal with it here */
347 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
348 break;
349 } else {
350 /* un-read the current character in case it is a plus sign */
351 --source;
352 sourceIndex=nextSourceIndex-1;
353 goto directMode;
354 }
355 }
356 } else if(base64Value>=0) {
357 /* collect base64 bytes into UChars */
358 switch(base64Counter) {
359 case -1: /* -1 is immediately after the + */
360 case 0:
361 bits=base64Value;
362 base64Counter=1;
363 break;
364 case 1:
365 case 3:
366 case 4:
367 case 6:
368 bits=(uint16_t)((bits<<6)|base64Value);
369 ++base64Counter;
370 break;
371 case 2:
372 *target++=(UChar)((bits<<4)|(base64Value>>2));
373 if(offsets!=NULL) {
374 *offsets++=sourceIndex;
375 sourceIndex=nextSourceIndex-1;
376 }
377 bytes[0]=b; /* keep this byte in case an error occurs */
378 byteIndex=1;
379 bits=(uint16_t)(base64Value&3);
380 base64Counter=3;
381 break;
382 case 5:
383 *target++=(UChar)((bits<<2)|(base64Value>>4));
384 if(offsets!=NULL) {
385 *offsets++=sourceIndex;
386 sourceIndex=nextSourceIndex-1;
387 }
388 bytes[0]=b; /* keep this byte in case an error occurs */
389 byteIndex=1;
390 bits=(uint16_t)(base64Value&15);
391 base64Counter=6;
392 break;
393 case 7:
394 *target++=(UChar)((bits<<6)|base64Value);
395 if(offsets!=NULL) {
396 *offsets++=sourceIndex;
397 sourceIndex=nextSourceIndex;
398 }
399 byteIndex=0;
400 bits=0;
401 base64Counter=0;
402 break;
403 default:
404 /* will never occur */
405 break;
406 }
407 } else /*base64Value==-2*/ {
408 /* minus sign terminates the base64 sequence */
409 inDirectMode=TRUE;
410 if(base64Counter==-1) {
411 /* +- i.e. a minus immediately following a plus */
412 *target++=PLUS;
413 if(offsets!=NULL) {
414 *offsets++=sourceIndex-1;
415 }
416 } else {
417 /* absorb the minus and leave the Unicode Mode */
418 if(bits!=0) {
419 /* bits are illegally left over, a UChar is incomplete */
420 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
421 break;
422 }
423 }
424 sourceIndex=nextSourceIndex;
425 goto directMode;
426 }
427 } else {
428 /* target is full */
429 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
430 break;
431 }
432 }
433 }
434
435 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
436 /*
437 * if we are in Unicode mode, then the byteIndex might not be 0,
438 * but that is ok if bits==0
439 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
440 * (not true for IMAP-mailbox-name where we must end in direct mode)
441 */
442 byteIndex=0;
443 }
444
445 /* set the converter state back into UConverter */
446 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
447 cnv->toULength=byteIndex;
448
449 /* write back the updated pointers */
450 pArgs->source=(const char *)source;
451 pArgs->target=target;
452 pArgs->offsets=offsets;
453 return;
454 }
455
456 static void
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)457 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
458 UErrorCode *pErrorCode) {
459 UConverter *cnv;
460 const UChar *source, *sourceLimit;
461 uint8_t *target, *targetLimit;
462 int32_t *offsets;
463
464 int32_t length, targetCapacity, sourceIndex;
465 UChar c;
466
467 /* UTF-7 state */
468 const UBool *encodeDirectly;
469 uint8_t bits;
470 int8_t base64Counter;
471 UBool inDirectMode;
472
473 /* set up the local pointers */
474 cnv=pArgs->converter;
475
476 /* set up the local pointers */
477 source=pArgs->source;
478 sourceLimit=pArgs->sourceLimit;
479 target=(uint8_t *)pArgs->target;
480 targetLimit=(uint8_t *)pArgs->targetLimit;
481 offsets=pArgs->offsets;
482
483 /* get the state machine state */
484 {
485 uint32_t status=cnv->fromUnicodeStatus;
486 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
487 inDirectMode=(UBool)((status>>24)&1);
488 base64Counter=(int8_t)(status>>16);
489 bits=(uint8_t)status;
490 U_ASSERT(bits<=sizeof(toBase64)/sizeof(toBase64[0]));
491 }
492
493 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
494 sourceIndex=0;
495
496 if(inDirectMode) {
497 directMode:
498 length=(int32_t)(sourceLimit-source);
499 targetCapacity=(int32_t)(targetLimit-target);
500 if(length>targetCapacity) {
501 length=targetCapacity;
502 }
503 while(length>0) {
504 c=*source++;
505 /* currently always encode CR LF SP TAB directly */
506 if(c<=127 && encodeDirectly[c]) {
507 /* encode directly */
508 *target++=(uint8_t)c;
509 if(offsets!=NULL) {
510 *offsets++=sourceIndex++;
511 }
512 } else if(c==PLUS) {
513 /* output +- for + */
514 *target++=PLUS;
515 if(target<targetLimit) {
516 *target++=MINUS;
517 if(offsets!=NULL) {
518 *offsets++=sourceIndex;
519 *offsets++=sourceIndex++;
520 }
521 /* realign length and targetCapacity */
522 goto directMode;
523 } else {
524 if(offsets!=NULL) {
525 *offsets++=sourceIndex++;
526 }
527 cnv->charErrorBuffer[0]=MINUS;
528 cnv->charErrorBufferLength=1;
529 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
530 break;
531 }
532 } else {
533 /* un-read this character and switch to Unicode Mode */
534 --source;
535 *target++=PLUS;
536 if(offsets!=NULL) {
537 *offsets++=sourceIndex;
538 }
539 inDirectMode=FALSE;
540 base64Counter=0;
541 goto unicodeMode;
542 }
543 --length;
544 }
545 if(source<sourceLimit && target>=targetLimit) {
546 /* target is full */
547 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
548 }
549 } else {
550 unicodeMode:
551 while(source<sourceLimit) {
552 if(target<targetLimit) {
553 c=*source++;
554 if(c<=127 && encodeDirectly[c]) {
555 /* encode directly */
556 inDirectMode=TRUE;
557
558 /* trick: back out this character to make this easier */
559 --source;
560
561 /* terminate the base64 sequence */
562 if(base64Counter!=0) {
563 /* write remaining bits for the previous character */
564 *target++=toBase64[bits];
565 if(offsets!=NULL) {
566 *offsets++=sourceIndex-1;
567 }
568 }
569 if(fromBase64[c]!=-1) {
570 /* need to terminate with a minus */
571 if(target<targetLimit) {
572 *target++=MINUS;
573 if(offsets!=NULL) {
574 *offsets++=sourceIndex-1;
575 }
576 } else {
577 cnv->charErrorBuffer[0]=MINUS;
578 cnv->charErrorBufferLength=1;
579 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
580 break;
581 }
582 }
583 goto directMode;
584 } else {
585 /*
586 * base64 this character:
587 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
588 * and the bits of this character, each implicitly in UTF-16BE.
589 *
590 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
591 * character to the next. The actual 2 or 4 bits are shifted to the left edge
592 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
593 */
594 switch(base64Counter) {
595 case 0:
596 *target++=toBase64[c>>10];
597 if(target<targetLimit) {
598 *target++=toBase64[(c>>4)&0x3f];
599 if(offsets!=NULL) {
600 *offsets++=sourceIndex;
601 *offsets++=sourceIndex++;
602 }
603 } else {
604 if(offsets!=NULL) {
605 *offsets++=sourceIndex++;
606 }
607 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
608 cnv->charErrorBufferLength=1;
609 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
610 }
611 bits=(uint8_t)((c&15)<<2);
612 base64Counter=1;
613 break;
614 case 1:
615 *target++=toBase64[bits|(c>>14)];
616 if(target<targetLimit) {
617 *target++=toBase64[(c>>8)&0x3f];
618 if(target<targetLimit) {
619 *target++=toBase64[(c>>2)&0x3f];
620 if(offsets!=NULL) {
621 *offsets++=sourceIndex;
622 *offsets++=sourceIndex;
623 *offsets++=sourceIndex++;
624 }
625 } else {
626 if(offsets!=NULL) {
627 *offsets++=sourceIndex;
628 *offsets++=sourceIndex++;
629 }
630 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
631 cnv->charErrorBufferLength=1;
632 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
633 }
634 } else {
635 if(offsets!=NULL) {
636 *offsets++=sourceIndex++;
637 }
638 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
639 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
640 cnv->charErrorBufferLength=2;
641 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
642 }
643 bits=(uint8_t)((c&3)<<4);
644 base64Counter=2;
645 break;
646 case 2:
647 *target++=toBase64[bits|(c>>12)];
648 if(target<targetLimit) {
649 *target++=toBase64[(c>>6)&0x3f];
650 if(target<targetLimit) {
651 *target++=toBase64[c&0x3f];
652 if(offsets!=NULL) {
653 *offsets++=sourceIndex;
654 *offsets++=sourceIndex;
655 *offsets++=sourceIndex++;
656 }
657 } else {
658 if(offsets!=NULL) {
659 *offsets++=sourceIndex;
660 *offsets++=sourceIndex++;
661 }
662 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
663 cnv->charErrorBufferLength=1;
664 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
665 }
666 } else {
667 if(offsets!=NULL) {
668 *offsets++=sourceIndex++;
669 }
670 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
671 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
672 cnv->charErrorBufferLength=2;
673 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
674 }
675 bits=0;
676 base64Counter=0;
677 break;
678 default:
679 /* will never occur */
680 break;
681 }
682 }
683 } else {
684 /* target is full */
685 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
686 break;
687 }
688 }
689 }
690
691 if(pArgs->flush && source>=sourceLimit) {
692 /* flush remaining bits to the target */
693 if(!inDirectMode) {
694 if (base64Counter!=0) {
695 if(target<targetLimit) {
696 *target++=toBase64[bits];
697 if(offsets!=NULL) {
698 *offsets++=sourceIndex-1;
699 }
700 } else {
701 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
702 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
703 }
704 }
705 /* Add final MINUS to terminate unicodeMode */
706 if(target<targetLimit) {
707 *target++=MINUS;
708 if(offsets!=NULL) {
709 *offsets++=sourceIndex-1;
710 }
711 } else {
712 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
713 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
714 }
715 }
716 /* reset the state for the next conversion */
717 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
718 } else {
719 /* set the converter state back into UConverter */
720 cnv->fromUnicodeStatus=
721 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
722 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
723 }
724
725 /* write back the updated pointers */
726 pArgs->source=source;
727 pArgs->target=(char *)target;
728 pArgs->offsets=offsets;
729 return;
730 }
731
732 static const char *
_UTF7GetName(const UConverter * cnv)733 _UTF7GetName(const UConverter *cnv) {
734 switch(cnv->fromUnicodeStatus>>28) {
735 case 1:
736 return "UTF-7,version=1";
737 default:
738 return "UTF-7";
739 }
740 }
741
742 static const UConverterImpl _UTF7Impl={
743 UCNV_UTF7,
744
745 NULL,
746 NULL,
747
748 _UTF7Open,
749 NULL,
750 _UTF7Reset,
751
752 _UTF7ToUnicodeWithOffsets,
753 _UTF7ToUnicodeWithOffsets,
754 _UTF7FromUnicodeWithOffsets,
755 _UTF7FromUnicodeWithOffsets,
756 NULL,
757
758 NULL,
759 _UTF7GetName,
760 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
761 NULL,
762 ucnv_getCompleteUnicodeSet
763 };
764
765 static const UConverterStaticData _UTF7StaticData={
766 sizeof(UConverterStaticData),
767 "UTF-7",
768 0, /* TODO CCSID for UTF-7 */
769 UCNV_IBM, UCNV_UTF7,
770 1, 4,
771 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
772 FALSE, FALSE,
773 0,
774 0,
775 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
776 };
777
778 const UConverterSharedData _UTF7Data=
779 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
780
781 /* IMAP mailbox name encoding ----------------------------------------------- */
782
/*
 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
 * http://www.ietf.org/rfc/rfc2060.txt
 *
 * 5.1.3.  Mailbox International Naming Convention
 *
 * By convention, international mailbox names are specified using a
 * modified version of the UTF-7 encoding described in [UTF-7].  The
 * purpose of these modifications is to correct the following problems
 * with UTF-7:
 *
 *    1) UTF-7 uses the "+" character for shifting; this conflicts with
 *       the common use of "+" in mailbox names, in particular USENET
 *       newsgroup names.
 *
 *    2) UTF-7's encoding is BASE64 which uses the "/" character; this
 *       conflicts with the use of "/" as a popular hierarchy delimiter.
 *
 *    3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
 *       the use of "\" as a popular hierarchy delimiter.
 *
 *    4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
 *       the use of "~" in some servers as a home directory indicator.
 *
 *    5) UTF-7 permits multiple alternate forms to represent the same
 *       string; in particular, printable US-ASCII characters can be
 *       represented in encoded form.
 *
 * In modified UTF-7, printable US-ASCII characters except for "&"
 * represent themselves; that is, characters with octet values 0x20-0x25
 * and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
 * octet sequence "&-".
 *
 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
 * Unicode 16-bit octets) are represented in modified BASE64, with a
 * further modification from [UTF-7] that "," is used instead of "/".
 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
 * character which can represent itself.
 *
 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
 * ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
 * is, a name that ends with a Unicode 16-bit octet MUST end with a
 * "-").
 *
 * For example, here is a mailbox name which mixes English, Japanese,
 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
 */
830
/*
 * Tests for US-ASCII characters belonging to character classes
 * defined in UTF-7.
 *
 * Note: the description below is that of plain UTF-7 (RFC 2152), repeated
 * here for reference. The IMAP variant differs: isLegalIMAP() below accepts
 * every byte in 0x20..0x7e, which includes BACKSLASH and TILDE.
 *
 * Set D (directly encoded characters) consists of the following
 * characters: the upper and lower case letters A through Z
 * and a through z, the 10 digits 0-9, and the following nine special
 * characters (note that "+" and "=" are omitted):
 *     '(),-./:?
 *
 * Set O (optional direct characters) consists of the following
 * characters (note that "\" and "~" are omitted):
 *     !"#$%&*;<=>@[]^_`{|}
 *
 * According to the rules in RFC 2152, the byte values for the following
 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
 * - all C0 control codes except for CR LF TAB
 * - BACKSLASH
 * - TILDE
 * - DEL
 * - all codes beyond US-ASCII, i.e. all >127
 */
853
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
/* ',' takes the place of '/' as the base64 digit with value 63 */
#define COMMA 0x2c
#define SLASH 0x2f

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * (the argument is parenthesized so that expression arguments such as x|y
 * are compared as a whole)
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* 6-bit value -> base64 digit; values 63 and above yield COMMA instead of a toBase64[] lookup */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/*
 * byte -> fromBase64[] classification, except that COMMA maps to 63 and
 * SLASH to -1.
 * The fallback indexes fromBase64[] directly, so c must be a valid index
 * into that table.
 */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
867
/*
 * converter status values:
 *
 * toUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (-1..7)
 * 15..0  bits (up to 14 bits incoming base64)
 *
 * fromUnicodeStatus:
 *     24 inDirectMode (boolean)
 * 23..16 base64Counter (0..2)
 *  7..0  bits (6 bits outgoing base64)
 *
 * ignore bits 31..25
 */
883
884 static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)885 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
886 UErrorCode *pErrorCode) {
887 UConverter *cnv;
888 const uint8_t *source, *sourceLimit;
889 UChar *target;
890 const UChar *targetLimit;
891 int32_t *offsets;
892
893 uint8_t *bytes;
894 uint8_t byteIndex;
895
896 int32_t length, targetCapacity;
897
898 /* UTF-7 state */
899 uint16_t bits;
900 int8_t base64Counter;
901 UBool inDirectMode;
902
903 int8_t base64Value;
904
905 int32_t sourceIndex, nextSourceIndex;
906
907 UChar c;
908 uint8_t b;
909
910 /* set up the local pointers */
911 cnv=pArgs->converter;
912
913 source=(const uint8_t *)pArgs->source;
914 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
915 target=pArgs->target;
916 targetLimit=pArgs->targetLimit;
917 offsets=pArgs->offsets;
918 /* get the state machine state */
919 {
920 uint32_t status=cnv->toUnicodeStatus;
921 inDirectMode=(UBool)((status>>24)&1);
922 base64Counter=(int8_t)(status>>16);
923 bits=(uint16_t)status;
924 }
925 bytes=cnv->toUBytes;
926 byteIndex=cnv->toULength;
927
928 /* sourceIndex=-1 if the current character began in the previous buffer */
929 sourceIndex=byteIndex==0 ? 0 : -1;
930 nextSourceIndex=0;
931
932 if(inDirectMode) {
933 directMode:
934 /*
935 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
936 * with their US-ASCII byte values.
937 * An ampersand starts Unicode (or "escape") Mode.
938 *
939 * In Direct Mode, only the sourceIndex is used.
940 */
941 byteIndex=0;
942 length=(int32_t)(sourceLimit-source);
943 targetCapacity=(int32_t)(targetLimit-target);
944 if(length>targetCapacity) {
945 length=targetCapacity;
946 }
947 while(length>0) {
948 b=*source++;
949 if(!isLegalIMAP(b)) {
950 /* illegal */
951 bytes[0]=b;
952 byteIndex=1;
953 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
954 break;
955 } else if(b!=AMPERSAND) {
956 /* write directly encoded character */
957 *target++=b;
958 if(offsets!=NULL) {
959 *offsets++=sourceIndex++;
960 }
961 } else /* AMPERSAND */ {
962 /* switch to Unicode mode */
963 nextSourceIndex=++sourceIndex;
964 inDirectMode=FALSE;
965 byteIndex=0;
966 bits=0;
967 base64Counter=-1;
968 goto unicodeMode;
969 }
970 --length;
971 }
972 if(source<sourceLimit && target>=targetLimit) {
973 /* target is full */
974 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
975 }
976 } else {
977 unicodeMode:
978 /*
979 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
980 * The base64 sequence ends with any character that is not in the base64 alphabet.
981 * A terminating minus sign is consumed.
982 * US-ASCII must not be base64-ed.
983 *
984 * In Unicode Mode, the sourceIndex has the index to the start of the current
985 * base64 bytes, while nextSourceIndex is precisely parallel to source,
986 * keeping the index to the following byte.
987 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
988 */
989 while(source<sourceLimit) {
990 if(target<targetLimit) {
991 bytes[byteIndex++]=b=*source++;
992 ++nextSourceIndex;
993 if(b>0x7e) {
994 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
995 inDirectMode=TRUE;
996 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
997 break;
998 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
999 /* collect base64 bytes into UChars */
1000 switch(base64Counter) {
1001 case -1: /* -1 is immediately after the & */
1002 case 0:
1003 bits=base64Value;
1004 base64Counter=1;
1005 break;
1006 case 1:
1007 case 3:
1008 case 4:
1009 case 6:
1010 bits=(uint16_t)((bits<<6)|base64Value);
1011 ++base64Counter;
1012 break;
1013 case 2:
1014 c=(UChar)((bits<<4)|(base64Value>>2));
1015 if(isLegalIMAP(c)) {
1016 /* illegal */
1017 inDirectMode=TRUE;
1018 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1019 goto endloop;
1020 }
1021 *target++=c;
1022 if(offsets!=NULL) {
1023 *offsets++=sourceIndex;
1024 sourceIndex=nextSourceIndex-1;
1025 }
1026 bytes[0]=b; /* keep this byte in case an error occurs */
1027 byteIndex=1;
1028 bits=(uint16_t)(base64Value&3);
1029 base64Counter=3;
1030 break;
1031 case 5:
1032 c=(UChar)((bits<<2)|(base64Value>>4));
1033 if(isLegalIMAP(c)) {
1034 /* illegal */
1035 inDirectMode=TRUE;
1036 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1037 goto endloop;
1038 }
1039 *target++=c;
1040 if(offsets!=NULL) {
1041 *offsets++=sourceIndex;
1042 sourceIndex=nextSourceIndex-1;
1043 }
1044 bytes[0]=b; /* keep this byte in case an error occurs */
1045 byteIndex=1;
1046 bits=(uint16_t)(base64Value&15);
1047 base64Counter=6;
1048 break;
1049 case 7:
1050 c=(UChar)((bits<<6)|base64Value);
1051 if(isLegalIMAP(c)) {
1052 /* illegal */
1053 inDirectMode=TRUE;
1054 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1055 goto endloop;
1056 }
1057 *target++=c;
1058 if(offsets!=NULL) {
1059 *offsets++=sourceIndex;
1060 sourceIndex=nextSourceIndex;
1061 }
1062 byteIndex=0;
1063 bits=0;
1064 base64Counter=0;
1065 break;
1066 default:
1067 /* will never occur */
1068 break;
1069 }
1070 } else if(base64Value==-2) {
1071 /* minus sign terminates the base64 sequence */
1072 inDirectMode=TRUE;
1073 if(base64Counter==-1) {
1074 /* &- i.e. a minus immediately following an ampersand */
1075 *target++=AMPERSAND;
1076 if(offsets!=NULL) {
1077 *offsets++=sourceIndex-1;
1078 }
1079 } else {
1080 /* absorb the minus and leave the Unicode Mode */
1081 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1082 /* bits are illegally left over, a UChar is incomplete */
1083 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1084 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1085 break;
1086 }
1087 }
1088 sourceIndex=nextSourceIndex;
1089 goto directMode;
1090 } else {
1091 if(base64Counter==-1) {
1092 /* illegal: & immediately followed by something other than base64 or minus sign */
1093 /* include the ampersand in the reported sequence */
1094 --sourceIndex;
1095 bytes[0]=AMPERSAND;
1096 bytes[1]=b;
1097 byteIndex=2;
1098 }
1099 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1100 /* base64Value==-3 for illegal characters */
1101 /* illegal */
1102 inDirectMode=TRUE;
1103 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1104 break;
1105 }
1106 } else {
1107 /* target is full */
1108 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1109 break;
1110 }
1111 }
1112 }
1113 endloop:
1114
1115 /*
1116 * the end of the input stream and detection of truncated input
1117 * are handled by the framework, but here we must check if we are in Unicode
1118 * mode and byteIndex==0 because we must end in direct mode
1119 *
1120 * conditions:
1121 * successful
1122 * in Unicode mode and byteIndex==0
1123 * end of input and no truncated input
1124 */
1125 if( U_SUCCESS(*pErrorCode) &&
1126 !inDirectMode && byteIndex==0 &&
1127 pArgs->flush && source>=sourceLimit
1128 ) {
1129 if(base64Counter==-1) {
1130 /* & at the very end of the input */
1131 /* make the ampersand the reported sequence */
1132 bytes[0]=AMPERSAND;
1133 byteIndex=1;
1134 }
1135 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1136
1137 inDirectMode=TRUE; /* avoid looping */
1138 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1139 }
1140
1141 /* set the converter state back into UConverter */
1142 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1143 cnv->toULength=byteIndex;
1144
1145 /* write back the updated pointers */
1146 pArgs->source=(const char *)source;
1147 pArgs->target=target;
1148 pArgs->offsets=offsets;
1149 return;
1150 }
1151
1152 static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1153 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1154 UErrorCode *pErrorCode) {
1155 UConverter *cnv;
1156 const UChar *source, *sourceLimit;
1157 uint8_t *target, *targetLimit;
1158 int32_t *offsets;
1159
1160 int32_t length, targetCapacity, sourceIndex;
1161 UChar c;
1162 uint8_t b;
1163
1164 /* UTF-7 state */
1165 uint8_t bits;
1166 int8_t base64Counter;
1167 UBool inDirectMode;
1168
1169 /* set up the local pointers */
1170 cnv=pArgs->converter;
1171
1172 /* set up the local pointers */
1173 source=pArgs->source;
1174 sourceLimit=pArgs->sourceLimit;
1175 target=(uint8_t *)pArgs->target;
1176 targetLimit=(uint8_t *)pArgs->targetLimit;
1177 offsets=pArgs->offsets;
1178
1179 /* get the state machine state */
1180 {
1181 uint32_t status=cnv->fromUnicodeStatus;
1182 inDirectMode=(UBool)((status>>24)&1);
1183 base64Counter=(int8_t)(status>>16);
1184 bits=(uint8_t)status;
1185 }
1186
1187 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1188 sourceIndex=0;
1189
1190 if(inDirectMode) {
1191 directMode:
1192 length=(int32_t)(sourceLimit-source);
1193 targetCapacity=(int32_t)(targetLimit-target);
1194 if(length>targetCapacity) {
1195 length=targetCapacity;
1196 }
1197 while(length>0) {
1198 c=*source++;
1199 /* encode 0x20..0x7e except '&' directly */
1200 if(inSetDIMAP(c)) {
1201 /* encode directly */
1202 *target++=(uint8_t)c;
1203 if(offsets!=NULL) {
1204 *offsets++=sourceIndex++;
1205 }
1206 } else if(c==AMPERSAND) {
1207 /* output &- for & */
1208 *target++=AMPERSAND;
1209 if(target<targetLimit) {
1210 *target++=MINUS;
1211 if(offsets!=NULL) {
1212 *offsets++=sourceIndex;
1213 *offsets++=sourceIndex++;
1214 }
1215 /* realign length and targetCapacity */
1216 goto directMode;
1217 } else {
1218 if(offsets!=NULL) {
1219 *offsets++=sourceIndex++;
1220 }
1221 cnv->charErrorBuffer[0]=MINUS;
1222 cnv->charErrorBufferLength=1;
1223 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1224 break;
1225 }
1226 } else {
1227 /* un-read this character and switch to Unicode Mode */
1228 --source;
1229 *target++=AMPERSAND;
1230 if(offsets!=NULL) {
1231 *offsets++=sourceIndex;
1232 }
1233 inDirectMode=FALSE;
1234 base64Counter=0;
1235 goto unicodeMode;
1236 }
1237 --length;
1238 }
1239 if(source<sourceLimit && target>=targetLimit) {
1240 /* target is full */
1241 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1242 }
1243 } else {
1244 unicodeMode:
1245 while(source<sourceLimit) {
1246 if(target<targetLimit) {
1247 c=*source++;
1248 if(isLegalIMAP(c)) {
1249 /* encode directly */
1250 inDirectMode=TRUE;
1251
1252 /* trick: back out this character to make this easier */
1253 --source;
1254
1255 /* terminate the base64 sequence */
1256 if(base64Counter!=0) {
1257 /* write remaining bits for the previous character */
1258 *target++=TO_BASE64_IMAP(bits);
1259 if(offsets!=NULL) {
1260 *offsets++=sourceIndex-1;
1261 }
1262 }
1263 /* need to terminate with a minus */
1264 if(target<targetLimit) {
1265 *target++=MINUS;
1266 if(offsets!=NULL) {
1267 *offsets++=sourceIndex-1;
1268 }
1269 } else {
1270 cnv->charErrorBuffer[0]=MINUS;
1271 cnv->charErrorBufferLength=1;
1272 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1273 break;
1274 }
1275 goto directMode;
1276 } else {
1277 /*
1278 * base64 this character:
1279 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1280 * and the bits of this character, each implicitly in UTF-16BE.
1281 *
1282 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1283 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1284 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1285 */
1286 switch(base64Counter) {
1287 case 0:
1288 b=(uint8_t)(c>>10);
1289 *target++=TO_BASE64_IMAP(b);
1290 if(target<targetLimit) {
1291 b=(uint8_t)((c>>4)&0x3f);
1292 *target++=TO_BASE64_IMAP(b);
1293 if(offsets!=NULL) {
1294 *offsets++=sourceIndex;
1295 *offsets++=sourceIndex++;
1296 }
1297 } else {
1298 if(offsets!=NULL) {
1299 *offsets++=sourceIndex++;
1300 }
1301 b=(uint8_t)((c>>4)&0x3f);
1302 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1303 cnv->charErrorBufferLength=1;
1304 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1305 }
1306 bits=(uint8_t)((c&15)<<2);
1307 base64Counter=1;
1308 break;
1309 case 1:
1310 b=(uint8_t)(bits|(c>>14));
1311 *target++=TO_BASE64_IMAP(b);
1312 if(target<targetLimit) {
1313 b=(uint8_t)((c>>8)&0x3f);
1314 *target++=TO_BASE64_IMAP(b);
1315 if(target<targetLimit) {
1316 b=(uint8_t)((c>>2)&0x3f);
1317 *target++=TO_BASE64_IMAP(b);
1318 if(offsets!=NULL) {
1319 *offsets++=sourceIndex;
1320 *offsets++=sourceIndex;
1321 *offsets++=sourceIndex++;
1322 }
1323 } else {
1324 if(offsets!=NULL) {
1325 *offsets++=sourceIndex;
1326 *offsets++=sourceIndex++;
1327 }
1328 b=(uint8_t)((c>>2)&0x3f);
1329 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1330 cnv->charErrorBufferLength=1;
1331 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1332 }
1333 } else {
1334 if(offsets!=NULL) {
1335 *offsets++=sourceIndex++;
1336 }
1337 b=(uint8_t)((c>>8)&0x3f);
1338 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1339 b=(uint8_t)((c>>2)&0x3f);
1340 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1341 cnv->charErrorBufferLength=2;
1342 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1343 }
1344 bits=(uint8_t)((c&3)<<4);
1345 base64Counter=2;
1346 break;
1347 case 2:
1348 b=(uint8_t)(bits|(c>>12));
1349 *target++=TO_BASE64_IMAP(b);
1350 if(target<targetLimit) {
1351 b=(uint8_t)((c>>6)&0x3f);
1352 *target++=TO_BASE64_IMAP(b);
1353 if(target<targetLimit) {
1354 b=(uint8_t)(c&0x3f);
1355 *target++=TO_BASE64_IMAP(b);
1356 if(offsets!=NULL) {
1357 *offsets++=sourceIndex;
1358 *offsets++=sourceIndex;
1359 *offsets++=sourceIndex++;
1360 }
1361 } else {
1362 if(offsets!=NULL) {
1363 *offsets++=sourceIndex;
1364 *offsets++=sourceIndex++;
1365 }
1366 b=(uint8_t)(c&0x3f);
1367 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1368 cnv->charErrorBufferLength=1;
1369 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1370 }
1371 } else {
1372 if(offsets!=NULL) {
1373 *offsets++=sourceIndex++;
1374 }
1375 b=(uint8_t)((c>>6)&0x3f);
1376 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1377 b=(uint8_t)(c&0x3f);
1378 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1379 cnv->charErrorBufferLength=2;
1380 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1381 }
1382 bits=0;
1383 base64Counter=0;
1384 break;
1385 default:
1386 /* will never occur */
1387 break;
1388 }
1389 }
1390 } else {
1391 /* target is full */
1392 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1393 break;
1394 }
1395 }
1396 }
1397
1398 if(pArgs->flush && source>=sourceLimit) {
1399 /* flush remaining bits to the target */
1400 if(!inDirectMode) {
1401 if(base64Counter!=0) {
1402 if(target<targetLimit) {
1403 *target++=TO_BASE64_IMAP(bits);
1404 if(offsets!=NULL) {
1405 *offsets++=sourceIndex-1;
1406 }
1407 } else {
1408 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1409 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1410 }
1411 }
1412 /* need to terminate with a minus */
1413 if(target<targetLimit) {
1414 *target++=MINUS;
1415 if(offsets!=NULL) {
1416 *offsets++=sourceIndex-1;
1417 }
1418 } else {
1419 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1420 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1421 }
1422 }
1423 /* reset the state for the next conversion */
1424 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1425 } else {
1426 /* set the converter state back into UConverter */
1427 cnv->fromUnicodeStatus=
1428 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1429 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1430 }
1431
1432 /* write back the updated pointers */
1433 pArgs->source=source;
1434 pArgs->target=(char *)target;
1435 pArgs->offsets=offsets;
1436 return;
1437 }
1438
1439 static const UConverterImpl _IMAPImpl={
1440 UCNV_IMAP_MAILBOX,
1441
1442 NULL,
1443 NULL,
1444
1445 _UTF7Open,
1446 NULL,
1447 _UTF7Reset,
1448
1449 _IMAPToUnicodeWithOffsets,
1450 _IMAPToUnicodeWithOffsets,
1451 _IMAPFromUnicodeWithOffsets,
1452 _IMAPFromUnicodeWithOffsets,
1453 NULL,
1454
1455 NULL,
1456 NULL,
1457 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1458 NULL,
1459 ucnv_getCompleteUnicodeSet
1460 };
1461
1462 static const UConverterStaticData _IMAPStaticData={
1463 sizeof(UConverterStaticData),
1464 "IMAP-mailbox-name",
1465 0, /* TODO CCSID for IMAP-mailbox-name */
1466 UCNV_IBM, UCNV_IMAP_MAILBOX,
1467 1, 4,
1468 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1469 FALSE, FALSE,
1470 0,
1471 0,
1472 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1473 };
1474
1475 const UConverterSharedData _IMAPData=
1476 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1477
1478 #endif
1479