• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2000-2009, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ucnvscsu.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2000nov18
14 *   created by: Markus W. Scherer
15 *
16 *   This is an implementation of the Standard Compression Scheme for Unicode
17 *   as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 *   Reserved commands and window settings are treated as illegal sequences and
19 *   will result in callback calls.
20 */
21 
22 #include "unicode/utypes.h"
23 
24 #if !UCONFIG_NO_CONVERSION
25 
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "cmemory.h"
31 
32 /* SCSU definitions --------------------------------------------------------- */
33 
/*
 * SCSU command (tag) byte values.
 * The S* tags are interpreted in single-byte mode, the U* tags in Unicode mode.
 * The "...7" constants are the last tag of an 8-tag range that starts at "...0".
 */
enum {
    /* single-byte mode tags */
    SQ0 = 0x01,        /* quote from window pair 0 */
    SQ7 = SQ0 + 7,     /* quote from window pair 7 */
    SDX = 0x0B,        /* define a window as extended */
    Srs = 0x0C,        /* reserved */
    SQU = 0x0E,        /* quote a single Unicode character */
    SCU = 0x0F,        /* change to Unicode mode */
    SC0 = 0x10,        /* select window 0 */
    SC7 = SC0 + 7,     /* select window 7 */
    SD0 = 0x18,        /* define and select window 0 */
    SD7 = SD0 + 7,     /* define and select window 7 */

    /* Unicode mode tags */
    UC0 = 0xE0,        /* select window 0 */
    UC7 = UC0 + 7,     /* select window 7 */
    UD0 = 0xE8,        /* define and select window 0 */
    UD7 = UD0 + 7,     /* define and select window 7 */
    UQU = 0xF0,        /* quote a single Unicode character */
    UDX = 0xF1,        /* define a window as extended */
    Urs = 0xF2         /* reserved */
};
55 
/* Boundaries used when a one-byte window offset value is decoded. */
enum {
    /*
     * Code points from U+3400 up to U+E000 cannot be addressed by a
     * dynamic window because no short-run alphabets live there.
     * Offset values starting at gapThreshold therefore get gapOffset added.
     */
    gapThreshold   = 0x68,
    gapOffset      = 0xAC00,

    /* offset values in [reservedStart, fixedThreshold) are reserved */
    reservedStart  = 0xA8,

    /* offset values >= fixedThreshold index the table of predefined offsets */
    fixedThreshold = 0xF9
};
71 
/*
 * Start offsets of the 8 static windows.
 * These never change; the index is the window number.
 */
static const uint32_t staticOffsets[8]={
    /* 0 */ 0x0000,     /* ASCII, for quoted tags */
    /* 1 */ 0x0080,     /* Latin-1 Supplement (gives access to punctuation) */
    /* 2 */ 0x0100,     /* Latin Extended-A */
    /* 3 */ 0x0300,     /* Combining Diacritical Marks */
    /* 4 */ 0x2000,     /* General Punctuation */
    /* 5 */ 0x2080,     /* Currency Symbols */
    /* 6 */ 0x2100,     /* Letterlike Symbols and Number Forms */
    /* 7 */ 0x3000      /* CJK Symbols and Punctuation */
};
83 
/*
 * Start offsets that the 8 dynamic (sliding) windows have after a reset.
 * The index is the window number.
 */
static const uint32_t initialDynamicOffsets[8]={
    /* 0 */ 0x0080,     /* Latin-1 */
    /* 1 */ 0x00C0,     /* Latin Extended A */
    /* 2 */ 0x0400,     /* Cyrillic */
    /* 3 */ 0x0600,     /* Arabic */
    /* 4 */ 0x0900,     /* Devanagari */
    /* 5 */ 0x3040,     /* Hiragana */
    /* 6 */ 0x30A0,     /* Katakana */
    /* 7 */ 0xFF00      /* Fullwidth ASCII */
};
95 
/*
 * Predefined window offsets.
 * Entry i belongs to the window offset byte (fixedThreshold + i),
 * i.e. the table covers the byte values 0xF9..0xFF.
 */
static const uint32_t fixedOffsets[]={
    0x00C0,     /* 0xF9: Latin-1 letters + half of Latin Extended A */
    0x0250,     /* 0xFA: IPA Extensions */
    0x0370,     /* 0xFB: Greek */
    0x0530,     /* 0xFC: Armenian */
    0x3040,     /* 0xFD: Hiragana */
    0x30A0,     /* 0xFE: Katakana */
    0xFF60      /* 0xFF: Halfwidth Katakana */
};
106 
/*
 * States of the toUnicode state machine.
 * readCommand means "between sequences"; every other state names the
 * byte of a multi-byte sequence that is expected next.
 */
enum {
    readCommand   = 0,  /* expecting a command or a plain character byte */
    quotePairOne  = 1,  /* expecting the first byte of a quoted 16-bit unit */
    quotePairTwo  = 2,  /* expecting the second byte of a 16-bit unit */
    quoteOne      = 3,  /* expecting the single byte quoted from a window */
    definePairOne = 4,  /* expecting the first byte of an extended window definition */
    definePairTwo = 5,  /* expecting the second byte of an extended window definition */
    defineOne     = 6   /* expecting the one-byte offset of a window definition */
};
117 
118 typedef struct SCSUData {
119     /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
120     uint32_t toUDynamicOffsets[8];
121     uint32_t fromUDynamicOffsets[8];
122 
123     /* state machine state - toUnicode */
124     UBool toUIsSingleByteMode;
125     uint8_t toUState;
126     int8_t toUQuoteWindow, toUDynamicWindow;
127     uint8_t toUByteOne;
128     uint8_t toUPadding[3];
129 
130     /* state machine state - fromUnicode */
131     UBool fromUIsSingleByteMode;
132     int8_t fromUDynamicWindow;
133 
134     /*
135      * windowUse[] keeps track of the use of the dynamic windows:
136      * At nextWindowUseIndex there is the least recently used window,
137      * and the following windows (in a wrapping manner) are more and more
138      * recently used.
139      * At nextWindowUseIndex-1 there is the most recently used window.
140      */
141     uint8_t locale;
142     int8_t nextWindowUseIndex;
143     int8_t windowUse[8];
144 } SCSUData;
145 
/*
 * Initial contents of SCSUData.windowUse[]: the generic order and the
 * order installed when the converter locale is Japanese.
 */
static const int8_t initialWindowUse[8]={
    7, 0, 3, 2, 4, 5, 6, 1
};
static const int8_t initialWindowUse_ja[8]={
    3, 2, 4, 1, 0, 7, 5, 6
};
148 
/* values for SCSUData.locale */
enum {
    lGeneric = 0,   /* any locale other than Japanese */
    l_ja     = 1    /* Japanese ("ja" or "ja_...") */
};
152 
153 /* SCSU setup functions ----------------------------------------------------- */
154 
155 static void
_SCSUReset(UConverter * cnv,UConverterResetChoice choice)156 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
157     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
158 
159     if(choice<=UCNV_RESET_TO_UNICODE) {
160         /* reset toUnicode */
161         uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
162 
163         scsu->toUIsSingleByteMode=TRUE;
164         scsu->toUState=readCommand;
165         scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
166         scsu->toUByteOne=0;
167 
168         cnv->toULength=0;
169     }
170     if(choice!=UCNV_RESET_TO_UNICODE) {
171         /* reset fromUnicode */
172         uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
173 
174         scsu->fromUIsSingleByteMode=TRUE;
175         scsu->fromUDynamicWindow=0;
176 
177         scsu->nextWindowUseIndex=0;
178         switch(scsu->locale) {
179         case l_ja:
180             uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
181             break;
182         default:
183             uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
184             break;
185         }
186 
187         cnv->fromUChar32=0;
188     }
189 }
190 
191 static void
_SCSUOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)192 _SCSUOpen(UConverter *cnv,
193           UConverterLoadArgs *pArgs,
194           UErrorCode *pErrorCode) {
195     const char *locale=pArgs->locale;
196     if(pArgs->onlyTestIsLoadable) {
197         return;
198     }
199     cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
200     if(cnv->extraInfo!=NULL) {
201         if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
202             ((SCSUData *)cnv->extraInfo)->locale=l_ja;
203         } else {
204             ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
205         }
206         _SCSUReset(cnv, UCNV_RESET_BOTH);
207     } else {
208         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
209     }
210 
211     /* Set the substitution character U+fffd as a Unicode string. */
212     cnv->subUChars[0]=0xfffd;
213     cnv->subCharLen=-1;
214 }
215 
216 static void
_SCSUClose(UConverter * cnv)217 _SCSUClose(UConverter *cnv) {
218     if(cnv->extraInfo!=NULL) {
219         if(!cnv->isExtraLocal) {
220             uprv_free(cnv->extraInfo);
221         }
222         cnv->extraInfo=NULL;
223     }
224 }
225 
226 /* SCSU-to-Unicode conversion functions ------------------------------------- */
227 
228 static void
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)229 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
230                           UErrorCode *pErrorCode) {
231     UConverter *cnv;
232     SCSUData *scsu;
233     const uint8_t *source, *sourceLimit;
234     UChar *target;
235     const UChar *targetLimit;
236     int32_t *offsets;
237     UBool isSingleByteMode;
238     uint8_t state, byteOne;
239     int8_t quoteWindow, dynamicWindow;
240 
241     int32_t sourceIndex, nextSourceIndex;
242 
243     uint8_t b;
244 
245     /* set up the local pointers */
246     cnv=pArgs->converter;
247     scsu=(SCSUData *)cnv->extraInfo;
248 
249     source=(const uint8_t *)pArgs->source;
250     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
251     target=pArgs->target;
252     targetLimit=pArgs->targetLimit;
253     offsets=pArgs->offsets;
254 
255     /* get the state machine state */
256     isSingleByteMode=scsu->toUIsSingleByteMode;
257     state=scsu->toUState;
258     quoteWindow=scsu->toUQuoteWindow;
259     dynamicWindow=scsu->toUDynamicWindow;
260     byteOne=scsu->toUByteOne;
261 
262     /* sourceIndex=-1 if the current character began in the previous buffer */
263     sourceIndex=state==readCommand ? 0 : -1;
264     nextSourceIndex=0;
265 
266     /*
267      * conversion "loop"
268      *
269      * For performance, this is not a normal C loop.
270      * Instead, there are two code blocks for the two SCSU modes.
271      * The function branches to either one, and a change of the mode is done with a goto to
272      * the other branch.
273      *
274      * Each branch has two conventional loops:
275      * - a fast-path loop for the most common codes in the mode
276      * - a loop for all other codes in the mode
277      * When the fast-path runs into a code that it cannot handle, its loop ends and it
278      * runs into the following loop to handle the other codes.
279      * The end of the input or output buffer is also handled by the slower loop.
280      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
281      *
282      * The callback handling is done by returning with an error code.
283      * The conversion framework actually calls the callback function.
284      */
285     if(isSingleByteMode) {
286         /* fast path for single-byte mode */
287         if(state==readCommand) {
288 fastSingle:
289             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
290                 ++source;
291                 ++nextSourceIndex;
292                 if(b<=0x7f) {
293                     /* write US-ASCII graphic character or DEL */
294                     *target++=(UChar)b;
295                     if(offsets!=NULL) {
296                         *offsets++=sourceIndex;
297                     }
298                 } else {
299                     /* write from dynamic window */
300                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
301                     if(c<=0xffff) {
302                         *target++=(UChar)c;
303                         if(offsets!=NULL) {
304                             *offsets++=sourceIndex;
305                         }
306                     } else {
307                         /* output surrogate pair */
308                         *target++=(UChar)(0xd7c0+(c>>10));
309                         if(target<targetLimit) {
310                             *target++=(UChar)(0xdc00|(c&0x3ff));
311                             if(offsets!=NULL) {
312                                 *offsets++=sourceIndex;
313                                 *offsets++=sourceIndex;
314                             }
315                         } else {
316                             /* target overflow */
317                             if(offsets!=NULL) {
318                                 *offsets++=sourceIndex;
319                             }
320                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
321                             cnv->UCharErrorBufferLength=1;
322                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
323                             goto endloop;
324                         }
325                     }
326                 }
327                 sourceIndex=nextSourceIndex;
328             }
329         }
330 
331         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
332 singleByteMode:
333         while(source<sourceLimit) {
334             if(target>=targetLimit) {
335                 /* target is full */
336                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
337                 break;
338             }
339             b=*source++;
340             ++nextSourceIndex;
341             switch(state) {
342             case readCommand:
343                 /* redundant conditions are commented out */
344                 /* here: b<0x20 because otherwise we would be in fastSingle */
345                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
346                     /* CR/LF/TAB/NUL */
347                     *target++=(UChar)b;
348                     if(offsets!=NULL) {
349                         *offsets++=sourceIndex;
350                     }
351                     sourceIndex=nextSourceIndex;
352                     goto fastSingle;
353                 } else if(SC0<=b) {
354                     if(b<=SC7) {
355                         dynamicWindow=(int8_t)(b-SC0);
356                         sourceIndex=nextSourceIndex;
357                         goto fastSingle;
358                     } else /* if(SD0<=b && b<=SD7) */ {
359                         dynamicWindow=(int8_t)(b-SD0);
360                         state=defineOne;
361                     }
362                 } else if(/* SQ0<=b && */ b<=SQ7) {
363                     quoteWindow=(int8_t)(b-SQ0);
364                     state=quoteOne;
365                 } else if(b==SDX) {
366                     state=definePairOne;
367                 } else if(b==SQU) {
368                     state=quotePairOne;
369                 } else if(b==SCU) {
370                     sourceIndex=nextSourceIndex;
371                     isSingleByteMode=FALSE;
372                     goto fastUnicode;
373                 } else /* Srs */ {
374                     /* callback(illegal) */
375                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
376                     cnv->toUBytes[0]=b;
377                     cnv->toULength=1;
378                     goto endloop;
379                 }
380 
381                 /* store the first byte of a multibyte sequence in toUBytes[] */
382                 cnv->toUBytes[0]=b;
383                 cnv->toULength=1;
384                 break;
385             case quotePairOne:
386                 byteOne=b;
387                 cnv->toUBytes[1]=b;
388                 cnv->toULength=2;
389                 state=quotePairTwo;
390                 break;
391             case quotePairTwo:
392                 *target++=(UChar)((byteOne<<8)|b);
393                 if(offsets!=NULL) {
394                     *offsets++=sourceIndex;
395                 }
396                 sourceIndex=nextSourceIndex;
397                 state=readCommand;
398                 goto fastSingle;
399             case quoteOne:
400                 if(b<0x80) {
401                     /* all static offsets are in the BMP */
402                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
403                     if(offsets!=NULL) {
404                         *offsets++=sourceIndex;
405                     }
406                 } else {
407                     /* write from dynamic window */
408                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
409                     if(c<=0xffff) {
410                         *target++=(UChar)c;
411                         if(offsets!=NULL) {
412                             *offsets++=sourceIndex;
413                         }
414                     } else {
415                         /* output surrogate pair */
416                         *target++=(UChar)(0xd7c0+(c>>10));
417                         if(target<targetLimit) {
418                             *target++=(UChar)(0xdc00|(c&0x3ff));
419                             if(offsets!=NULL) {
420                                 *offsets++=sourceIndex;
421                                 *offsets++=sourceIndex;
422                             }
423                         } else {
424                             /* target overflow */
425                             if(offsets!=NULL) {
426                                 *offsets++=sourceIndex;
427                             }
428                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
429                             cnv->UCharErrorBufferLength=1;
430                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
431                             goto endloop;
432                         }
433                     }
434                 }
435                 sourceIndex=nextSourceIndex;
436                 state=readCommand;
437                 goto fastSingle;
438             case definePairOne:
439                 dynamicWindow=(int8_t)((b>>5)&7);
440                 byteOne=(uint8_t)(b&0x1f);
441                 cnv->toUBytes[1]=b;
442                 cnv->toULength=2;
443                 state=definePairTwo;
444                 break;
445             case definePairTwo:
446                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
447                 sourceIndex=nextSourceIndex;
448                 state=readCommand;
449                 goto fastSingle;
450             case defineOne:
451                 if(b==0) {
452                     /* callback(illegal): Reserved window offset value 0 */
453                     cnv->toUBytes[1]=b;
454                     cnv->toULength=2;
455                     goto endloop;
456                 } else if(b<gapThreshold) {
457                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
458                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
459                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
460                 } else if(b>=fixedThreshold) {
461                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
462                 } else {
463                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
464                     cnv->toUBytes[1]=b;
465                     cnv->toULength=2;
466                     goto endloop;
467                 }
468                 sourceIndex=nextSourceIndex;
469                 state=readCommand;
470                 goto fastSingle;
471             }
472         }
473     } else {
474         /* fast path for Unicode mode */
475         if(state==readCommand) {
476 fastUnicode:
477             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
478                 *target++=(UChar)((b<<8)|source[1]);
479                 if(offsets!=NULL) {
480                     *offsets++=sourceIndex;
481                 }
482                 sourceIndex=nextSourceIndex;
483                 nextSourceIndex+=2;
484                 source+=2;
485             }
486         }
487 
488         /* normal state machine for Unicode mode */
489 /* unicodeByteMode: */
490         while(source<sourceLimit) {
491             if(target>=targetLimit) {
492                 /* target is full */
493                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
494                 break;
495             }
496             b=*source++;
497             ++nextSourceIndex;
498             switch(state) {
499             case readCommand:
500                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
501                     byteOne=b;
502                     cnv->toUBytes[0]=b;
503                     cnv->toULength=1;
504                     state=quotePairTwo;
505                 } else if(/* UC0<=b && */ b<=UC7) {
506                     dynamicWindow=(int8_t)(b-UC0);
507                     sourceIndex=nextSourceIndex;
508                     isSingleByteMode=TRUE;
509                     goto fastSingle;
510                 } else if(/* UD0<=b && */ b<=UD7) {
511                     dynamicWindow=(int8_t)(b-UD0);
512                     isSingleByteMode=TRUE;
513                     cnv->toUBytes[0]=b;
514                     cnv->toULength=1;
515                     state=defineOne;
516                     goto singleByteMode;
517                 } else if(b==UDX) {
518                     isSingleByteMode=TRUE;
519                     cnv->toUBytes[0]=b;
520                     cnv->toULength=1;
521                     state=definePairOne;
522                     goto singleByteMode;
523                 } else if(b==UQU) {
524                     cnv->toUBytes[0]=b;
525                     cnv->toULength=1;
526                     state=quotePairOne;
527                 } else /* Urs */ {
528                     /* callback(illegal) */
529                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
530                     cnv->toUBytes[0]=b;
531                     cnv->toULength=1;
532                     goto endloop;
533                 }
534                 break;
535             case quotePairOne:
536                 byteOne=b;
537                 cnv->toUBytes[1]=b;
538                 cnv->toULength=2;
539                 state=quotePairTwo;
540                 break;
541             case quotePairTwo:
542                 *target++=(UChar)((byteOne<<8)|b);
543                 if(offsets!=NULL) {
544                     *offsets++=sourceIndex;
545                 }
546                 sourceIndex=nextSourceIndex;
547                 state=readCommand;
548                 goto fastUnicode;
549             }
550         }
551     }
552 endloop:
553 
554     /* set the converter state back into UConverter */
555     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
556         /* reset to deal with the next character */
557         state=readCommand;
558     } else if(state==readCommand) {
559         /* not in a multi-byte sequence, reset toULength */
560         cnv->toULength=0;
561     }
562     scsu->toUIsSingleByteMode=isSingleByteMode;
563     scsu->toUState=state;
564     scsu->toUQuoteWindow=quoteWindow;
565     scsu->toUDynamicWindow=dynamicWindow;
566     scsu->toUByteOne=byteOne;
567 
568     /* write back the updated pointers */
569     pArgs->source=(const char *)source;
570     pArgs->target=target;
571     pArgs->offsets=offsets;
572     return;
573 }
574 
575 /*
576  * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
577  * If a change is made in the original function, then either
578  * change this function the same way or
579  * re-copy the original function and remove the variables
580  * offsets, sourceIndex, and nextSourceIndex.
581  */
582 static void
_SCSUToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)583 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
584                UErrorCode *pErrorCode) {
585     UConverter *cnv;
586     SCSUData *scsu;
587     const uint8_t *source, *sourceLimit;
588     UChar *target;
589     const UChar *targetLimit;
590     UBool isSingleByteMode;
591     uint8_t state, byteOne;
592     int8_t quoteWindow, dynamicWindow;
593 
594     uint8_t b;
595 
596     /* set up the local pointers */
597     cnv=pArgs->converter;
598     scsu=(SCSUData *)cnv->extraInfo;
599 
600     source=(const uint8_t *)pArgs->source;
601     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
602     target=pArgs->target;
603     targetLimit=pArgs->targetLimit;
604 
605     /* get the state machine state */
606     isSingleByteMode=scsu->toUIsSingleByteMode;
607     state=scsu->toUState;
608     quoteWindow=scsu->toUQuoteWindow;
609     dynamicWindow=scsu->toUDynamicWindow;
610     byteOne=scsu->toUByteOne;
611 
612     /*
613      * conversion "loop"
614      *
615      * For performance, this is not a normal C loop.
616      * Instead, there are two code blocks for the two SCSU modes.
617      * The function branches to either one, and a change of the mode is done with a goto to
618      * the other branch.
619      *
620      * Each branch has two conventional loops:
621      * - a fast-path loop for the most common codes in the mode
622      * - a loop for all other codes in the mode
623      * When the fast-path runs into a code that it cannot handle, its loop ends and it
624      * runs into the following loop to handle the other codes.
625      * The end of the input or output buffer is also handled by the slower loop.
626      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
627      *
628      * The callback handling is done by returning with an error code.
629      * The conversion framework actually calls the callback function.
630      */
631     if(isSingleByteMode) {
632         /* fast path for single-byte mode */
633         if(state==readCommand) {
634 fastSingle:
635             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
636                 ++source;
637                 if(b<=0x7f) {
638                     /* write US-ASCII graphic character or DEL */
639                     *target++=(UChar)b;
640                 } else {
641                     /* write from dynamic window */
642                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
643                     if(c<=0xffff) {
644                         *target++=(UChar)c;
645                     } else {
646                         /* output surrogate pair */
647                         *target++=(UChar)(0xd7c0+(c>>10));
648                         if(target<targetLimit) {
649                             *target++=(UChar)(0xdc00|(c&0x3ff));
650                         } else {
651                             /* target overflow */
652                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
653                             cnv->UCharErrorBufferLength=1;
654                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
655                             goto endloop;
656                         }
657                     }
658                 }
659             }
660         }
661 
662         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
663 singleByteMode:
664         while(source<sourceLimit) {
665             if(target>=targetLimit) {
666                 /* target is full */
667                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
668                 break;
669             }
670             b=*source++;
671             switch(state) {
672             case readCommand:
673                 /* redundant conditions are commented out */
674                 /* here: b<0x20 because otherwise we would be in fastSingle */
675                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
676                     /* CR/LF/TAB/NUL */
677                     *target++=(UChar)b;
678                     goto fastSingle;
679                 } else if(SC0<=b) {
680                     if(b<=SC7) {
681                         dynamicWindow=(int8_t)(b-SC0);
682                         goto fastSingle;
683                     } else /* if(SD0<=b && b<=SD7) */ {
684                         dynamicWindow=(int8_t)(b-SD0);
685                         state=defineOne;
686                     }
687                 } else if(/* SQ0<=b && */ b<=SQ7) {
688                     quoteWindow=(int8_t)(b-SQ0);
689                     state=quoteOne;
690                 } else if(b==SDX) {
691                     state=definePairOne;
692                 } else if(b==SQU) {
693                     state=quotePairOne;
694                 } else if(b==SCU) {
695                     isSingleByteMode=FALSE;
696                     goto fastUnicode;
697                 } else /* Srs */ {
698                     /* callback(illegal) */
699                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
700                     cnv->toUBytes[0]=b;
701                     cnv->toULength=1;
702                     goto endloop;
703                 }
704 
705                 /* store the first byte of a multibyte sequence in toUBytes[] */
706                 cnv->toUBytes[0]=b;
707                 cnv->toULength=1;
708                 break;
709             case quotePairOne:
710                 byteOne=b;
711                 cnv->toUBytes[1]=b;
712                 cnv->toULength=2;
713                 state=quotePairTwo;
714                 break;
715             case quotePairTwo:
716                 *target++=(UChar)((byteOne<<8)|b);
717                 state=readCommand;
718                 goto fastSingle;
719             case quoteOne:
720                 if(b<0x80) {
721                     /* all static offsets are in the BMP */
722                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
723                 } else {
724                     /* write from dynamic window */
725                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
726                     if(c<=0xffff) {
727                         *target++=(UChar)c;
728                     } else {
729                         /* output surrogate pair */
730                         *target++=(UChar)(0xd7c0+(c>>10));
731                         if(target<targetLimit) {
732                             *target++=(UChar)(0xdc00|(c&0x3ff));
733                         } else {
734                             /* target overflow */
735                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
736                             cnv->UCharErrorBufferLength=1;
737                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
738                             goto endloop;
739                         }
740                     }
741                 }
742                 state=readCommand;
743                 goto fastSingle;
744             case definePairOne:
745                 dynamicWindow=(int8_t)((b>>5)&7);
746                 byteOne=(uint8_t)(b&0x1f);
747                 cnv->toUBytes[1]=b;
748                 cnv->toULength=2;
749                 state=definePairTwo;
750                 break;
751             case definePairTwo:
752                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
753                 state=readCommand;
754                 goto fastSingle;
755             case defineOne:
756                 if(b==0) {
757                     /* callback(illegal): Reserved window offset value 0 */
758                     cnv->toUBytes[1]=b;
759                     cnv->toULength=2;
760                     goto endloop;
761                 } else if(b<gapThreshold) {
762                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
763                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
764                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
765                 } else if(b>=fixedThreshold) {
766                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
767                 } else {
768                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
769                     cnv->toUBytes[1]=b;
770                     cnv->toULength=2;
771                     goto endloop;
772                 }
773                 state=readCommand;
774                 goto fastSingle;
775             }
776         }
777     } else {
778         /* fast path for Unicode mode */
779         if(state==readCommand) {
780 fastUnicode:
781             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
782                 *target++=(UChar)((b<<8)|source[1]);
783                 source+=2;
784             }
785         }
786 
787         /* normal state machine for Unicode mode */
788 /* unicodeByteMode: */
789         while(source<sourceLimit) {
790             if(target>=targetLimit) {
791                 /* target is full */
792                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
793                 break;
794             }
795             b=*source++;
796             switch(state) {
797             case readCommand:
798                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
799                     byteOne=b;
800                     cnv->toUBytes[0]=b;
801                     cnv->toULength=1;
802                     state=quotePairTwo;
803                 } else if(/* UC0<=b && */ b<=UC7) {
804                     dynamicWindow=(int8_t)(b-UC0);
805                     isSingleByteMode=TRUE;
806                     goto fastSingle;
807                 } else if(/* UD0<=b && */ b<=UD7) {
808                     dynamicWindow=(int8_t)(b-UD0);
809                     isSingleByteMode=TRUE;
810                     cnv->toUBytes[0]=b;
811                     cnv->toULength=1;
812                     state=defineOne;
813                     goto singleByteMode;
814                 } else if(b==UDX) {
815                     isSingleByteMode=TRUE;
816                     cnv->toUBytes[0]=b;
817                     cnv->toULength=1;
818                     state=definePairOne;
819                     goto singleByteMode;
820                 } else if(b==UQU) {
821                     cnv->toUBytes[0]=b;
822                     cnv->toULength=1;
823                     state=quotePairOne;
824                 } else /* Urs */ {
825                     /* callback(illegal) */
826                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
827                     cnv->toUBytes[0]=b;
828                     cnv->toULength=1;
829                     goto endloop;
830                 }
831                 break;
832             case quotePairOne:
833                 byteOne=b;
834                 cnv->toUBytes[1]=b;
835                 cnv->toULength=2;
836                 state=quotePairTwo;
837                 break;
838             case quotePairTwo:
839                 *target++=(UChar)((byteOne<<8)|b);
840                 state=readCommand;
841                 goto fastUnicode;
842             }
843         }
844     }
845 endloop:
846 
847     /* set the converter state back into UConverter */
848     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
849         /* reset to deal with the next character */
850         state=readCommand;
851     } else if(state==readCommand) {
852         /* not in a multi-byte sequence, reset toULength */
853         cnv->toULength=0;
854     }
855     scsu->toUIsSingleByteMode=isSingleByteMode;
856     scsu->toUState=state;
857     scsu->toUQuoteWindow=quoteWindow;
858     scsu->toUDynamicWindow=dynamicWindow;
859     scsu->toUByteOne=byteOne;
860 
861     /* write back the updated pointers */
862     pArgs->source=(const char *)source;
863     pArgs->target=target;
864     return;
865 }
866 
867 /* SCSU-from-Unicode conversion functions ----------------------------------- */
868 
869 /*
870  * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
871  * reasonable results. The lookahead is minimal.
872  * Many cases are simple:
873  * A character fits directly into the current mode, a dynamic or static window,
874  * or is not compressible. These cases are tested first.
875  * Real compression heuristics are applied to the rest, in code branches for
876  * single/Unicode mode and BMP/supplementary code points.
877  * The heuristics used here are extremely simple.
878  */
879 
/*
 * Look up which of the eight windows contains the code point c.
 * A window covers the 128 code points offsets[i]..offsets[i]+0x7f.
 * Returns the index (0..7) of the first matching window, or -1 if none matches.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int8_t found=-1;
    int w=0;

    while(w<8 && found<0) {
        /*
         * Unsigned subtraction wraps around when c<offsets[w],
         * so a single comparison checks both window bounds.
         */
        uint32_t distance=c-offsets[w];
        if(distance<0x80) {
            found=(int8_t)w;
        }
        ++w;
    }
    return found;
}
891 
892 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
893 static UBool
isInOffsetWindowOrDirect(uint32_t offset,uint32_t c)894 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
895     return (UBool)(c<=offset+0x7f &&
896           (c>=offset || (c<=0x7f &&
897                         (c>=0x20 || (1UL<<c)&0x2601))));
898                                 /* binary 0010 0110 0000 0001,
899                                    check for b==0xd || b==0xa || b==9 || b==0 */
900 }
901 
902 /*
903  * getNextDynamicWindow returns the next dynamic window to be redefined
904  */
905 static int8_t
getNextDynamicWindow(SCSUData * scsu)906 getNextDynamicWindow(SCSUData *scsu) {
907     int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
908     if(++scsu->nextWindowUseIndex==8) {
909         scsu->nextWindowUseIndex=0;
910     }
911     return window;
912 }
913 
914 /*
915  * useDynamicWindow() adjusts
916  * windowUse[] and nextWindowUseIndex for the algorithm to choose
917  * the next dynamic window to be defined;
918  * a subclass may override it and provide its own algorithm.
919  */
920 static void
useDynamicWindow(SCSUData * scsu,int8_t window)921 useDynamicWindow(SCSUData *scsu, int8_t window) {
922     /*
923      * move the existing window, which just became the most recently used one,
924      * up in windowUse[] to nextWindowUseIndex-1
925      */
926 
927     /* first, find the index of the window - backwards to favor the more recently used windows */
928     int i, j;
929 
930     i=scsu->nextWindowUseIndex;
931     do {
932         if(--i<0) {
933             i=7;
934         }
935     } while(scsu->windowUse[i]!=window);
936 
937     /* now copy each windowUse[i+1] to [i] */
938     j=i+1;
939     if(j==8) {
940         j=0;
941     }
942     while(j!=scsu->nextWindowUseIndex) {
943         scsu->windowUse[i]=scsu->windowUse[j];
944         i=j;
945         if(++j==8) { j=0; }
946     }
947 
948     /* finally, set the window into the most recently used index */
949     scsu->windowUse[i]=window;
950 }
951 
952 /*
953  * calculate the offset and the code for a dynamic window that contains the character
954  * takes fixed offsets into account
955  * the offset of the window is stored in the offset variable,
956  * the code is returned
957  *
958  * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
959  */
960 static int
getDynamicOffset(uint32_t c,uint32_t * pOffset)961 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
962     int i;
963 
964     for(i=0; i<7; ++i) {
965         if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
966             *pOffset=fixedOffsets[i];
967             return 0xf9+i;
968         }
969     }
970 
971     if(c<0x80) {
972         /* No dynamic window for US-ASCII. */
973         return -1;
974     } else if(c<0x3400 ||
975               (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
976               (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
977     ) {
978         /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
979         *pOffset=c&0x7fffff80;
980         return (int)(c>>7);
981     } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
982         /* For these characters we need to take the gapOffset into account. */
983         *pOffset=c&0x7fffff80;
984         return (int)((c-gapOffset)>>7);
985     } else {
986         return -1;
987     }
988 }
989 
990 /*
991  * Idea for compression:
992  *  - save SCSUData and other state before really starting work
993  *  - at endloop, see if compression could be better with just unicode mode
994  *  - don't do this if a callback has been called
995  *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
996  *  - different buffer handling!
997  *
998  * Drawback or need for corrective handling:
999  * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1000  * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1001  * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1002  *
1003  * How to achieve both?
1004  *  - Only replace the result after an SDX or SCU?
1005  */
1006 
1007 static void
_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1008 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1009                             UErrorCode *pErrorCode) {
1010     UConverter *cnv;
1011     SCSUData *scsu;
1012     const UChar *source, *sourceLimit;
1013     uint8_t *target;
1014     int32_t targetCapacity;
1015     int32_t *offsets;
1016 
1017     UBool isSingleByteMode;
1018     uint8_t dynamicWindow;
1019     uint32_t currentOffset;
1020 
1021     uint32_t c, delta;
1022 
1023     int32_t sourceIndex, nextSourceIndex;
1024 
1025     int32_t length;
1026 
1027     /* variables for compression heuristics */
1028     uint32_t offset;
1029     UChar lead, trail;
1030     int code;
1031     int8_t window;
1032 
1033     /* set up the local pointers */
1034     cnv=pArgs->converter;
1035     scsu=(SCSUData *)cnv->extraInfo;
1036 
1037     /* set up the local pointers */
1038     source=pArgs->source;
1039     sourceLimit=pArgs->sourceLimit;
1040     target=(uint8_t *)pArgs->target;
1041     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1042     offsets=pArgs->offsets;
1043 
1044     /* get the state machine state */
1045     isSingleByteMode=scsu->fromUIsSingleByteMode;
1046     dynamicWindow=scsu->fromUDynamicWindow;
1047     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1048 
1049     c=cnv->fromUChar32;
1050 
1051     /* sourceIndex=-1 if the current character began in the previous buffer */
1052     sourceIndex= c==0 ? 0 : -1;
1053     nextSourceIndex=0;
1054 
1055     /* similar conversion "loop" as in toUnicode */
1056 loop:
1057     if(isSingleByteMode) {
1058         if(c!=0 && targetCapacity>0) {
1059             goto getTrailSingle;
1060         }
1061 
1062         /* state machine for single-byte mode */
1063 /* singleByteMode: */
1064         while(source<sourceLimit) {
1065             if(targetCapacity<=0) {
1066                 /* target is full */
1067                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1068                 break;
1069             }
1070             c=*source++;
1071             ++nextSourceIndex;
1072 
1073             if((c-0x20)<=0x5f) {
1074                 /* pass US-ASCII graphic character through */
1075                 *target++=(uint8_t)c;
1076                 if(offsets!=NULL) {
1077                     *offsets++=sourceIndex;
1078                 }
1079                 --targetCapacity;
1080             } else if(c<0x20) {
1081                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1082                     /* CR/LF/TAB/NUL */
1083                     *target++=(uint8_t)c;
1084                     if(offsets!=NULL) {
1085                         *offsets++=sourceIndex;
1086                     }
1087                     --targetCapacity;
1088                 } else {
1089                     /* quote C0 control character */
1090                     c|=SQ0<<8;
1091                     length=2;
1092                     goto outputBytes;
1093                 }
1094             } else if((delta=c-currentOffset)<=0x7f) {
1095                 /* use the current dynamic window */
1096                 *target++=(uint8_t)(delta|0x80);
1097                 if(offsets!=NULL) {
1098                     *offsets++=sourceIndex;
1099                 }
1100                 --targetCapacity;
1101             } else if(UTF_IS_SURROGATE(c)) {
1102                 if(UTF_IS_SURROGATE_FIRST(c)) {
1103 getTrailSingle:
1104                     lead=(UChar)c;
1105                     if(source<sourceLimit) {
1106                         /* test the following code unit */
1107                         trail=*source;
1108                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1109                             ++source;
1110                             ++nextSourceIndex;
1111                             c=UTF16_GET_PAIR_VALUE(c, trail);
1112                             /* convert this surrogate code point */
1113                             /* exit this condition tree */
1114                         } else {
1115                             /* this is an unmatched lead code unit (1st surrogate) */
1116                             /* callback(illegal) */
1117                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1118                             goto endloop;
1119                         }
1120                     } else {
1121                         /* no more input */
1122                         break;
1123                     }
1124                 } else {
1125                     /* this is an unmatched trail code unit (2nd surrogate) */
1126                     /* callback(illegal) */
1127                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1128                     goto endloop;
1129                 }
1130 
1131                 /* compress supplementary character U+10000..U+10ffff */
1132                 if((delta=c-currentOffset)<=0x7f) {
1133                     /* use the current dynamic window */
1134                     *target++=(uint8_t)(delta|0x80);
1135                     if(offsets!=NULL) {
1136                         *offsets++=sourceIndex;
1137                     }
1138                     --targetCapacity;
1139                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1140                     /* there is a dynamic window that contains this character, change to it */
1141                     dynamicWindow=window;
1142                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1143                     useDynamicWindow(scsu, dynamicWindow);
1144                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1145                     length=2;
1146                     goto outputBytes;
1147                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1148                     /* might check if there are more characters in this window to come */
1149                     /* define an extended window with this character */
1150                     code-=0x200;
1151                     dynamicWindow=getNextDynamicWindow(scsu);
1152                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1153                     useDynamicWindow(scsu, dynamicWindow);
1154                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1155                     length=4;
1156                     goto outputBytes;
1157                 } else {
1158                     /* change to Unicode mode and output this (lead, trail) pair */
1159                     isSingleByteMode=FALSE;
1160                     *target++=(uint8_t)SCU;
1161                     if(offsets!=NULL) {
1162                         *offsets++=sourceIndex;
1163                     }
1164                     --targetCapacity;
1165                     c=((uint32_t)lead<<16)|trail;
1166                     length=4;
1167                     goto outputBytes;
1168                 }
1169             } else if(c<0xa0) {
1170                 /* quote C1 control character */
1171                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1172                 length=2;
1173                 goto outputBytes;
1174             } else if(c==0xfeff || c>=0xfff0) {
1175                 /* quote signature character=byte order mark and specials */
1176                 c|=SQU<<16;
1177                 length=3;
1178                 goto outputBytes;
1179             } else {
1180                 /* compress all other BMP characters */
1181                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1182                     /* there is a window defined that contains this character - switch to it or quote from it? */
1183                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1184                         /* change to dynamic window */
1185                         dynamicWindow=window;
1186                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1187                         useDynamicWindow(scsu, dynamicWindow);
1188                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1189                         length=2;
1190                         goto outputBytes;
1191                     } else {
1192                         /* quote from dynamic window */
1193                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1194                         length=2;
1195                         goto outputBytes;
1196                     }
1197                 } else if((window=getWindow(staticOffsets, c))>=0) {
1198                     /* quote from static window */
1199                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1200                     length=2;
1201                     goto outputBytes;
1202                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1203                     /* define a dynamic window with this character */
1204                     dynamicWindow=getNextDynamicWindow(scsu);
1205                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1206                     useDynamicWindow(scsu, dynamicWindow);
1207                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1208                     length=3;
1209                     goto outputBytes;
1210                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1211                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1212                 ) {
1213                     /*
1214                      * this character is not compressible (a BMP ideograph or similar);
1215                      * switch to Unicode mode if this is the last character in the block
1216                      * or there is at least one more ideograph following immediately
1217                      */
1218                     isSingleByteMode=FALSE;
1219                     c|=SCU<<16;
1220                     length=3;
1221                     goto outputBytes;
1222                 } else {
1223                     /* quote Unicode */
1224                     c|=SQU<<16;
1225                     length=3;
1226                     goto outputBytes;
1227                 }
1228             }
1229 
1230             /* normal end of conversion: prepare for a new character */
1231             c=0;
1232             sourceIndex=nextSourceIndex;
1233         }
1234     } else {
1235         if(c!=0 && targetCapacity>0) {
1236             goto getTrailUnicode;
1237         }
1238 
1239         /* state machine for Unicode mode */
1240 /* unicodeByteMode: */
1241         while(source<sourceLimit) {
1242             if(targetCapacity<=0) {
1243                 /* target is full */
1244                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1245                 break;
1246             }
1247             c=*source++;
1248             ++nextSourceIndex;
1249 
1250             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1251                 /* not compressible, write character directly */
1252                 if(targetCapacity>=2) {
1253                     *target++=(uint8_t)(c>>8);
1254                     *target++=(uint8_t)c;
1255                     if(offsets!=NULL) {
1256                         *offsets++=sourceIndex;
1257                         *offsets++=sourceIndex;
1258                     }
1259                     targetCapacity-=2;
1260                 } else {
1261                     length=2;
1262                     goto outputBytes;
1263                 }
1264             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1265                 /* compress BMP character if the following one is not an uncompressible ideograph */
1266                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1267                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1268                         /* ASCII digit or letter */
1269                         isSingleByteMode=TRUE;
1270                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1271                         length=2;
1272                         goto outputBytes;
1273                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1274                         /* there is a dynamic window that contains this character, change to it */
1275                         isSingleByteMode=TRUE;
1276                         dynamicWindow=window;
1277                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1278                         useDynamicWindow(scsu, dynamicWindow);
1279                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1280                         length=2;
1281                         goto outputBytes;
1282                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1283                         /* define a dynamic window with this character */
1284                         isSingleByteMode=TRUE;
1285                         dynamicWindow=getNextDynamicWindow(scsu);
1286                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1287                         useDynamicWindow(scsu, dynamicWindow);
1288                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1289                         length=3;
1290                         goto outputBytes;
1291                     }
1292                 }
1293 
1294                 /* don't know how to compress this character, just write it directly */
1295                 length=2;
1296                 goto outputBytes;
1297             } else if(c<0xe000) {
1298                 /* c is a surrogate */
1299                 if(UTF_IS_SURROGATE_FIRST(c)) {
1300 getTrailUnicode:
1301                     lead=(UChar)c;
1302                     if(source<sourceLimit) {
1303                         /* test the following code unit */
1304                         trail=*source;
1305                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1306                             ++source;
1307                             ++nextSourceIndex;
1308                             c=UTF16_GET_PAIR_VALUE(c, trail);
1309                             /* convert this surrogate code point */
1310                             /* exit this condition tree */
1311                         } else {
1312                             /* this is an unmatched lead code unit (1st surrogate) */
1313                             /* callback(illegal) */
1314                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315                             goto endloop;
1316                         }
1317                     } else {
1318                         /* no more input */
1319                         break;
1320                     }
1321                 } else {
1322                     /* this is an unmatched trail code unit (2nd surrogate) */
1323                     /* callback(illegal) */
1324                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325                     goto endloop;
1326                 }
1327 
1328                 /* compress supplementary character */
1329                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1330                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1331                 ) {
1332                     /*
1333                      * there is a dynamic window that contains this character and
1334                      * the following character is not uncompressible,
1335                      * change to the window
1336                      */
1337                     isSingleByteMode=TRUE;
1338                     dynamicWindow=window;
1339                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1340                     useDynamicWindow(scsu, dynamicWindow);
1341                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1342                     length=2;
1343                     goto outputBytes;
1344                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1345                           (code=getDynamicOffset(c, &offset))>=0
1346                 ) {
1347                     /* two supplementary characters in (probably) the same window - define an extended one */
1348                     isSingleByteMode=TRUE;
1349                     code-=0x200;
1350                     dynamicWindow=getNextDynamicWindow(scsu);
1351                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1352                     useDynamicWindow(scsu, dynamicWindow);
1353                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1354                     length=4;
1355                     goto outputBytes;
1356                 } else {
1357                     /* don't know how to compress this character, just write it directly */
1358                     c=((uint32_t)lead<<16)|trail;
1359                     length=4;
1360                     goto outputBytes;
1361                 }
1362             } else /* 0xe000<=c<0xf300 */ {
1363                 /* quote to avoid SCSU tags */
1364                 c|=UQU<<16;
1365                 length=3;
1366                 goto outputBytes;
1367             }
1368 
1369             /* normal end of conversion: prepare for a new character */
1370             c=0;
1371             sourceIndex=nextSourceIndex;
1372         }
1373     }
1374 endloop:
1375 
1376     /* set the converter state back into UConverter */
1377     scsu->fromUIsSingleByteMode=isSingleByteMode;
1378     scsu->fromUDynamicWindow=dynamicWindow;
1379 
1380     cnv->fromUChar32=c;
1381 
1382     /* write back the updated pointers */
1383     pArgs->source=source;
1384     pArgs->target=(char *)target;
1385     pArgs->offsets=offsets;
1386     return;
1387 
1388 outputBytes:
1389     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1390     /* from the first if in the loop we know that targetCapacity>0 */
1391     if(length<=targetCapacity) {
1392         if(offsets==NULL) {
1393             switch(length) {
1394                 /* each branch falls through to the next one */
1395             case 4:
1396                 *target++=(uint8_t)(c>>24);
1397             case 3:
1398                 *target++=(uint8_t)(c>>16);
1399             case 2:
1400                 *target++=(uint8_t)(c>>8);
1401             case 1:
1402                 *target++=(uint8_t)c;
1403             default:
1404                 /* will never occur */
1405                 break;
1406             }
1407         } else {
1408             switch(length) {
1409                 /* each branch falls through to the next one */
1410             case 4:
1411                 *target++=(uint8_t)(c>>24);
1412                 *offsets++=sourceIndex;
1413             case 3:
1414                 *target++=(uint8_t)(c>>16);
1415                 *offsets++=sourceIndex;
1416             case 2:
1417                 *target++=(uint8_t)(c>>8);
1418                 *offsets++=sourceIndex;
1419             case 1:
1420                 *target++=(uint8_t)c;
1421                 *offsets++=sourceIndex;
1422             default:
1423                 /* will never occur */
1424                 break;
1425             }
1426         }
1427         targetCapacity-=length;
1428 
1429         /* normal end of conversion: prepare for a new character */
1430         c=0;
1431         sourceIndex=nextSourceIndex;
1432         goto loop;
1433     } else {
1434         uint8_t *p;
1435 
1436         /*
1437          * We actually do this backwards here:
1438          * In order to save an intermediate variable, we output
1439          * first to the overflow buffer what does not fit into the
1440          * regular target.
1441          */
1442         /* we know that 0<=targetCapacity<length<=4 */
1443         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1444         length-=targetCapacity;
1445         p=(uint8_t *)cnv->charErrorBuffer;
1446         switch(length) {
1447             /* each branch falls through to the next one */
1448         case 4:
1449             *p++=(uint8_t)(c>>24);
1450         case 3:
1451             *p++=(uint8_t)(c>>16);
1452         case 2:
1453             *p++=(uint8_t)(c>>8);
1454         case 1:
1455             *p=(uint8_t)c;
1456         default:
1457             /* will never occur */
1458             break;
1459         }
1460         cnv->charErrorBufferLength=(int8_t)length;
1461 
1462         /* now output what fits into the regular target */
1463         c>>=8*length; /* length was reduced by targetCapacity */
1464         switch(targetCapacity) {
1465             /* each branch falls through to the next one */
1466         case 3:
1467             *target++=(uint8_t)(c>>16);
1468             if(offsets!=NULL) {
1469                 *offsets++=sourceIndex;
1470             }
1471         case 2:
1472             *target++=(uint8_t)(c>>8);
1473             if(offsets!=NULL) {
1474                 *offsets++=sourceIndex;
1475             }
1476         case 1:
1477             *target++=(uint8_t)c;
1478             if(offsets!=NULL) {
1479                 *offsets++=sourceIndex;
1480             }
1481         default:
1482             break;
1483         }
1484 
1485         /* target overflow */
1486         targetCapacity=0;
1487         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1488         c=0;
1489         goto endloop;
1490     }
1491 }
1492 
1493 /*
1494  * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1495  * If a change is made in the original function, then either
1496  * change this function the same way or
1497  * re-copy the original function and remove the variables
1498  * offsets, sourceIndex, and nextSourceIndex.
1499  */
1500 static void
_SCSUFromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1501 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1502                  UErrorCode *pErrorCode) {
1503     UConverter *cnv;
1504     SCSUData *scsu;
1505     const UChar *source, *sourceLimit;
1506     uint8_t *target;
1507     int32_t targetCapacity;
1508 
1509     UBool isSingleByteMode;
1510     uint8_t dynamicWindow;
1511     uint32_t currentOffset;
1512 
1513     uint32_t c, delta;
1514 
1515     int32_t length;
1516 
1517     /* variables for compression heuristics */
1518     uint32_t offset;
1519     UChar lead, trail;
1520     int code;
1521     int8_t window;
1522 
1523     /* set up the local pointers */
1524     cnv=pArgs->converter;
1525     scsu=(SCSUData *)cnv->extraInfo;
1526 
1527     /* set up the local pointers */
1528     source=pArgs->source;
1529     sourceLimit=pArgs->sourceLimit;
1530     target=(uint8_t *)pArgs->target;
1531     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1532 
1533     /* get the state machine state */
1534     isSingleByteMode=scsu->fromUIsSingleByteMode;
1535     dynamicWindow=scsu->fromUDynamicWindow;
1536     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1537 
1538     c=cnv->fromUChar32;
1539 
1540     /* similar conversion "loop" as in toUnicode */
1541 loop:
1542     if(isSingleByteMode) {
1543         if(c!=0 && targetCapacity>0) {
1544             goto getTrailSingle;
1545         }
1546 
1547         /* state machine for single-byte mode */
1548 /* singleByteMode: */
1549         while(source<sourceLimit) {
1550             if(targetCapacity<=0) {
1551                 /* target is full */
1552                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1553                 break;
1554             }
1555             c=*source++;
1556 
1557             if((c-0x20)<=0x5f) {
1558                 /* pass US-ASCII graphic character through */
1559                 *target++=(uint8_t)c;
1560                 --targetCapacity;
1561             } else if(c<0x20) {
1562                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1563                     /* CR/LF/TAB/NUL */
1564                     *target++=(uint8_t)c;
1565                     --targetCapacity;
1566                 } else {
1567                     /* quote C0 control character */
1568                     c|=SQ0<<8;
1569                     length=2;
1570                     goto outputBytes;
1571                 }
1572             } else if((delta=c-currentOffset)<=0x7f) {
1573                 /* use the current dynamic window */
1574                 *target++=(uint8_t)(delta|0x80);
1575                 --targetCapacity;
1576             } else if(UTF_IS_SURROGATE(c)) {
1577                 if(UTF_IS_SURROGATE_FIRST(c)) {
1578 getTrailSingle:
1579                     lead=(UChar)c;
1580                     if(source<sourceLimit) {
1581                         /* test the following code unit */
1582                         trail=*source;
1583                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1584                             ++source;
1585                             c=UTF16_GET_PAIR_VALUE(c, trail);
1586                             /* convert this surrogate code point */
1587                             /* exit this condition tree */
1588                         } else {
1589                             /* this is an unmatched lead code unit (1st surrogate) */
1590                             /* callback(illegal) */
1591                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1592                             goto endloop;
1593                         }
1594                     } else {
1595                         /* no more input */
1596                         break;
1597                     }
1598                 } else {
1599                     /* this is an unmatched trail code unit (2nd surrogate) */
1600                     /* callback(illegal) */
1601                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1602                     goto endloop;
1603                 }
1604 
1605                 /* compress supplementary character U+10000..U+10ffff */
1606                 if((delta=c-currentOffset)<=0x7f) {
1607                     /* use the current dynamic window */
1608                     *target++=(uint8_t)(delta|0x80);
1609                     --targetCapacity;
1610                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1611                     /* there is a dynamic window that contains this character, change to it */
1612                     dynamicWindow=window;
1613                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1614                     useDynamicWindow(scsu, dynamicWindow);
1615                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1616                     length=2;
1617                     goto outputBytes;
1618                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1619                     /* might check if there are more characters in this window to come */
1620                     /* define an extended window with this character */
1621                     code-=0x200;
1622                     dynamicWindow=getNextDynamicWindow(scsu);
1623                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1624                     useDynamicWindow(scsu, dynamicWindow);
1625                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1626                     length=4;
1627                     goto outputBytes;
1628                 } else {
1629                     /* change to Unicode mode and output this (lead, trail) pair */
1630                     isSingleByteMode=FALSE;
1631                     *target++=(uint8_t)SCU;
1632                     --targetCapacity;
1633                     c=((uint32_t)lead<<16)|trail;
1634                     length=4;
1635                     goto outputBytes;
1636                 }
1637             } else if(c<0xa0) {
1638                 /* quote C1 control character */
1639                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1640                 length=2;
1641                 goto outputBytes;
1642             } else if(c==0xfeff || c>=0xfff0) {
1643                 /* quote signature character=byte order mark and specials */
1644                 c|=SQU<<16;
1645                 length=3;
1646                 goto outputBytes;
1647             } else {
1648                 /* compress all other BMP characters */
1649                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1650                     /* there is a window defined that contains this character - switch to it or quote from it? */
1651                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1652                         /* change to dynamic window */
1653                         dynamicWindow=window;
1654                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1655                         useDynamicWindow(scsu, dynamicWindow);
1656                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1657                         length=2;
1658                         goto outputBytes;
1659                     } else {
1660                         /* quote from dynamic window */
1661                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1662                         length=2;
1663                         goto outputBytes;
1664                     }
1665                 } else if((window=getWindow(staticOffsets, c))>=0) {
1666                     /* quote from static window */
1667                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1668                     length=2;
1669                     goto outputBytes;
1670                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1671                     /* define a dynamic window with this character */
1672                     dynamicWindow=getNextDynamicWindow(scsu);
1673                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1674                     useDynamicWindow(scsu, dynamicWindow);
1675                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1676                     length=3;
1677                     goto outputBytes;
1678                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1679                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1680                 ) {
1681                     /*
1682                      * this character is not compressible (a BMP ideograph or similar);
1683                      * switch to Unicode mode if this is the last character in the block
1684                      * or there is at least one more ideograph following immediately
1685                      */
1686                     isSingleByteMode=FALSE;
1687                     c|=SCU<<16;
1688                     length=3;
1689                     goto outputBytes;
1690                 } else {
1691                     /* quote Unicode */
1692                     c|=SQU<<16;
1693                     length=3;
1694                     goto outputBytes;
1695                 }
1696             }
1697 
1698             /* normal end of conversion: prepare for a new character */
1699             c=0;
1700         }
1701     } else {
1702         if(c!=0 && targetCapacity>0) {
1703             goto getTrailUnicode;
1704         }
1705 
1706         /* state machine for Unicode mode */
1707 /* unicodeByteMode: */
1708         while(source<sourceLimit) {
1709             if(targetCapacity<=0) {
1710                 /* target is full */
1711                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1712                 break;
1713             }
1714             c=*source++;
1715 
1716             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1717                 /* not compressible, write character directly */
1718                 if(targetCapacity>=2) {
1719                     *target++=(uint8_t)(c>>8);
1720                     *target++=(uint8_t)c;
1721                     targetCapacity-=2;
1722                 } else {
1723                     length=2;
1724                     goto outputBytes;
1725                 }
1726             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1727                 /* compress BMP character if the following one is not an uncompressible ideograph */
1728                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1729                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1730                         /* ASCII digit or letter */
1731                         isSingleByteMode=TRUE;
1732                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1733                         length=2;
1734                         goto outputBytes;
1735                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1736                         /* there is a dynamic window that contains this character, change to it */
1737                         isSingleByteMode=TRUE;
1738                         dynamicWindow=window;
1739                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1740                         useDynamicWindow(scsu, dynamicWindow);
1741                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1742                         length=2;
1743                         goto outputBytes;
1744                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1745                         /* define a dynamic window with this character */
1746                         isSingleByteMode=TRUE;
1747                         dynamicWindow=getNextDynamicWindow(scsu);
1748                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1749                         useDynamicWindow(scsu, dynamicWindow);
1750                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1751                         length=3;
1752                         goto outputBytes;
1753                     }
1754                 }
1755 
1756                 /* don't know how to compress this character, just write it directly */
1757                 length=2;
1758                 goto outputBytes;
1759             } else if(c<0xe000) {
1760                 /* c is a surrogate */
1761                 if(UTF_IS_SURROGATE_FIRST(c)) {
1762 getTrailUnicode:
1763                     lead=(UChar)c;
1764                     if(source<sourceLimit) {
1765                         /* test the following code unit */
1766                         trail=*source;
1767                         if(UTF_IS_SECOND_SURROGATE(trail)) {
1768                             ++source;
1769                             c=UTF16_GET_PAIR_VALUE(c, trail);
1770                             /* convert this surrogate code point */
1771                             /* exit this condition tree */
1772                         } else {
1773                             /* this is an unmatched lead code unit (1st surrogate) */
1774                             /* callback(illegal) */
1775                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1776                             goto endloop;
1777                         }
1778                     } else {
1779                         /* no more input */
1780                         break;
1781                     }
1782                 } else {
1783                     /* this is an unmatched trail code unit (2nd surrogate) */
1784                     /* callback(illegal) */
1785                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1786                     goto endloop;
1787                 }
1788 
1789                 /* compress supplementary character */
1790                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1791                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1792                 ) {
1793                     /*
1794                      * there is a dynamic window that contains this character and
1795                      * the following character is not uncompressible,
1796                      * change to the window
1797                      */
1798                     isSingleByteMode=TRUE;
1799                     dynamicWindow=window;
1800                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1801                     useDynamicWindow(scsu, dynamicWindow);
1802                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1803                     length=2;
1804                     goto outputBytes;
1805                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1806                           (code=getDynamicOffset(c, &offset))>=0
1807                 ) {
1808                     /* two supplementary characters in (probably) the same window - define an extended one */
1809                     isSingleByteMode=TRUE;
1810                     code-=0x200;
1811                     dynamicWindow=getNextDynamicWindow(scsu);
1812                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1813                     useDynamicWindow(scsu, dynamicWindow);
1814                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1815                     length=4;
1816                     goto outputBytes;
1817                 } else {
1818                     /* don't know how to compress this character, just write it directly */
1819                     c=((uint32_t)lead<<16)|trail;
1820                     length=4;
1821                     goto outputBytes;
1822                 }
1823             } else /* 0xe000<=c<0xf300 */ {
1824                 /* quote to avoid SCSU tags */
1825                 c|=UQU<<16;
1826                 length=3;
1827                 goto outputBytes;
1828             }
1829 
1830             /* normal end of conversion: prepare for a new character */
1831             c=0;
1832         }
1833     }
1834 endloop:
1835 
1836     /* set the converter state back into UConverter */
1837     scsu->fromUIsSingleByteMode=isSingleByteMode;
1838     scsu->fromUDynamicWindow=dynamicWindow;
1839 
1840     cnv->fromUChar32=c;
1841 
1842     /* write back the updated pointers */
1843     pArgs->source=source;
1844     pArgs->target=(char *)target;
1845     return;
1846 
1847 outputBytes:
1848     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1849     /* from the first if in the loop we know that targetCapacity>0 */
1850     if(length<=targetCapacity) {
1851         switch(length) {
1852             /* each branch falls through to the next one */
1853         case 4:
1854             *target++=(uint8_t)(c>>24);
1855         case 3:
1856             *target++=(uint8_t)(c>>16);
1857         case 2:
1858             *target++=(uint8_t)(c>>8);
1859         case 1:
1860             *target++=(uint8_t)c;
1861         default:
1862             /* will never occur */
1863             break;
1864         }
1865         targetCapacity-=length;
1866 
1867         /* normal end of conversion: prepare for a new character */
1868         c=0;
1869         goto loop;
1870     } else {
1871         uint8_t *p;
1872 
1873         /*
1874          * We actually do this backwards here:
1875          * In order to save an intermediate variable, we output
1876          * first to the overflow buffer what does not fit into the
1877          * regular target.
1878          */
1879         /* we know that 0<=targetCapacity<length<=4 */
1880         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1881         length-=targetCapacity;
1882         p=(uint8_t *)cnv->charErrorBuffer;
1883         switch(length) {
1884             /* each branch falls through to the next one */
1885         case 4:
1886             *p++=(uint8_t)(c>>24);
1887         case 3:
1888             *p++=(uint8_t)(c>>16);
1889         case 2:
1890             *p++=(uint8_t)(c>>8);
1891         case 1:
1892             *p=(uint8_t)c;
1893         default:
1894             /* will never occur */
1895             break;
1896         }
1897         cnv->charErrorBufferLength=(int8_t)length;
1898 
1899         /* now output what fits into the regular target */
1900         c>>=8*length; /* length was reduced by targetCapacity */
1901         switch(targetCapacity) {
1902             /* each branch falls through to the next one */
1903         case 3:
1904             *target++=(uint8_t)(c>>16);
1905         case 2:
1906             *target++=(uint8_t)(c>>8);
1907         case 1:
1908             *target++=(uint8_t)c;
1909         default:
1910             break;
1911         }
1912 
1913         /* target overflow */
1914         targetCapacity=0;
1915         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1916         c=0;
1917         goto endloop;
1918     }
1919 }
1920 
1921 /* miscellaneous ------------------------------------------------------------ */
1922 
1923 static const char *
_SCSUGetName(const UConverter * cnv)1924 _SCSUGetName(const UConverter *cnv) {
1925     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1926 
1927     switch(scsu->locale) {
1928     case l_ja:
1929         return "SCSU,locale=ja";
1930     default:
1931         return "SCSU";
1932     }
1933 }
1934 
1935 /* structure for SafeClone calculations */
1936 struct cloneSCSUStruct
1937 {
1938     UConverter cnv;
1939     SCSUData mydata;
1940 };
1941 
1942 static UConverter *
_SCSUSafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)1943 _SCSUSafeClone(const UConverter *cnv,
1944                void *stackBuffer,
1945                int32_t *pBufferSize,
1946                UErrorCode *status)
1947 {
1948     struct cloneSCSUStruct * localClone;
1949     int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
1950 
1951     if (U_FAILURE(*status)){
1952         return 0;
1953     }
1954 
1955     if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1956         *pBufferSize = bufferSizeNeeded;
1957         return 0;
1958     }
1959 
1960     localClone = (struct cloneSCSUStruct *)stackBuffer;
1961     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1962 
1963     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1964     localClone->cnv.extraInfo = &localClone->mydata;
1965     localClone->cnv.isExtraLocal = TRUE;
1966 
1967     return &localClone->cnv;
1968 }
1969 
1970 
1971 static const UConverterImpl _SCSUImpl={
1972     UCNV_SCSU,
1973 
1974     NULL,
1975     NULL,
1976 
1977     _SCSUOpen,
1978     _SCSUClose,
1979     _SCSUReset,
1980 
1981     _SCSUToUnicode,
1982     _SCSUToUnicodeWithOffsets,
1983     _SCSUFromUnicode,
1984     _SCSUFromUnicodeWithOffsets,
1985     NULL,
1986 
1987     NULL,
1988     _SCSUGetName,
1989     NULL,
1990     _SCSUSafeClone,
1991     ucnv_getCompleteUnicodeSet
1992 };
1993 
1994 static const UConverterStaticData _SCSUStaticData={
1995     sizeof(UConverterStaticData),
1996     "SCSU",
1997     1212, /* CCSID for SCSU */
1998     UCNV_IBM, UCNV_SCSU,
1999     1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2000     /*
2001      * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2002      * substitution string.
2003      */
2004     { 0x0e, 0xff, 0xfd, 0 }, 3,
2005     FALSE, FALSE,
2006     0,
2007     0,
2008     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2009 };
2010 
2011 const UConverterSharedData _SCSUData={
2012     sizeof(UConverterSharedData), ~((uint32_t)0),
2013     NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
2014     0
2015 };
2016 
2017 #endif
2018