• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2000-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  ucnvscsu.c
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2000nov18
16 *   created by: Markus W. Scherer
17 *
18 *   This is an implementation of the Standard Compression Scheme for Unicode
19 *   as defined in http://www.unicode.org/unicode/reports/tr6/ .
20 *   Reserved commands and window settings are treated as illegal sequences and
21 *   will result in callback calls.
22 */
23 
24 #include "unicode/utypes.h"
25 
26 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
27 
28 #include "unicode/ucnv.h"
29 #include "unicode/ucnv_cb.h"
30 #include "unicode/utf16.h"
31 #include "ucnv_bld.h"
32 #include "ucnv_cnv.h"
33 #include "cmemory.h"
34 
35 /* SCSU definitions --------------------------------------------------------- */
36 
/*
 * SCSU command byte values.
 *
 * The S* values are the commands recognized in single-byte mode,
 * the U* values are the commands recognized in Unicode mode.
 * Only the first and last value of each contiguous range (e.g. SQ0..SQ7)
 * is named; the values in between select the corresponding window number.
 */
enum {
    /* single-byte mode commands */
    SQ0=0x01, /* Quote from window pair 0 */
    SQ7=0x08, /* Quote from window pair 7 */
    SDX=0x0B, /* Define a window as extended */
    Srs=0x0C, /* reserved */
    SQU=0x0E, /* Quote a single Unicode character */
    SCU=0x0F, /* Change to Unicode mode */
    SC0=0x10, /* Select window 0 */
    SC7=0x17, /* Select window 7 */
    SD0=0x18, /* Define and select window 0 */
    SD7=0x1F, /* Define and select window 7 */

    /* Unicode mode commands */
    UC0=0xE0, /* Select window 0 */
    UC7=0xE7, /* Select window 7 */
    UD0=0xE8, /* Define and select window 0 */
    UD7=0xEF, /* Define and select window 7 */
    UQU=0xF0, /* Quote a single Unicode character */
    UDX=0xF1, /* Define a window as extended */
    Urs=0xF2  /* reserved */
};
58 
/* thresholds and constants for decoding a one-byte window offset value */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic windows, since in these areas no short-run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold.
     */
    gapThreshold=0x68,
    gapOffset=0xAC00,

    /* values between reservedStart and fixedThreshold are reserved */
    reservedStart=0xA8,

    /* use table of predefined fixed offsets for values from fixedThreshold */
    fixedThreshold=0xF9
};
74 
/*
 * Constant offsets for the 8 static windows.
 * Indexed by the quote window number (0..7); all of them are in the BMP.
 */
static const uint32_t staticOffsets[8]={
    0x0000, /* ASCII for quoted tags */
    0x0080, /* Latin - 1 Supplement (for access to punctuation) */
    0x0100, /* Latin Extended-A */
    0x0300, /* Combining Diacritical Marks */
    0x2000, /* General Punctuation */
    0x2080, /* Currency Symbols */
    0x2100, /* Letterlike Symbols and Number Forms */
    0x3000  /* CJK Symbols and punctuation */
};
86 
/*
 * Initial offsets for the 8 dynamic (sliding) windows.
 * Copied into the per-converter window tables whenever the converter is reset.
 */
static const uint32_t initialDynamicOffsets[8]={
    0x0080, /* Latin-1 */
    0x00C0, /* Latin Extended A */
    0x0400, /* Cyrillic */
    0x0600, /* Arabic */
    0x0900, /* Devanagari */
    0x3040, /* Hiragana */
    0x30A0, /* Katakana */
    0xFF00  /* Fullwidth ASCII */
};
98 
/*
 * Table of fixed predefined offsets.
 * Indexed by (window offset byte - fixedThreshold), i.e. entry 0 belongs to
 * the offset byte 0xF9 and the last entry to 0xFF.
 */
static const uint32_t fixedOffsets[]={
    /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    /* 0xFA */ 0x0250, /* IPA extensions */
    /* 0xFB */ 0x0370, /* Greek */
    /* 0xFC */ 0x0530, /* Armenian */
    /* 0xFD */ 0x3040, /* Hiragana */
    /* 0xFE */ 0x30A0, /* Katakana */
    /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
};
109 
/* state values of the toUnicode state machine */
enum {
    readCommand,    /* not inside a multi-byte sequence; next byte is a command or a character */
    quotePairOne,   /* expecting the first byte of a quoted 16-bit code unit */
    quotePairTwo,   /* expecting the second byte of a 16-bit code unit */
    quoteOne,       /* expecting the single byte following a quote-from-window command */
    definePairOne,  /* expecting the first byte of an extended window definition */
    definePairTwo,  /* expecting the second byte of an extended window definition */
    defineOne       /* expecting the one-byte window offset of a define-window command */
};
120 
121 typedef struct SCSUData {
122     /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
123     uint32_t toUDynamicOffsets[8];
124     uint32_t fromUDynamicOffsets[8];
125 
126     /* state machine state - toUnicode */
127     UBool toUIsSingleByteMode;
128     uint8_t toUState;
129     int8_t toUQuoteWindow, toUDynamicWindow;
130     uint8_t toUByteOne;
131     uint8_t toUPadding[3];
132 
133     /* state machine state - fromUnicode */
134     UBool fromUIsSingleByteMode;
135     int8_t fromUDynamicWindow;
136 
137     /*
138      * windowUse[] keeps track of the use of the dynamic windows:
139      * At nextWindowUseIndex there is the least recently used window,
140      * and the following windows (in a wrapping manner) are more and more
141      * recently used.
142      * At nextWindowUseIndex-1 there is the most recently used window.
143      */
144     uint8_t locale;
145     int8_t nextWindowUseIndex;
146     int8_t windowUse[8];
147 } SCSUData;
148 
/*
 * Initial contents of SCSUData.windowUse[] (dynamic window numbers, starting
 * with the one treated as least recently used): the generic ordering and the
 * ordering used for the Japanese ("ja") locale.
 */
static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
151 
/* values for SCSUData.locale */
enum {
    lGeneric, l_ja
};
155 
156 /* SCSU setup functions ----------------------------------------------------- */
157 U_CDECL_BEGIN
158 static void U_CALLCONV
_SCSUReset(UConverter * cnv,UConverterResetChoice choice)159 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
160     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
161 
162     if(choice<=UCNV_RESET_TO_UNICODE) {
163         /* reset toUnicode */
164         uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
165 
166         scsu->toUIsSingleByteMode=TRUE;
167         scsu->toUState=readCommand;
168         scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
169         scsu->toUByteOne=0;
170 
171         cnv->toULength=0;
172     }
173     if(choice!=UCNV_RESET_TO_UNICODE) {
174         /* reset fromUnicode */
175         uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
176 
177         scsu->fromUIsSingleByteMode=TRUE;
178         scsu->fromUDynamicWindow=0;
179 
180         scsu->nextWindowUseIndex=0;
181         switch(scsu->locale) {
182         case l_ja:
183             uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
184             break;
185         default:
186             uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
187             break;
188         }
189 
190         cnv->fromUChar32=0;
191     }
192 }
193 
194 static void U_CALLCONV
_SCSUOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)195 _SCSUOpen(UConverter *cnv,
196           UConverterLoadArgs *pArgs,
197           UErrorCode *pErrorCode) {
198     const char *locale=pArgs->locale;
199     if(pArgs->onlyTestIsLoadable) {
200         return;
201     }
202     cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
203     if(cnv->extraInfo!=NULL) {
204         if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
205             ((SCSUData *)cnv->extraInfo)->locale=l_ja;
206         } else {
207             ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
208         }
209         _SCSUReset(cnv, UCNV_RESET_BOTH);
210     } else {
211         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
212     }
213 
214     /* Set the substitution character U+fffd as a Unicode string. */
215     cnv->subUChars[0]=0xfffd;
216     cnv->subCharLen=-1;
217 }
218 
219 static void U_CALLCONV
_SCSUClose(UConverter * cnv)220 _SCSUClose(UConverter *cnv) {
221     if(cnv->extraInfo!=NULL) {
222         if(!cnv->isExtraLocal) {
223             uprv_free(cnv->extraInfo);
224         }
225         cnv->extraInfo=NULL;
226     }
227 }
228 
229 /* SCSU-to-Unicode conversion functions ------------------------------------- */
230 
231 static void U_CALLCONV
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)232 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
233                           UErrorCode *pErrorCode) {
234     UConverter *cnv;
235     SCSUData *scsu;
236     const uint8_t *source, *sourceLimit;
237     UChar *target;
238     const UChar *targetLimit;
239     int32_t *offsets;
240     UBool isSingleByteMode;
241     uint8_t state, byteOne;
242     int8_t quoteWindow, dynamicWindow;
243 
244     int32_t sourceIndex, nextSourceIndex;
245 
246     uint8_t b;
247 
248     /* set up the local pointers */
249     cnv=pArgs->converter;
250     scsu=(SCSUData *)cnv->extraInfo;
251 
252     source=(const uint8_t *)pArgs->source;
253     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
254     target=pArgs->target;
255     targetLimit=pArgs->targetLimit;
256     offsets=pArgs->offsets;
257 
258     /* get the state machine state */
259     isSingleByteMode=scsu->toUIsSingleByteMode;
260     state=scsu->toUState;
261     quoteWindow=scsu->toUQuoteWindow;
262     dynamicWindow=scsu->toUDynamicWindow;
263     byteOne=scsu->toUByteOne;
264 
265     /* sourceIndex=-1 if the current character began in the previous buffer */
266     sourceIndex=state==readCommand ? 0 : -1;
267     nextSourceIndex=0;
268 
269     /*
270      * conversion "loop"
271      *
272      * For performance, this is not a normal C loop.
273      * Instead, there are two code blocks for the two SCSU modes.
274      * The function branches to either one, and a change of the mode is done with a goto to
275      * the other branch.
276      *
277      * Each branch has two conventional loops:
278      * - a fast-path loop for the most common codes in the mode
279      * - a loop for all other codes in the mode
280      * When the fast-path runs into a code that it cannot handle, its loop ends and it
281      * runs into the following loop to handle the other codes.
282      * The end of the input or output buffer is also handled by the slower loop.
283      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
284      *
285      * The callback handling is done by returning with an error code.
286      * The conversion framework actually calls the callback function.
287      */
288     if(isSingleByteMode) {
289         /* fast path for single-byte mode */
290         if(state==readCommand) {
291 fastSingle:
292             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
293                 ++source;
294                 ++nextSourceIndex;
295                 if(b<=0x7f) {
296                     /* write US-ASCII graphic character or DEL */
297                     *target++=(UChar)b;
298                     if(offsets!=NULL) {
299                         *offsets++=sourceIndex;
300                     }
301                 } else {
302                     /* write from dynamic window */
303                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
304                     if(c<=0xffff) {
305                         *target++=(UChar)c;
306                         if(offsets!=NULL) {
307                             *offsets++=sourceIndex;
308                         }
309                     } else {
310                         /* output surrogate pair */
311                         *target++=(UChar)(0xd7c0+(c>>10));
312                         if(target<targetLimit) {
313                             *target++=(UChar)(0xdc00|(c&0x3ff));
314                             if(offsets!=NULL) {
315                                 *offsets++=sourceIndex;
316                                 *offsets++=sourceIndex;
317                             }
318                         } else {
319                             /* target overflow */
320                             if(offsets!=NULL) {
321                                 *offsets++=sourceIndex;
322                             }
323                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
324                             cnv->UCharErrorBufferLength=1;
325                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
326                             goto endloop;
327                         }
328                     }
329                 }
330                 sourceIndex=nextSourceIndex;
331             }
332         }
333 
334         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
335 singleByteMode:
336         while(source<sourceLimit) {
337             if(target>=targetLimit) {
338                 /* target is full */
339                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
340                 break;
341             }
342             b=*source++;
343             ++nextSourceIndex;
344             switch(state) {
345             case readCommand:
346                 /* redundant conditions are commented out */
347                 /* here: b<0x20 because otherwise we would be in fastSingle */
348                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
349                     /* CR/LF/TAB/NUL */
350                     *target++=(UChar)b;
351                     if(offsets!=NULL) {
352                         *offsets++=sourceIndex;
353                     }
354                     sourceIndex=nextSourceIndex;
355                     goto fastSingle;
356                 } else if(SC0<=b) {
357                     if(b<=SC7) {
358                         dynamicWindow=(int8_t)(b-SC0);
359                         sourceIndex=nextSourceIndex;
360                         goto fastSingle;
361                     } else /* if(SD0<=b && b<=SD7) */ {
362                         dynamicWindow=(int8_t)(b-SD0);
363                         state=defineOne;
364                     }
365                 } else if(/* SQ0<=b && */ b<=SQ7) {
366                     quoteWindow=(int8_t)(b-SQ0);
367                     state=quoteOne;
368                 } else if(b==SDX) {
369                     state=definePairOne;
370                 } else if(b==SQU) {
371                     state=quotePairOne;
372                 } else if(b==SCU) {
373                     sourceIndex=nextSourceIndex;
374                     isSingleByteMode=FALSE;
375                     goto fastUnicode;
376                 } else /* Srs */ {
377                     /* callback(illegal) */
378                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
379                     cnv->toUBytes[0]=b;
380                     cnv->toULength=1;
381                     goto endloop;
382                 }
383 
384                 /* store the first byte of a multibyte sequence in toUBytes[] */
385                 cnv->toUBytes[0]=b;
386                 cnv->toULength=1;
387                 break;
388             case quotePairOne:
389                 byteOne=b;
390                 cnv->toUBytes[1]=b;
391                 cnv->toULength=2;
392                 state=quotePairTwo;
393                 break;
394             case quotePairTwo:
395                 *target++=(UChar)((byteOne<<8)|b);
396                 if(offsets!=NULL) {
397                     *offsets++=sourceIndex;
398                 }
399                 sourceIndex=nextSourceIndex;
400                 state=readCommand;
401                 goto fastSingle;
402             case quoteOne:
403                 if(b<0x80) {
404                     /* all static offsets are in the BMP */
405                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
406                     if(offsets!=NULL) {
407                         *offsets++=sourceIndex;
408                     }
409                 } else {
410                     /* write from dynamic window */
411                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
412                     if(c<=0xffff) {
413                         *target++=(UChar)c;
414                         if(offsets!=NULL) {
415                             *offsets++=sourceIndex;
416                         }
417                     } else {
418                         /* output surrogate pair */
419                         *target++=(UChar)(0xd7c0+(c>>10));
420                         if(target<targetLimit) {
421                             *target++=(UChar)(0xdc00|(c&0x3ff));
422                             if(offsets!=NULL) {
423                                 *offsets++=sourceIndex;
424                                 *offsets++=sourceIndex;
425                             }
426                         } else {
427                             /* target overflow */
428                             if(offsets!=NULL) {
429                                 *offsets++=sourceIndex;
430                             }
431                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
432                             cnv->UCharErrorBufferLength=1;
433                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
434                             goto endloop;
435                         }
436                     }
437                 }
438                 sourceIndex=nextSourceIndex;
439                 state=readCommand;
440                 goto fastSingle;
441             case definePairOne:
442                 dynamicWindow=(int8_t)((b>>5)&7);
443                 byteOne=(uint8_t)(b&0x1f);
444                 cnv->toUBytes[1]=b;
445                 cnv->toULength=2;
446                 state=definePairTwo;
447                 break;
448             case definePairTwo:
449                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
450                 sourceIndex=nextSourceIndex;
451                 state=readCommand;
452                 goto fastSingle;
453             case defineOne:
454                 if(b==0) {
455                     /* callback(illegal): Reserved window offset value 0 */
456                     cnv->toUBytes[1]=b;
457                     cnv->toULength=2;
458                     goto endloop;
459                 } else if(b<gapThreshold) {
460                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
461                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
462                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
463                 } else if(b>=fixedThreshold) {
464                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
465                 } else {
466                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
467                     cnv->toUBytes[1]=b;
468                     cnv->toULength=2;
469                     goto endloop;
470                 }
471                 sourceIndex=nextSourceIndex;
472                 state=readCommand;
473                 goto fastSingle;
474             }
475         }
476     } else {
477         /* fast path for Unicode mode */
478         if(state==readCommand) {
479 fastUnicode:
480             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
481                 *target++=(UChar)((b<<8)|source[1]);
482                 if(offsets!=NULL) {
483                     *offsets++=sourceIndex;
484                 }
485                 sourceIndex=nextSourceIndex;
486                 nextSourceIndex+=2;
487                 source+=2;
488             }
489         }
490 
491         /* normal state machine for Unicode mode */
492 /* unicodeByteMode: */
493         while(source<sourceLimit) {
494             if(target>=targetLimit) {
495                 /* target is full */
496                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
497                 break;
498             }
499             b=*source++;
500             ++nextSourceIndex;
501             switch(state) {
502             case readCommand:
503                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
504                     byteOne=b;
505                     cnv->toUBytes[0]=b;
506                     cnv->toULength=1;
507                     state=quotePairTwo;
508                 } else if(/* UC0<=b && */ b<=UC7) {
509                     dynamicWindow=(int8_t)(b-UC0);
510                     sourceIndex=nextSourceIndex;
511                     isSingleByteMode=TRUE;
512                     goto fastSingle;
513                 } else if(/* UD0<=b && */ b<=UD7) {
514                     dynamicWindow=(int8_t)(b-UD0);
515                     isSingleByteMode=TRUE;
516                     cnv->toUBytes[0]=b;
517                     cnv->toULength=1;
518                     state=defineOne;
519                     goto singleByteMode;
520                 } else if(b==UDX) {
521                     isSingleByteMode=TRUE;
522                     cnv->toUBytes[0]=b;
523                     cnv->toULength=1;
524                     state=definePairOne;
525                     goto singleByteMode;
526                 } else if(b==UQU) {
527                     cnv->toUBytes[0]=b;
528                     cnv->toULength=1;
529                     state=quotePairOne;
530                 } else /* Urs */ {
531                     /* callback(illegal) */
532                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
533                     cnv->toUBytes[0]=b;
534                     cnv->toULength=1;
535                     goto endloop;
536                 }
537                 break;
538             case quotePairOne:
539                 byteOne=b;
540                 cnv->toUBytes[1]=b;
541                 cnv->toULength=2;
542                 state=quotePairTwo;
543                 break;
544             case quotePairTwo:
545                 *target++=(UChar)((byteOne<<8)|b);
546                 if(offsets!=NULL) {
547                     *offsets++=sourceIndex;
548                 }
549                 sourceIndex=nextSourceIndex;
550                 state=readCommand;
551                 goto fastUnicode;
552             }
553         }
554     }
555 endloop:
556 
557     /* set the converter state back into UConverter */
558     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
559         /* reset to deal with the next character */
560         state=readCommand;
561     } else if(state==readCommand) {
562         /* not in a multi-byte sequence, reset toULength */
563         cnv->toULength=0;
564     }
565     scsu->toUIsSingleByteMode=isSingleByteMode;
566     scsu->toUState=state;
567     scsu->toUQuoteWindow=quoteWindow;
568     scsu->toUDynamicWindow=dynamicWindow;
569     scsu->toUByteOne=byteOne;
570 
571     /* write back the updated pointers */
572     pArgs->source=(const char *)source;
573     pArgs->target=target;
574     pArgs->offsets=offsets;
575     return;
576 }
577 
578 /*
579  * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
580  * If a change is made in the original function, then either
581  * change this function the same way or
582  * re-copy the original function and remove the variables
583  * offsets, sourceIndex, and nextSourceIndex.
584  */
585 static void U_CALLCONV
_SCSUToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)586 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
587                UErrorCode *pErrorCode) {
588     UConverter *cnv;
589     SCSUData *scsu;
590     const uint8_t *source, *sourceLimit;
591     UChar *target;
592     const UChar *targetLimit;
593     UBool isSingleByteMode;
594     uint8_t state, byteOne;
595     int8_t quoteWindow, dynamicWindow;
596 
597     uint8_t b;
598 
599     /* set up the local pointers */
600     cnv=pArgs->converter;
601     scsu=(SCSUData *)cnv->extraInfo;
602 
603     source=(const uint8_t *)pArgs->source;
604     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
605     target=pArgs->target;
606     targetLimit=pArgs->targetLimit;
607 
608     /* get the state machine state */
609     isSingleByteMode=scsu->toUIsSingleByteMode;
610     state=scsu->toUState;
611     quoteWindow=scsu->toUQuoteWindow;
612     dynamicWindow=scsu->toUDynamicWindow;
613     byteOne=scsu->toUByteOne;
614 
615     /*
616      * conversion "loop"
617      *
618      * For performance, this is not a normal C loop.
619      * Instead, there are two code blocks for the two SCSU modes.
620      * The function branches to either one, and a change of the mode is done with a goto to
621      * the other branch.
622      *
623      * Each branch has two conventional loops:
624      * - a fast-path loop for the most common codes in the mode
625      * - a loop for all other codes in the mode
626      * When the fast-path runs into a code that it cannot handle, its loop ends and it
627      * runs into the following loop to handle the other codes.
628      * The end of the input or output buffer is also handled by the slower loop.
629      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
630      *
631      * The callback handling is done by returning with an error code.
632      * The conversion framework actually calls the callback function.
633      */
634     if(isSingleByteMode) {
635         /* fast path for single-byte mode */
636         if(state==readCommand) {
637 fastSingle:
638             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
639                 ++source;
640                 if(b<=0x7f) {
641                     /* write US-ASCII graphic character or DEL */
642                     *target++=(UChar)b;
643                 } else {
644                     /* write from dynamic window */
645                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
646                     if(c<=0xffff) {
647                         *target++=(UChar)c;
648                     } else {
649                         /* output surrogate pair */
650                         *target++=(UChar)(0xd7c0+(c>>10));
651                         if(target<targetLimit) {
652                             *target++=(UChar)(0xdc00|(c&0x3ff));
653                         } else {
654                             /* target overflow */
655                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
656                             cnv->UCharErrorBufferLength=1;
657                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
658                             goto endloop;
659                         }
660                     }
661                 }
662             }
663         }
664 
665         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
666 singleByteMode:
667         while(source<sourceLimit) {
668             if(target>=targetLimit) {
669                 /* target is full */
670                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
671                 break;
672             }
673             b=*source++;
674             switch(state) {
675             case readCommand:
676                 /* redundant conditions are commented out */
677                 /* here: b<0x20 because otherwise we would be in fastSingle */
678                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
679                     /* CR/LF/TAB/NUL */
680                     *target++=(UChar)b;
681                     goto fastSingle;
682                 } else if(SC0<=b) {
683                     if(b<=SC7) {
684                         dynamicWindow=(int8_t)(b-SC0);
685                         goto fastSingle;
686                     } else /* if(SD0<=b && b<=SD7) */ {
687                         dynamicWindow=(int8_t)(b-SD0);
688                         state=defineOne;
689                     }
690                 } else if(/* SQ0<=b && */ b<=SQ7) {
691                     quoteWindow=(int8_t)(b-SQ0);
692                     state=quoteOne;
693                 } else if(b==SDX) {
694                     state=definePairOne;
695                 } else if(b==SQU) {
696                     state=quotePairOne;
697                 } else if(b==SCU) {
698                     isSingleByteMode=FALSE;
699                     goto fastUnicode;
700                 } else /* Srs */ {
701                     /* callback(illegal) */
702                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
703                     cnv->toUBytes[0]=b;
704                     cnv->toULength=1;
705                     goto endloop;
706                 }
707 
708                 /* store the first byte of a multibyte sequence in toUBytes[] */
709                 cnv->toUBytes[0]=b;
710                 cnv->toULength=1;
711                 break;
712             case quotePairOne:
713                 byteOne=b;
714                 cnv->toUBytes[1]=b;
715                 cnv->toULength=2;
716                 state=quotePairTwo;
717                 break;
718             case quotePairTwo:
719                 *target++=(UChar)((byteOne<<8)|b);
720                 state=readCommand;
721                 goto fastSingle;
722             case quoteOne:
723                 if(b<0x80) {
724                     /* all static offsets are in the BMP */
725                     *target++=(UChar)(staticOffsets[quoteWindow]+b);
726                 } else {
727                     /* write from dynamic window */
728                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
729                     if(c<=0xffff) {
730                         *target++=(UChar)c;
731                     } else {
732                         /* output surrogate pair */
733                         *target++=(UChar)(0xd7c0+(c>>10));
734                         if(target<targetLimit) {
735                             *target++=(UChar)(0xdc00|(c&0x3ff));
736                         } else {
737                             /* target overflow */
738                             cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
739                             cnv->UCharErrorBufferLength=1;
740                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
741                             goto endloop;
742                         }
743                     }
744                 }
745                 state=readCommand;
746                 goto fastSingle;
747             case definePairOne:
748                 dynamicWindow=(int8_t)((b>>5)&7);
749                 byteOne=(uint8_t)(b&0x1f);
750                 cnv->toUBytes[1]=b;
751                 cnv->toULength=2;
752                 state=definePairTwo;
753                 break;
754             case definePairTwo:
755                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
756                 state=readCommand;
757                 goto fastSingle;
758             case defineOne:
759                 if(b==0) {
760                     /* callback(illegal): Reserved window offset value 0 */
761                     cnv->toUBytes[1]=b;
762                     cnv->toULength=2;
763                     goto endloop;
764                 } else if(b<gapThreshold) {
765                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
766                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
767                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
768                 } else if(b>=fixedThreshold) {
769                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
770                 } else {
771                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
772                     cnv->toUBytes[1]=b;
773                     cnv->toULength=2;
774                     goto endloop;
775                 }
776                 state=readCommand;
777                 goto fastSingle;
778             }
779         }
780     } else {
781         /* fast path for Unicode mode */
782         if(state==readCommand) {
783 fastUnicode:
784             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
785                 *target++=(UChar)((b<<8)|source[1]);
786                 source+=2;
787             }
788         }
789 
790         /* normal state machine for Unicode mode */
791 /* unicodeByteMode: */
792         while(source<sourceLimit) {
793             if(target>=targetLimit) {
794                 /* target is full */
795                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
796                 break;
797             }
798             b=*source++;
799             switch(state) {
800             case readCommand:
801                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
802                     byteOne=b;
803                     cnv->toUBytes[0]=b;
804                     cnv->toULength=1;
805                     state=quotePairTwo;
806                 } else if(/* UC0<=b && */ b<=UC7) {
807                     dynamicWindow=(int8_t)(b-UC0);
808                     isSingleByteMode=TRUE;
809                     goto fastSingle;
810                 } else if(/* UD0<=b && */ b<=UD7) {
811                     dynamicWindow=(int8_t)(b-UD0);
812                     isSingleByteMode=TRUE;
813                     cnv->toUBytes[0]=b;
814                     cnv->toULength=1;
815                     state=defineOne;
816                     goto singleByteMode;
817                 } else if(b==UDX) {
818                     isSingleByteMode=TRUE;
819                     cnv->toUBytes[0]=b;
820                     cnv->toULength=1;
821                     state=definePairOne;
822                     goto singleByteMode;
823                 } else if(b==UQU) {
824                     cnv->toUBytes[0]=b;
825                     cnv->toULength=1;
826                     state=quotePairOne;
827                 } else /* Urs */ {
828                     /* callback(illegal) */
829                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
830                     cnv->toUBytes[0]=b;
831                     cnv->toULength=1;
832                     goto endloop;
833                 }
834                 break;
835             case quotePairOne:
836                 byteOne=b;
837                 cnv->toUBytes[1]=b;
838                 cnv->toULength=2;
839                 state=quotePairTwo;
840                 break;
841             case quotePairTwo:
842                 *target++=(UChar)((byteOne<<8)|b);
843                 state=readCommand;
844                 goto fastUnicode;
845             }
846         }
847     }
848 endloop:
849 
850     /* set the converter state back into UConverter */
851     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
852         /* reset to deal with the next character */
853         state=readCommand;
854     } else if(state==readCommand) {
855         /* not in a multi-byte sequence, reset toULength */
856         cnv->toULength=0;
857     }
858     scsu->toUIsSingleByteMode=isSingleByteMode;
859     scsu->toUState=state;
860     scsu->toUQuoteWindow=quoteWindow;
861     scsu->toUDynamicWindow=dynamicWindow;
862     scsu->toUByteOne=byteOne;
863 
864     /* write back the updated pointers */
865     pArgs->source=(const char *)source;
866     pArgs->target=target;
867     return;
868 }
869 U_CDECL_END
870 /* SCSU-from-Unicode conversion functions ----------------------------------- */
871 
872 /*
873  * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
874  * reasonable results. The lookahead is minimal.
875  * Many cases are simple:
876  * A character fits directly into the current mode, a dynamic or static window,
877  * or is not compressible. These cases are tested first.
878  * Real compression heuristics are applied to the rest, in code branches for
879  * single/Unicode mode and BMP/supplementary code points.
880  * The heuristics used here are extremely simple.
881  */
882 
/*
 * Look up which of the eight windows covers the code point c.
 *
 * Each window spans the 128 code points starting at its entry in offsets[].
 * Returns the index (0..7) of the first matching window, or -1 if c is
 * outside all of them.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int8_t w=0;

    while(w<8) {
        /*
         * Unsigned subtraction wraps to a large value when c<offsets[w],
         * so a single comparison checks both window bounds.
         */
        uint32_t rel=c-offsets[w];
        if(rel<=0x7f) {
            return w;
        }
        ++w;
    }
    return -1;
}
894 
895 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
896 static UBool
isInOffsetWindowOrDirect(uint32_t offset,uint32_t c)897 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
898     return (UBool)(c<=offset+0x7f &&
899           (c>=offset || (c<=0x7f &&
900                         (c>=0x20 || (1UL<<c)&0x2601))));
901                                 /* binary 0010 0110 0000 0001,
902                                    check for b==0xd || b==0xa || b==9 || b==0 */
903 }
904 
905 /*
906  * getNextDynamicWindow returns the next dynamic window to be redefined
907  */
908 static int8_t
getNextDynamicWindow(SCSUData * scsu)909 getNextDynamicWindow(SCSUData *scsu) {
910     int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
911     if(++scsu->nextWindowUseIndex==8) {
912         scsu->nextWindowUseIndex=0;
913     }
914     return window;
915 }
916 
917 /*
918  * useDynamicWindow() adjusts
919  * windowUse[] and nextWindowUseIndex for the algorithm to choose
920  * the next dynamic window to be defined;
921  * a subclass may override it and provide its own algorithm.
922  */
923 static void
useDynamicWindow(SCSUData * scsu,int8_t window)924 useDynamicWindow(SCSUData *scsu, int8_t window) {
925     /*
926      * move the existing window, which just became the most recently used one,
927      * up in windowUse[] to nextWindowUseIndex-1
928      */
929 
930     /* first, find the index of the window - backwards to favor the more recently used windows */
931     int i, j;
932 
933     i=scsu->nextWindowUseIndex;
934     do {
935         if(--i<0) {
936             i=7;
937         }
938     } while(scsu->windowUse[i]!=window);
939 
940     /* now copy each windowUse[i+1] to [i] */
941     j=i+1;
942     if(j==8) {
943         j=0;
944     }
945     while(j!=scsu->nextWindowUseIndex) {
946         scsu->windowUse[i]=scsu->windowUse[j];
947         i=j;
948         if(++j==8) { j=0; }
949     }
950 
951     /* finally, set the window into the most recently used index */
952     scsu->windowUse[i]=window;
953 }
954 
955 /*
956  * calculate the offset and the code for a dynamic window that contains the character
957  * takes fixed offsets into account
958  * the offset of the window is stored in the offset variable,
959  * the code is returned
960  *
961  * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
962  */
963 static int
getDynamicOffset(uint32_t c,uint32_t * pOffset)964 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
965     int i;
966 
967     for(i=0; i<7; ++i) {
968         if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
969             *pOffset=fixedOffsets[i];
970             return 0xf9+i;
971         }
972     }
973 
974     if(c<0x80) {
975         /* No dynamic window for US-ASCII. */
976         return -1;
977     } else if(c<0x3400 ||
978               (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
979               (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
980     ) {
981         /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
982         *pOffset=c&0x7fffff80;
983         return (int)(c>>7);
984     } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
985         /* For these characters we need to take the gapOffset into account. */
986         *pOffset=c&0x7fffff80;
987         return (int)((c-gapOffset)>>7);
988     } else {
989         return -1;
990     }
991 }
992 U_CDECL_BEGIN
993 /*
994  * Idea for compression:
995  *  - save SCSUData and other state before really starting work
996  *  - at endloop, see if compression could be better with just unicode mode
997  *  - don't do this if a callback has been called
998  *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
999  *  - different buffer handling!
1000  *
1001  * Drawback or need for corrective handling:
1002  * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1003  * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1004  * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1005  *
1006  * How to achieve both?
1007  *  - Only replace the result after an SDX or SCU?
1008  */
1009 
1010 static void U_CALLCONV
_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1011 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1012                             UErrorCode *pErrorCode) {
1013     UConverter *cnv;
1014     SCSUData *scsu;
1015     const UChar *source, *sourceLimit;
1016     uint8_t *target;
1017     int32_t targetCapacity;
1018     int32_t *offsets;
1019 
1020     UBool isSingleByteMode;
1021     uint8_t dynamicWindow;
1022     uint32_t currentOffset;
1023 
1024     uint32_t c, delta;
1025 
1026     int32_t sourceIndex, nextSourceIndex;
1027 
1028     int32_t length;
1029 
1030     /* variables for compression heuristics */
1031     uint32_t offset;
1032     UChar lead, trail;
1033     int code;
1034     int8_t window;
1035 
1036     /* set up the local pointers */
1037     cnv=pArgs->converter;
1038     scsu=(SCSUData *)cnv->extraInfo;
1039 
1040     /* set up the local pointers */
1041     source=pArgs->source;
1042     sourceLimit=pArgs->sourceLimit;
1043     target=(uint8_t *)pArgs->target;
1044     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1045     offsets=pArgs->offsets;
1046 
1047     /* get the state machine state */
1048     isSingleByteMode=scsu->fromUIsSingleByteMode;
1049     dynamicWindow=scsu->fromUDynamicWindow;
1050     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1051 
1052     c=cnv->fromUChar32;
1053 
1054     /* sourceIndex=-1 if the current character began in the previous buffer */
1055     sourceIndex= c==0 ? 0 : -1;
1056     nextSourceIndex=0;
1057 
1058     /* similar conversion "loop" as in toUnicode */
1059 loop:
1060     if(isSingleByteMode) {
1061         if(c!=0 && targetCapacity>0) {
1062             goto getTrailSingle;
1063         }
1064 
1065         /* state machine for single-byte mode */
1066 /* singleByteMode: */
1067         while(source<sourceLimit) {
1068             if(targetCapacity<=0) {
1069                 /* target is full */
1070                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1071                 break;
1072             }
1073             c=*source++;
1074             ++nextSourceIndex;
1075 
1076             if((c-0x20)<=0x5f) {
1077                 /* pass US-ASCII graphic character through */
1078                 *target++=(uint8_t)c;
1079                 if(offsets!=NULL) {
1080                     *offsets++=sourceIndex;
1081                 }
1082                 --targetCapacity;
1083             } else if(c<0x20) {
1084                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1085                     /* CR/LF/TAB/NUL */
1086                     *target++=(uint8_t)c;
1087                     if(offsets!=NULL) {
1088                         *offsets++=sourceIndex;
1089                     }
1090                     --targetCapacity;
1091                 } else {
1092                     /* quote C0 control character */
1093                     c|=SQ0<<8;
1094                     length=2;
1095                     goto outputBytes;
1096                 }
1097             } else if((delta=c-currentOffset)<=0x7f) {
1098                 /* use the current dynamic window */
1099                 *target++=(uint8_t)(delta|0x80);
1100                 if(offsets!=NULL) {
1101                     *offsets++=sourceIndex;
1102                 }
1103                 --targetCapacity;
1104             } else if(U16_IS_SURROGATE(c)) {
1105                 if(U16_IS_SURROGATE_LEAD(c)) {
1106 getTrailSingle:
1107                     lead=(UChar)c;
1108                     if(source<sourceLimit) {
1109                         /* test the following code unit */
1110                         trail=*source;
1111                         if(U16_IS_TRAIL(trail)) {
1112                             ++source;
1113                             ++nextSourceIndex;
1114                             c=U16_GET_SUPPLEMENTARY(c, trail);
1115                             /* convert this surrogate code point */
1116                             /* exit this condition tree */
1117                         } else {
1118                             /* this is an unmatched lead code unit (1st surrogate) */
1119                             /* callback(illegal) */
1120                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121                             goto endloop;
1122                         }
1123                     } else {
1124                         /* no more input */
1125                         break;
1126                     }
1127                 } else {
1128                     /* this is an unmatched trail code unit (2nd surrogate) */
1129                     /* callback(illegal) */
1130                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1131                     goto endloop;
1132                 }
1133 
1134                 /* compress supplementary character U+10000..U+10ffff */
1135                 if((delta=c-currentOffset)<=0x7f) {
1136                     /* use the current dynamic window */
1137                     *target++=(uint8_t)(delta|0x80);
1138                     if(offsets!=NULL) {
1139                         *offsets++=sourceIndex;
1140                     }
1141                     --targetCapacity;
1142                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1143                     /* there is a dynamic window that contains this character, change to it */
1144                     dynamicWindow=window;
1145                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1146                     useDynamicWindow(scsu, dynamicWindow);
1147                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1148                     length=2;
1149                     goto outputBytes;
1150                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1151                     /* might check if there are more characters in this window to come */
1152                     /* define an extended window with this character */
1153                     code-=0x200;
1154                     dynamicWindow=getNextDynamicWindow(scsu);
1155                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1156                     useDynamicWindow(scsu, dynamicWindow);
1157                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1158                     length=4;
1159                     goto outputBytes;
1160                 } else {
1161                     /* change to Unicode mode and output this (lead, trail) pair */
1162                     isSingleByteMode=FALSE;
1163                     *target++=(uint8_t)SCU;
1164                     if(offsets!=NULL) {
1165                         *offsets++=sourceIndex;
1166                     }
1167                     --targetCapacity;
1168                     c=((uint32_t)lead<<16)|trail;
1169                     length=4;
1170                     goto outputBytes;
1171                 }
1172             } else if(c<0xa0) {
1173                 /* quote C1 control character */
1174                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1175                 length=2;
1176                 goto outputBytes;
1177             } else if(c==0xfeff || c>=0xfff0) {
1178                 /* quote signature character=byte order mark and specials */
1179                 c|=SQU<<16;
1180                 length=3;
1181                 goto outputBytes;
1182             } else {
1183                 /* compress all other BMP characters */
1184                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1185                     /* there is a window defined that contains this character - switch to it or quote from it? */
1186                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1187                         /* change to dynamic window */
1188                         dynamicWindow=window;
1189                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1190                         useDynamicWindow(scsu, dynamicWindow);
1191                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1192                         length=2;
1193                         goto outputBytes;
1194                     } else {
1195                         /* quote from dynamic window */
1196                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1197                         length=2;
1198                         goto outputBytes;
1199                     }
1200                 } else if((window=getWindow(staticOffsets, c))>=0) {
1201                     /* quote from static window */
1202                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1203                     length=2;
1204                     goto outputBytes;
1205                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1206                     /* define a dynamic window with this character */
1207                     dynamicWindow=getNextDynamicWindow(scsu);
1208                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1209                     useDynamicWindow(scsu, dynamicWindow);
1210                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1211                     length=3;
1212                     goto outputBytes;
1213                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1214                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1215                 ) {
1216                     /*
1217                      * this character is not compressible (a BMP ideograph or similar);
1218                      * switch to Unicode mode if this is the last character in the block
1219                      * or there is at least one more ideograph following immediately
1220                      */
1221                     isSingleByteMode=FALSE;
1222                     c|=SCU<<16;
1223                     length=3;
1224                     goto outputBytes;
1225                 } else {
1226                     /* quote Unicode */
1227                     c|=SQU<<16;
1228                     length=3;
1229                     goto outputBytes;
1230                 }
1231             }
1232 
1233             /* normal end of conversion: prepare for a new character */
1234             c=0;
1235             sourceIndex=nextSourceIndex;
1236         }
1237     } else {
1238         if(c!=0 && targetCapacity>0) {
1239             goto getTrailUnicode;
1240         }
1241 
1242         /* state machine for Unicode mode */
1243 /* unicodeByteMode: */
1244         while(source<sourceLimit) {
1245             if(targetCapacity<=0) {
1246                 /* target is full */
1247                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1248                 break;
1249             }
1250             c=*source++;
1251             ++nextSourceIndex;
1252 
1253             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1254                 /* not compressible, write character directly */
1255                 if(targetCapacity>=2) {
1256                     *target++=(uint8_t)(c>>8);
1257                     *target++=(uint8_t)c;
1258                     if(offsets!=NULL) {
1259                         *offsets++=sourceIndex;
1260                         *offsets++=sourceIndex;
1261                     }
1262                     targetCapacity-=2;
1263                 } else {
1264                     length=2;
1265                     goto outputBytes;
1266                 }
1267             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1268                 /* compress BMP character if the following one is not an uncompressible ideograph */
1269                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1270                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1271                         /* ASCII digit or letter */
1272                         isSingleByteMode=TRUE;
1273                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1274                         length=2;
1275                         goto outputBytes;
1276                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1277                         /* there is a dynamic window that contains this character, change to it */
1278                         isSingleByteMode=TRUE;
1279                         dynamicWindow=window;
1280                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1281                         useDynamicWindow(scsu, dynamicWindow);
1282                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1283                         length=2;
1284                         goto outputBytes;
1285                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1286                         /* define a dynamic window with this character */
1287                         isSingleByteMode=TRUE;
1288                         dynamicWindow=getNextDynamicWindow(scsu);
1289                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1290                         useDynamicWindow(scsu, dynamicWindow);
1291                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1292                         length=3;
1293                         goto outputBytes;
1294                     }
1295                 }
1296 
1297                 /* don't know how to compress this character, just write it directly */
1298                 length=2;
1299                 goto outputBytes;
1300             } else if(c<0xe000) {
1301                 /* c is a surrogate */
1302                 if(U16_IS_SURROGATE_LEAD(c)) {
1303 getTrailUnicode:
1304                     lead=(UChar)c;
1305                     if(source<sourceLimit) {
1306                         /* test the following code unit */
1307                         trail=*source;
1308                         if(U16_IS_TRAIL(trail)) {
1309                             ++source;
1310                             ++nextSourceIndex;
1311                             c=U16_GET_SUPPLEMENTARY(c, trail);
1312                             /* convert this surrogate code point */
1313                             /* exit this condition tree */
1314                         } else {
1315                             /* this is an unmatched lead code unit (1st surrogate) */
1316                             /* callback(illegal) */
1317                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1318                             goto endloop;
1319                         }
1320                     } else {
1321                         /* no more input */
1322                         break;
1323                     }
1324                 } else {
1325                     /* this is an unmatched trail code unit (2nd surrogate) */
1326                     /* callback(illegal) */
1327                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1328                     goto endloop;
1329                 }
1330 
1331                 /* compress supplementary character */
1332                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1333                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1334                 ) {
1335                     /*
1336                      * there is a dynamic window that contains this character and
1337                      * the following character is not uncompressible,
1338                      * change to the window
1339                      */
1340                     isSingleByteMode=TRUE;
1341                     dynamicWindow=window;
1342                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1343                     useDynamicWindow(scsu, dynamicWindow);
1344                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1345                     length=2;
1346                     goto outputBytes;
1347                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1348                           (code=getDynamicOffset(c, &offset))>=0
1349                 ) {
1350                     /* two supplementary characters in (probably) the same window - define an extended one */
1351                     isSingleByteMode=TRUE;
1352                     code-=0x200;
1353                     dynamicWindow=getNextDynamicWindow(scsu);
1354                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1355                     useDynamicWindow(scsu, dynamicWindow);
1356                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1357                     length=4;
1358                     goto outputBytes;
1359                 } else {
1360                     /* don't know how to compress this character, just write it directly */
1361                     c=((uint32_t)lead<<16)|trail;
1362                     length=4;
1363                     goto outputBytes;
1364                 }
1365             } else /* 0xe000<=c<0xf300 */ {
1366                 /* quote to avoid SCSU tags */
1367                 c|=UQU<<16;
1368                 length=3;
1369                 goto outputBytes;
1370             }
1371 
1372             /* normal end of conversion: prepare for a new character */
1373             c=0;
1374             sourceIndex=nextSourceIndex;
1375         }
1376     }
1377 endloop:
1378 
1379     /* set the converter state back into UConverter */
1380     scsu->fromUIsSingleByteMode=isSingleByteMode;
1381     scsu->fromUDynamicWindow=dynamicWindow;
1382 
1383     cnv->fromUChar32=c;
1384 
1385     /* write back the updated pointers */
1386     pArgs->source=source;
1387     pArgs->target=(char *)target;
1388     pArgs->offsets=offsets;
1389     return;
1390 
1391 outputBytes:
1392     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1393     /* from the first if in the loop we know that targetCapacity>0 */
1394     if(length<=targetCapacity) {
1395         if(offsets==NULL) {
1396             switch(length) {
1397                 /* each branch falls through to the next one */
1398             case 4:
1399                 *target++=(uint8_t)(c>>24);
1400                 U_FALLTHROUGH;
1401             case 3:
1402                 *target++=(uint8_t)(c>>16);
1403                 U_FALLTHROUGH;
1404             case 2:
1405                 *target++=(uint8_t)(c>>8);
1406                 U_FALLTHROUGH;
1407             case 1:
1408                 *target++=(uint8_t)c;
1409                 U_FALLTHROUGH;
1410             default:
1411                 /* will never occur */
1412                 break;
1413             }
1414         } else {
1415             switch(length) {
1416                 /* each branch falls through to the next one */
1417             case 4:
1418                 *target++=(uint8_t)(c>>24);
1419                 *offsets++=sourceIndex;
1420                 U_FALLTHROUGH;
1421             case 3:
1422                 *target++=(uint8_t)(c>>16);
1423                 *offsets++=sourceIndex;
1424                 U_FALLTHROUGH;
1425             case 2:
1426                 *target++=(uint8_t)(c>>8);
1427                 *offsets++=sourceIndex;
1428                 U_FALLTHROUGH;
1429             case 1:
1430                 *target++=(uint8_t)c;
1431                 *offsets++=sourceIndex;
1432                 U_FALLTHROUGH;
1433             default:
1434                 /* will never occur */
1435                 break;
1436             }
1437         }
1438         targetCapacity-=length;
1439 
1440         /* normal end of conversion: prepare for a new character */
1441         c=0;
1442         sourceIndex=nextSourceIndex;
1443         goto loop;
1444     } else {
1445         uint8_t *p;
1446 
1447         /*
1448          * We actually do this backwards here:
1449          * In order to save an intermediate variable, we output
1450          * first to the overflow buffer what does not fit into the
1451          * regular target.
1452          */
1453         /* we know that 0<=targetCapacity<length<=4 */
1454         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1455         length-=targetCapacity;
1456         p=(uint8_t *)cnv->charErrorBuffer;
1457         switch(length) {
1458             /* each branch falls through to the next one */
1459         case 4:
1460             *p++=(uint8_t)(c>>24);
1461             U_FALLTHROUGH;
1462         case 3:
1463             *p++=(uint8_t)(c>>16);
1464             U_FALLTHROUGH;
1465         case 2:
1466             *p++=(uint8_t)(c>>8);
1467             U_FALLTHROUGH;
1468         case 1:
1469             *p=(uint8_t)c;
1470             U_FALLTHROUGH;
1471         default:
1472             /* will never occur */
1473             break;
1474         }
1475         cnv->charErrorBufferLength=(int8_t)length;
1476 
1477         /* now output what fits into the regular target */
1478         c>>=8*length; /* length was reduced by targetCapacity */
1479         switch(targetCapacity) {
1480             /* each branch falls through to the next one */
1481         case 3:
1482             *target++=(uint8_t)(c>>16);
1483             if(offsets!=NULL) {
1484                 *offsets++=sourceIndex;
1485             }
1486             U_FALLTHROUGH;
1487         case 2:
1488             *target++=(uint8_t)(c>>8);
1489             if(offsets!=NULL) {
1490                 *offsets++=sourceIndex;
1491             }
1492             U_FALLTHROUGH;
1493         case 1:
1494             *target++=(uint8_t)c;
1495             if(offsets!=NULL) {
1496                 *offsets++=sourceIndex;
1497             }
1498             U_FALLTHROUGH;
1499         default:
1500             break;
1501         }
1502 
1503         /* target overflow */
1504         targetCapacity=0;
1505         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1506         c=0;
1507         goto endloop;
1508     }
1509 }
1510 
1511 /*
1512  * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1513  * If a change is made in the original function, then either
1514  * change this function the same way or
1515  * re-copy the original function and remove the variables
1516  * offsets, sourceIndex, and nextSourceIndex.
1517  */
1518 static void U_CALLCONV
_SCSUFromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1519 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1520                  UErrorCode *pErrorCode) {
1521     UConverter *cnv;
1522     SCSUData *scsu;
1523     const UChar *source, *sourceLimit;
1524     uint8_t *target;
1525     int32_t targetCapacity;
1526 
1527     UBool isSingleByteMode;
1528     uint8_t dynamicWindow;
1529     uint32_t currentOffset;
1530 
1531     uint32_t c, delta;
1532 
1533     int32_t length;
1534 
1535     /* variables for compression heuristics */
1536     uint32_t offset;
1537     UChar lead, trail;
1538     int code;
1539     int8_t window;
1540 
1541     /* set up the local pointers */
1542     cnv=pArgs->converter;
1543     scsu=(SCSUData *)cnv->extraInfo;
1544 
1545     /* set up the local pointers */
1546     source=pArgs->source;
1547     sourceLimit=pArgs->sourceLimit;
1548     target=(uint8_t *)pArgs->target;
1549     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1550 
1551     /* get the state machine state */
1552     isSingleByteMode=scsu->fromUIsSingleByteMode;
1553     dynamicWindow=scsu->fromUDynamicWindow;
1554     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1555 
1556     c=cnv->fromUChar32;
1557 
1558     /* similar conversion "loop" as in toUnicode */
1559 loop:
1560     if(isSingleByteMode) {
1561         if(c!=0 && targetCapacity>0) {
1562             goto getTrailSingle;
1563         }
1564 
1565         /* state machine for single-byte mode */
1566 /* singleByteMode: */
1567         while(source<sourceLimit) {
1568             if(targetCapacity<=0) {
1569                 /* target is full */
1570                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1571                 break;
1572             }
1573             c=*source++;
1574 
1575             if((c-0x20)<=0x5f) {
1576                 /* pass US-ASCII graphic character through */
1577                 *target++=(uint8_t)c;
1578                 --targetCapacity;
1579             } else if(c<0x20) {
1580                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1581                     /* CR/LF/TAB/NUL */
1582                     *target++=(uint8_t)c;
1583                     --targetCapacity;
1584                 } else {
1585                     /* quote C0 control character */
1586                     c|=SQ0<<8;
1587                     length=2;
1588                     goto outputBytes;
1589                 }
1590             } else if((delta=c-currentOffset)<=0x7f) {
1591                 /* use the current dynamic window */
1592                 *target++=(uint8_t)(delta|0x80);
1593                 --targetCapacity;
1594             } else if(U16_IS_SURROGATE(c)) {
1595                 if(U16_IS_SURROGATE_LEAD(c)) {
1596 getTrailSingle:
1597                     lead=(UChar)c;
1598                     if(source<sourceLimit) {
1599                         /* test the following code unit */
1600                         trail=*source;
1601                         if(U16_IS_TRAIL(trail)) {
1602                             ++source;
1603                             c=U16_GET_SUPPLEMENTARY(c, trail);
1604                             /* convert this surrogate code point */
1605                             /* exit this condition tree */
1606                         } else {
1607                             /* this is an unmatched lead code unit (1st surrogate) */
1608                             /* callback(illegal) */
1609                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1610                             goto endloop;
1611                         }
1612                     } else {
1613                         /* no more input */
1614                         break;
1615                     }
1616                 } else {
1617                     /* this is an unmatched trail code unit (2nd surrogate) */
1618                     /* callback(illegal) */
1619                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1620                     goto endloop;
1621                 }
1622 
1623                 /* compress supplementary character U+10000..U+10ffff */
1624                 if((delta=c-currentOffset)<=0x7f) {
1625                     /* use the current dynamic window */
1626                     *target++=(uint8_t)(delta|0x80);
1627                     --targetCapacity;
1628                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1629                     /* there is a dynamic window that contains this character, change to it */
1630                     dynamicWindow=window;
1631                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1632                     useDynamicWindow(scsu, dynamicWindow);
1633                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1634                     length=2;
1635                     goto outputBytes;
1636                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1637                     /* might check if there are more characters in this window to come */
1638                     /* define an extended window with this character */
1639                     code-=0x200;
1640                     dynamicWindow=getNextDynamicWindow(scsu);
1641                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1642                     useDynamicWindow(scsu, dynamicWindow);
1643                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1644                     length=4;
1645                     goto outputBytes;
1646                 } else {
1647                     /* change to Unicode mode and output this (lead, trail) pair */
1648                     isSingleByteMode=FALSE;
1649                     *target++=(uint8_t)SCU;
1650                     --targetCapacity;
1651                     c=((uint32_t)lead<<16)|trail;
1652                     length=4;
1653                     goto outputBytes;
1654                 }
1655             } else if(c<0xa0) {
1656                 /* quote C1 control character */
1657                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1658                 length=2;
1659                 goto outputBytes;
1660             } else if(c==0xfeff || c>=0xfff0) {
1661                 /* quote signature character=byte order mark and specials */
1662                 c|=SQU<<16;
1663                 length=3;
1664                 goto outputBytes;
1665             } else {
1666                 /* compress all other BMP characters */
1667                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1668                     /* there is a window defined that contains this character - switch to it or quote from it? */
1669                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1670                         /* change to dynamic window */
1671                         dynamicWindow=window;
1672                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1673                         useDynamicWindow(scsu, dynamicWindow);
1674                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1675                         length=2;
1676                         goto outputBytes;
1677                     } else {
1678                         /* quote from dynamic window */
1679                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1680                         length=2;
1681                         goto outputBytes;
1682                     }
1683                 } else if((window=getWindow(staticOffsets, c))>=0) {
1684                     /* quote from static window */
1685                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1686                     length=2;
1687                     goto outputBytes;
1688                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1689                     /* define a dynamic window with this character */
1690                     dynamicWindow=getNextDynamicWindow(scsu);
1691                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1692                     useDynamicWindow(scsu, dynamicWindow);
1693                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1694                     length=3;
1695                     goto outputBytes;
1696                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1697                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1698                 ) {
1699                     /*
1700                      * this character is not compressible (a BMP ideograph or similar);
1701                      * switch to Unicode mode if this is the last character in the block
1702                      * or there is at least one more ideograph following immediately
1703                      */
1704                     isSingleByteMode=FALSE;
1705                     c|=SCU<<16;
1706                     length=3;
1707                     goto outputBytes;
1708                 } else {
1709                     /* quote Unicode */
1710                     c|=SQU<<16;
1711                     length=3;
1712                     goto outputBytes;
1713                 }
1714             }
1715 
1716             /* normal end of conversion: prepare for a new character */
1717             c=0;
1718         }
1719     } else {
1720         if(c!=0 && targetCapacity>0) {
1721             goto getTrailUnicode;
1722         }
1723 
1724         /* state machine for Unicode mode */
1725 /* unicodeByteMode: */
1726         while(source<sourceLimit) {
1727             if(targetCapacity<=0) {
1728                 /* target is full */
1729                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1730                 break;
1731             }
1732             c=*source++;
1733 
1734             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1735                 /* not compressible, write character directly */
1736                 if(targetCapacity>=2) {
1737                     *target++=(uint8_t)(c>>8);
1738                     *target++=(uint8_t)c;
1739                     targetCapacity-=2;
1740                 } else {
1741                     length=2;
1742                     goto outputBytes;
1743                 }
1744             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1745                 /* compress BMP character if the following one is not an uncompressible ideograph */
1746                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1747                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1748                         /* ASCII digit or letter */
1749                         isSingleByteMode=TRUE;
1750                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1751                         length=2;
1752                         goto outputBytes;
1753                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1754                         /* there is a dynamic window that contains this character, change to it */
1755                         isSingleByteMode=TRUE;
1756                         dynamicWindow=window;
1757                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1758                         useDynamicWindow(scsu, dynamicWindow);
1759                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1760                         length=2;
1761                         goto outputBytes;
1762                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1763                         /* define a dynamic window with this character */
1764                         isSingleByteMode=TRUE;
1765                         dynamicWindow=getNextDynamicWindow(scsu);
1766                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1767                         useDynamicWindow(scsu, dynamicWindow);
1768                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1769                         length=3;
1770                         goto outputBytes;
1771                     }
1772                 }
1773 
1774                 /* don't know how to compress this character, just write it directly */
1775                 length=2;
1776                 goto outputBytes;
1777             } else if(c<0xe000) {
1778                 /* c is a surrogate */
1779                 if(U16_IS_SURROGATE_LEAD(c)) {
1780 getTrailUnicode:
1781                     lead=(UChar)c;
1782                     if(source<sourceLimit) {
1783                         /* test the following code unit */
1784                         trail=*source;
1785                         if(U16_IS_TRAIL(trail)) {
1786                             ++source;
1787                             c=U16_GET_SUPPLEMENTARY(c, trail);
1788                             /* convert this surrogate code point */
1789                             /* exit this condition tree */
1790                         } else {
1791                             /* this is an unmatched lead code unit (1st surrogate) */
1792                             /* callback(illegal) */
1793                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1794                             goto endloop;
1795                         }
1796                     } else {
1797                         /* no more input */
1798                         break;
1799                     }
1800                 } else {
1801                     /* this is an unmatched trail code unit (2nd surrogate) */
1802                     /* callback(illegal) */
1803                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1804                     goto endloop;
1805                 }
1806 
1807                 /* compress supplementary character */
1808                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1809                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1810                 ) {
1811                     /*
1812                      * there is a dynamic window that contains this character and
1813                      * the following character is not uncompressible,
1814                      * change to the window
1815                      */
1816                     isSingleByteMode=TRUE;
1817                     dynamicWindow=window;
1818                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1819                     useDynamicWindow(scsu, dynamicWindow);
1820                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1821                     length=2;
1822                     goto outputBytes;
1823                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1824                           (code=getDynamicOffset(c, &offset))>=0
1825                 ) {
1826                     /* two supplementary characters in (probably) the same window - define an extended one */
1827                     isSingleByteMode=TRUE;
1828                     code-=0x200;
1829                     dynamicWindow=getNextDynamicWindow(scsu);
1830                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1831                     useDynamicWindow(scsu, dynamicWindow);
1832                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1833                     length=4;
1834                     goto outputBytes;
1835                 } else {
1836                     /* don't know how to compress this character, just write it directly */
1837                     c=((uint32_t)lead<<16)|trail;
1838                     length=4;
1839                     goto outputBytes;
1840                 }
1841             } else /* 0xe000<=c<0xf300 */ {
1842                 /* quote to avoid SCSU tags */
1843                 c|=UQU<<16;
1844                 length=3;
1845                 goto outputBytes;
1846             }
1847 
1848             /* normal end of conversion: prepare for a new character */
1849             c=0;
1850         }
1851     }
1852 endloop:
1853 
1854     /* set the converter state back into UConverter */
1855     scsu->fromUIsSingleByteMode=isSingleByteMode;
1856     scsu->fromUDynamicWindow=dynamicWindow;
1857 
1858     cnv->fromUChar32=c;
1859 
1860     /* write back the updated pointers */
1861     pArgs->source=source;
1862     pArgs->target=(char *)target;
1863     return;
1864 
1865 outputBytes:
1866     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1867     /* from the first if in the loop we know that targetCapacity>0 */
1868     if(length<=targetCapacity) {
1869         switch(length) {
1870             /* each branch falls through to the next one */
1871         case 4:
1872             *target++=(uint8_t)(c>>24);
1873             U_FALLTHROUGH;
1874         case 3:
1875             *target++=(uint8_t)(c>>16);
1876             U_FALLTHROUGH;
1877         case 2:
1878             *target++=(uint8_t)(c>>8);
1879             U_FALLTHROUGH;
1880         case 1:
1881             *target++=(uint8_t)c;
1882             U_FALLTHROUGH;
1883         default:
1884             /* will never occur */
1885             break;
1886         }
1887         targetCapacity-=length;
1888 
1889         /* normal end of conversion: prepare for a new character */
1890         c=0;
1891         goto loop;
1892     } else {
1893         uint8_t *p;
1894 
1895         /*
1896          * We actually do this backwards here:
1897          * In order to save an intermediate variable, we output
1898          * first to the overflow buffer what does not fit into the
1899          * regular target.
1900          */
1901         /* we know that 0<=targetCapacity<length<=4 */
1902         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1903         length-=targetCapacity;
1904         p=(uint8_t *)cnv->charErrorBuffer;
1905         switch(length) {
1906             /* each branch falls through to the next one */
1907         case 4:
1908             *p++=(uint8_t)(c>>24);
1909             U_FALLTHROUGH;
1910         case 3:
1911             *p++=(uint8_t)(c>>16);
1912             U_FALLTHROUGH;
1913         case 2:
1914             *p++=(uint8_t)(c>>8);
1915             U_FALLTHROUGH;
1916         case 1:
1917             *p=(uint8_t)c;
1918             U_FALLTHROUGH;
1919         default:
1920             /* will never occur */
1921             break;
1922         }
1923         cnv->charErrorBufferLength=(int8_t)length;
1924 
1925         /* now output what fits into the regular target */
1926         c>>=8*length; /* length was reduced by targetCapacity */
1927         switch(targetCapacity) {
1928             /* each branch falls through to the next one */
1929         case 3:
1930             *target++=(uint8_t)(c>>16);
1931             U_FALLTHROUGH;
1932         case 2:
1933             *target++=(uint8_t)(c>>8);
1934             U_FALLTHROUGH;
1935         case 1:
1936             *target++=(uint8_t)c;
1937             U_FALLTHROUGH;
1938         default:
1939             break;
1940         }
1941 
1942         /* target overflow */
1943         targetCapacity=0;
1944         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1945         c=0;
1946         goto endloop;
1947     }
1948 }
1949 
1950 /* miscellaneous ------------------------------------------------------------ */
1951 
1952 static const char *  U_CALLCONV
_SCSUGetName(const UConverter * cnv)1953 _SCSUGetName(const UConverter *cnv) {
1954     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1955 
1956     switch(scsu->locale) {
1957     case l_ja:
1958         return "SCSU,locale=ja";
1959     default:
1960         return "SCSU";
1961     }
1962 }
1963 
1964 /* structure for SafeClone calculations */
1965 struct cloneSCSUStruct
1966 {
1967     UConverter cnv;
1968     SCSUData mydata;
1969 };
1970 
1971 static UConverter *  U_CALLCONV
_SCSUSafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)1972 _SCSUSafeClone(const UConverter *cnv,
1973                void *stackBuffer,
1974                int32_t *pBufferSize,
1975                UErrorCode *status)
1976 {
1977     struct cloneSCSUStruct * localClone;
1978     int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
1979 
1980     if (U_FAILURE(*status)){
1981         return 0;
1982     }
1983 
1984     if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1985         *pBufferSize = bufferSizeNeeded;
1986         return 0;
1987     }
1988 
1989     localClone = (struct cloneSCSUStruct *)stackBuffer;
1990     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1991 
1992     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1993     localClone->cnv.extraInfo = &localClone->mydata;
1994     localClone->cnv.isExtraLocal = TRUE;
1995 
1996     return &localClone->cnv;
1997 }
1998 U_CDECL_END
1999 
2000 static const UConverterImpl _SCSUImpl={
2001     UCNV_SCSU,
2002 
2003     NULL,
2004     NULL,
2005 
2006     _SCSUOpen,
2007     _SCSUClose,
2008     _SCSUReset,
2009 
2010     _SCSUToUnicode,
2011     _SCSUToUnicodeWithOffsets,
2012     _SCSUFromUnicode,
2013     _SCSUFromUnicodeWithOffsets,
2014     NULL,
2015 
2016     NULL,
2017     _SCSUGetName,
2018     NULL,
2019     _SCSUSafeClone,
2020     ucnv_getCompleteUnicodeSet,
2021     NULL,
2022     NULL
2023 };
2024 
2025 static const UConverterStaticData _SCSUStaticData={
2026     sizeof(UConverterStaticData),
2027     "SCSU",
2028     1212, /* CCSID for SCSU */
2029     UCNV_IBM, UCNV_SCSU,
2030     1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2031     /*
2032      * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2033      * substitution string.
2034      */
2035     { 0x0e, 0xff, 0xfd, 0 }, 3,
2036     FALSE, FALSE,
2037     0,
2038     0,
2039     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2040 };
2041 
2042 const UConverterSharedData _SCSUData=
2043         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
2044 
2045 #endif
2046