• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2000-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  ucnvscsu.c
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2000nov18
16 *   created by: Markus W. Scherer
17 *
18 *   This is an implementation of the Standard Compression Scheme for Unicode
19 *   as defined in https://www.unicode.org/reports/tr6/ .
20 *   Reserved commands and window settings are treated as illegal sequences and
21 *   will result in callback calls.
22 */
23 
24 #include "unicode/utypes.h"
25 
26 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
27 
28 #include "unicode/ucnv.h"
29 #include "unicode/ucnv_cb.h"
30 #include "unicode/utf16.h"
31 #include "ucnv_bld.h"
32 #include "ucnv_cnv.h"
33 #include "cmemory.h"
34 
35 /* SCSU definitions --------------------------------------------------------- */
36 
/*
 * SCSU command (tag) byte values.
 * S* tags are recognized in single-byte mode, U* tags in Unicode mode.
 * Only the first and last member of each 8-value range is named;
 * the last one is written relative to the first.
 */
enum {
    /* single-byte mode */
    SQ0 = 0x01,     /* Quote from window pair 0 */
    SQ7 = SQ0 + 7,  /* Quote from window pair 7 */
    SDX = 0x0B,     /* Define a window as extended */
    Srs = 0x0C,     /* reserved */
    SQU = 0x0E,     /* Quote a single Unicode character */
    SCU = 0x0F,     /* Change to Unicode mode */
    SC0 = 0x10,     /* Select window 0 */
    SC7 = SC0 + 7,  /* Select window 7 */
    SD0 = 0x18,     /* Define and select window 0 */
    SD7 = SD0 + 7,  /* Define and select window 7 */

    /* Unicode mode */
    UC0 = 0xE0,     /* Select window 0 */
    UC7 = UC0 + 7,  /* Select window 7 */
    UD0 = 0xE8,     /* Define and select window 0 */
    UD7 = UD0 + 7,  /* Define and select window 7 */
    UQU = 0xF0,     /* Quote a single Unicode character */
    UDX = 0xF1,     /* Define a window as extended */
    Urs = 0xF2      /* reserved */
};
58 
/*
 * Thresholds for interpreting the one-byte window offset value that
 * follows a "define window" command.
 */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window because no short-run alphabets are found there.
     * Offset bytes starting at gapThreshold therefore have gapOffset added
     * to the computed window offset.
     */
    gapThreshold = 0x68,
    gapOffset = 0xAC00,

    /* offset bytes from reservedStart up to (excluding) fixedThreshold are reserved */
    reservedStart = 0xA8,

    /* offset bytes from fixedThreshold on index the table of predefined fixed offsets */
    fixedThreshold = 0xF9
};
74 
/*
 * Start offsets of the 8 static windows.
 * These never change; a quoted byte below 0x80 is added to one of them.
 */
static const uint32_t staticOffsets[8] = {
    /* 0 */ 0x0000, /* ASCII for quoted tags */
    /* 1 */ 0x0080, /* Latin-1 Supplement (for access to punctuation) */
    /* 2 */ 0x0100, /* Latin Extended-A */
    /* 3 */ 0x0300, /* Combining Diacritical Marks */
    /* 4 */ 0x2000, /* General Punctuation */
    /* 5 */ 0x2080, /* Currency Symbols */
    /* 6 */ 0x2100, /* Letterlike Symbols and Number Forms */
    /* 7 */ 0x3000  /* CJK Symbols and Punctuation */
};
86 
/*
 * Start offsets that the 8 dynamic (sliding) windows have after a reset.
 * Copied into the per-converter toUnicode and fromUnicode offset arrays.
 */
static const uint32_t initialDynamicOffsets[8] = {
    /* 0 */ 0x0080, /* Latin-1 */
    /* 1 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    /* 2 */ 0x0400, /* Cyrillic */
    /* 3 */ 0x0600, /* Arabic */
    /* 4 */ 0x0900, /* Devanagari */
    /* 5 */ 0x3040, /* Hiragana */
    /* 6 */ 0x30A0, /* Katakana */
    /* 7 */ 0xFF00  /* Fullwidth ASCII */
};
98 
/*
 * Predefined window offsets, indexed by (offset byte - fixedThreshold)
 * for offset bytes 0xF9..0xFF.
 */
static const uint32_t fixedOffsets[] = {
    0x00C0, /* 0xF9: Latin-1 Letters + half of Latin Extended A */
    0x0250, /* 0xFA: IPA Extensions */
    0x0370, /* 0xFB: Greek */
    0x0530, /* 0xFC: Armenian */
    0x3040, /* 0xFD: Hiragana */
    0x30A0, /* 0xFE: Katakana */
    0xFF60  /* 0xFF: Halfwidth Katakana */
};
109 
/*
 * States of the SCSU-to-Unicode state machine.
 * readCommand means "not inside a multi-byte sequence"; every other state
 * names the byte that is expected next.
 */
enum {
    readCommand   = 0, /* next byte is a command or a literal */
    quotePairOne  = 1, /* next byte is the first byte of a quoted UTF-16 unit */
    quotePairTwo  = 2, /* next byte is the second byte of a UTF-16 unit */
    quoteOne      = 3, /* next byte is quoted from a window */
    definePairOne = 4, /* next byte is the first byte of an extended window definition */
    definePairTwo = 5, /* next byte is the second byte of an extended window definition */
    defineOne     = 6  /* next byte is the offset byte of a window definition */
};
120 
121 typedef struct SCSUData {
122     /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
123     uint32_t toUDynamicOffsets[8];
124     uint32_t fromUDynamicOffsets[8];
125 
126     /* state machine state - toUnicode */
127     UBool toUIsSingleByteMode;
128     uint8_t toUState;
129     int8_t toUQuoteWindow, toUDynamicWindow;
130     uint8_t toUByteOne;
131     uint8_t toUPadding[3];
132 
133     /* state machine state - fromUnicode */
134     UBool fromUIsSingleByteMode;
135     int8_t fromUDynamicWindow;
136 
137     /*
138      * windowUse[] keeps track of the use of the dynamic windows:
139      * At nextWindowUseIndex there is the least recently used window,
140      * and the following windows (in a wrapping manner) are more and more
141      * recently used.
142      * At nextWindowUseIndex-1 there is the most recently used window.
143      */
144     uint8_t locale;
145     int8_t nextWindowUseIndex;
146     int8_t windowUse[8];
147 } SCSUData;
148 
/* initial windowUse[] order for the generic locale */
static const int8_t initialWindowUse[8] = {
    7, 0, 3, 2, 4, 5, 6, 1
};

/* initial windowUse[] order for the "ja" locale */
static const int8_t initialWindowUse_ja[8] = {
    3, 2, 4, 1, 0, 7, 5, 6
};
151 
/* values stored in SCSUData.locale */
enum {
    lGeneric = 0,
    l_ja = 1
};
155 
156 /* SCSU setup functions ----------------------------------------------------- */
157 U_CDECL_BEGIN
158 static void U_CALLCONV
_SCSUReset(UConverter * cnv,UConverterResetChoice choice)159 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
160     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
161 
162     if(choice<=UCNV_RESET_TO_UNICODE) {
163         /* reset toUnicode */
164         uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
165 
166         scsu->toUIsSingleByteMode=true;
167         scsu->toUState=readCommand;
168         scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
169         scsu->toUByteOne=0;
170 
171         cnv->toULength=0;
172     }
173     if(choice!=UCNV_RESET_TO_UNICODE) {
174         /* reset fromUnicode */
175         uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
176 
177         scsu->fromUIsSingleByteMode=true;
178         scsu->fromUDynamicWindow=0;
179 
180         scsu->nextWindowUseIndex=0;
181         switch(scsu->locale) {
182         case l_ja:
183             uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
184             break;
185         default:
186             uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
187             break;
188         }
189 
190         cnv->fromUChar32=0;
191     }
192 }
193 
194 static void U_CALLCONV
_SCSUOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)195 _SCSUOpen(UConverter *cnv,
196           UConverterLoadArgs *pArgs,
197           UErrorCode *pErrorCode) {
198     const char *locale=pArgs->locale;
199     if(pArgs->onlyTestIsLoadable) {
200         return;
201     }
202     cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
203     if(cnv->extraInfo!=nullptr) {
204         if(locale!=nullptr && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
205             ((SCSUData *)cnv->extraInfo)->locale=l_ja;
206         } else {
207             ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
208         }
209         _SCSUReset(cnv, UCNV_RESET_BOTH);
210     } else {
211         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
212     }
213 
214     /* Set the substitution character U+fffd as a Unicode string. */
215     cnv->subUChars[0]=0xfffd;
216     cnv->subCharLen=-1;
217 }
218 
219 static void U_CALLCONV
_SCSUClose(UConverter * cnv)220 _SCSUClose(UConverter *cnv) {
221     if(cnv->extraInfo!=nullptr) {
222         if(!cnv->isExtraLocal) {
223             uprv_free(cnv->extraInfo);
224         }
225         cnv->extraInfo=nullptr;
226     }
227 }
228 
229 /* SCSU-to-Unicode conversion functions ------------------------------------- */
230 
231 static void U_CALLCONV
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)232 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
233                           UErrorCode *pErrorCode) {
234     UConverter *cnv;
235     SCSUData *scsu;
236     const uint8_t *source, *sourceLimit;
237     char16_t *target;
238     const char16_t *targetLimit;
239     int32_t *offsets;
240     UBool isSingleByteMode;
241     uint8_t state, byteOne;
242     int8_t quoteWindow, dynamicWindow;
243 
244     int32_t sourceIndex, nextSourceIndex;
245 
246     uint8_t b;
247 
248     /* set up the local pointers */
249     cnv=pArgs->converter;
250     scsu=(SCSUData *)cnv->extraInfo;
251 
252     source=(const uint8_t *)pArgs->source;
253     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
254     target=pArgs->target;
255     targetLimit=pArgs->targetLimit;
256     offsets=pArgs->offsets;
257 
258     /* get the state machine state */
259     isSingleByteMode=scsu->toUIsSingleByteMode;
260     state=scsu->toUState;
261     quoteWindow=scsu->toUQuoteWindow;
262     dynamicWindow=scsu->toUDynamicWindow;
263     byteOne=scsu->toUByteOne;
264 
265     /* sourceIndex=-1 if the current character began in the previous buffer */
266     sourceIndex=state==readCommand ? 0 : -1;
267     nextSourceIndex=0;
268 
269     /*
270      * conversion "loop"
271      *
272      * For performance, this is not a normal C loop.
273      * Instead, there are two code blocks for the two SCSU modes.
274      * The function branches to either one, and a change of the mode is done with a goto to
275      * the other branch.
276      *
277      * Each branch has two conventional loops:
278      * - a fast-path loop for the most common codes in the mode
279      * - a loop for all other codes in the mode
280      * When the fast-path runs into a code that it cannot handle, its loop ends and it
281      * runs into the following loop to handle the other codes.
282      * The end of the input or output buffer is also handled by the slower loop.
283      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
284      *
285      * The callback handling is done by returning with an error code.
286      * The conversion framework actually calls the callback function.
287      */
288     if(isSingleByteMode) {
289         /* fast path for single-byte mode */
290         if(state==readCommand) {
291 fastSingle:
292             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
293                 ++source;
294                 ++nextSourceIndex;
295                 if(b<=0x7f) {
296                     /* write US-ASCII graphic character or DEL */
297                     *target++=(char16_t)b;
298                     if(offsets!=nullptr) {
299                         *offsets++=sourceIndex;
300                     }
301                 } else {
302                     /* write from dynamic window */
303                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
304                     if(c<=0xffff) {
305                         *target++=(char16_t)c;
306                         if(offsets!=nullptr) {
307                             *offsets++=sourceIndex;
308                         }
309                     } else {
310                         /* output surrogate pair */
311                         *target++=(char16_t)(0xd7c0+(c>>10));
312                         if(target<targetLimit) {
313                             *target++=(char16_t)(0xdc00|(c&0x3ff));
314                             if(offsets!=nullptr) {
315                                 *offsets++=sourceIndex;
316                                 *offsets++=sourceIndex;
317                             }
318                         } else {
319                             /* target overflow */
320                             if(offsets!=nullptr) {
321                                 *offsets++=sourceIndex;
322                             }
323                             cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
324                             cnv->UCharErrorBufferLength=1;
325                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
326                             goto endloop;
327                         }
328                     }
329                 }
330                 sourceIndex=nextSourceIndex;
331             }
332         }
333 
334         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
335 singleByteMode:
336         while(source<sourceLimit) {
337             if(target>=targetLimit) {
338                 /* target is full */
339                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
340                 break;
341             }
342             b=*source++;
343             ++nextSourceIndex;
344             switch(state) {
345             case readCommand:
346                 /* redundant conditions are commented out */
347                 /* here: b<0x20 because otherwise we would be in fastSingle */
348                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
349                     /* CR/LF/TAB/NUL */
350                     *target++=(char16_t)b;
351                     if(offsets!=nullptr) {
352                         *offsets++=sourceIndex;
353                     }
354                     sourceIndex=nextSourceIndex;
355                     goto fastSingle;
356                 } else if(SC0<=b) {
357                     if(b<=SC7) {
358                         dynamicWindow=(int8_t)(b-SC0);
359                         sourceIndex=nextSourceIndex;
360                         goto fastSingle;
361                     } else /* if(SD0<=b && b<=SD7) */ {
362                         dynamicWindow=(int8_t)(b-SD0);
363                         state=defineOne;
364                     }
365                 } else if(/* SQ0<=b && */ b<=SQ7) {
366                     quoteWindow=(int8_t)(b-SQ0);
367                     state=quoteOne;
368                 } else if(b==SDX) {
369                     state=definePairOne;
370                 } else if(b==SQU) {
371                     state=quotePairOne;
372                 } else if(b==SCU) {
373                     sourceIndex=nextSourceIndex;
374                     isSingleByteMode=false;
375                     goto fastUnicode;
376                 } else /* Srs */ {
377                     /* callback(illegal) */
378                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
379                     cnv->toUBytes[0]=b;
380                     cnv->toULength=1;
381                     goto endloop;
382                 }
383 
384                 /* store the first byte of a multibyte sequence in toUBytes[] */
385                 cnv->toUBytes[0]=b;
386                 cnv->toULength=1;
387                 break;
388             case quotePairOne:
389                 byteOne=b;
390                 cnv->toUBytes[1]=b;
391                 cnv->toULength=2;
392                 state=quotePairTwo;
393                 break;
394             case quotePairTwo:
395                 *target++=(char16_t)((byteOne<<8)|b);
396                 if(offsets!=nullptr) {
397                     *offsets++=sourceIndex;
398                 }
399                 sourceIndex=nextSourceIndex;
400                 state=readCommand;
401                 goto fastSingle;
402             case quoteOne:
403                 if(b<0x80) {
404                     /* all static offsets are in the BMP */
405                     *target++=(char16_t)(staticOffsets[quoteWindow]+b);
406                     if(offsets!=nullptr) {
407                         *offsets++=sourceIndex;
408                     }
409                 } else {
410                     /* write from dynamic window */
411                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
412                     if(c<=0xffff) {
413                         *target++=(char16_t)c;
414                         if(offsets!=nullptr) {
415                             *offsets++=sourceIndex;
416                         }
417                     } else {
418                         /* output surrogate pair */
419                         *target++=(char16_t)(0xd7c0+(c>>10));
420                         if(target<targetLimit) {
421                             *target++=(char16_t)(0xdc00|(c&0x3ff));
422                             if(offsets!=nullptr) {
423                                 *offsets++=sourceIndex;
424                                 *offsets++=sourceIndex;
425                             }
426                         } else {
427                             /* target overflow */
428                             if(offsets!=nullptr) {
429                                 *offsets++=sourceIndex;
430                             }
431                             cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
432                             cnv->UCharErrorBufferLength=1;
433                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
434                             goto endloop;
435                         }
436                     }
437                 }
438                 sourceIndex=nextSourceIndex;
439                 state=readCommand;
440                 goto fastSingle;
441             case definePairOne:
442                 dynamicWindow=(int8_t)((b>>5)&7);
443                 byteOne=(uint8_t)(b&0x1f);
444                 cnv->toUBytes[1]=b;
445                 cnv->toULength=2;
446                 state=definePairTwo;
447                 break;
448             case definePairTwo:
449                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
450                 sourceIndex=nextSourceIndex;
451                 state=readCommand;
452                 goto fastSingle;
453             case defineOne:
454                 if(b==0) {
455                     /* callback(illegal): Reserved window offset value 0 */
456                     cnv->toUBytes[1]=b;
457                     cnv->toULength=2;
458                     goto endloop;
459                 } else if(b<gapThreshold) {
460                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
461                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
462                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
463                 } else if(b>=fixedThreshold) {
464                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
465                 } else {
466                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
467                     cnv->toUBytes[1]=b;
468                     cnv->toULength=2;
469                     goto endloop;
470                 }
471                 sourceIndex=nextSourceIndex;
472                 state=readCommand;
473                 goto fastSingle;
474             }
475         }
476     } else {
477         /* fast path for Unicode mode */
478         if(state==readCommand) {
479 fastUnicode:
480             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
481                 *target++=(char16_t)((b<<8)|source[1]);
482                 if(offsets!=nullptr) {
483                     *offsets++=sourceIndex;
484                 }
485                 sourceIndex=nextSourceIndex;
486                 nextSourceIndex+=2;
487                 source+=2;
488             }
489         }
490 
491         /* normal state machine for Unicode mode */
492 /* unicodeByteMode: */
493         while(source<sourceLimit) {
494             if(target>=targetLimit) {
495                 /* target is full */
496                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
497                 break;
498             }
499             b=*source++;
500             ++nextSourceIndex;
501             switch(state) {
502             case readCommand:
503                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
504                     byteOne=b;
505                     cnv->toUBytes[0]=b;
506                     cnv->toULength=1;
507                     state=quotePairTwo;
508                 } else if(/* UC0<=b && */ b<=UC7) {
509                     dynamicWindow=(int8_t)(b-UC0);
510                     sourceIndex=nextSourceIndex;
511                     isSingleByteMode=true;
512                     goto fastSingle;
513                 } else if(/* UD0<=b && */ b<=UD7) {
514                     dynamicWindow=(int8_t)(b-UD0);
515                     isSingleByteMode=true;
516                     cnv->toUBytes[0]=b;
517                     cnv->toULength=1;
518                     state=defineOne;
519                     goto singleByteMode;
520                 } else if(b==UDX) {
521                     isSingleByteMode=true;
522                     cnv->toUBytes[0]=b;
523                     cnv->toULength=1;
524                     state=definePairOne;
525                     goto singleByteMode;
526                 } else if(b==UQU) {
527                     cnv->toUBytes[0]=b;
528                     cnv->toULength=1;
529                     state=quotePairOne;
530                 } else /* Urs */ {
531                     /* callback(illegal) */
532                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
533                     cnv->toUBytes[0]=b;
534                     cnv->toULength=1;
535                     goto endloop;
536                 }
537                 break;
538             case quotePairOne:
539                 byteOne=b;
540                 cnv->toUBytes[1]=b;
541                 cnv->toULength=2;
542                 state=quotePairTwo;
543                 break;
544             case quotePairTwo:
545                 *target++=(char16_t)((byteOne<<8)|b);
546                 if(offsets!=nullptr) {
547                     *offsets++=sourceIndex;
548                 }
549                 sourceIndex=nextSourceIndex;
550                 state=readCommand;
551                 goto fastUnicode;
552             }
553         }
554     }
555 endloop:
556 
557     /* set the converter state back into UConverter */
558     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
559         /* reset to deal with the next character */
560         state=readCommand;
561     } else if(state==readCommand) {
562         /* not in a multi-byte sequence, reset toULength */
563         cnv->toULength=0;
564     }
565     scsu->toUIsSingleByteMode=isSingleByteMode;
566     scsu->toUState=state;
567     scsu->toUQuoteWindow=quoteWindow;
568     scsu->toUDynamicWindow=dynamicWindow;
569     scsu->toUByteOne=byteOne;
570 
571     /* write back the updated pointers */
572     pArgs->source=(const char *)source;
573     pArgs->target=target;
574     pArgs->offsets=offsets;
575 }
576 
577 /*
578  * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
579  * If a change is made in the original function, then either
580  * change this function the same way or
581  * re-copy the original function and remove the variables
582  * offsets, sourceIndex, and nextSourceIndex.
583  */
584 static void U_CALLCONV
_SCSUToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)585 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
586                UErrorCode *pErrorCode) {
587     UConverter *cnv;
588     SCSUData *scsu;
589     const uint8_t *source, *sourceLimit;
590     char16_t *target;
591     const char16_t *targetLimit;
592     UBool isSingleByteMode;
593     uint8_t state, byteOne;
594     int8_t quoteWindow, dynamicWindow;
595 
596     uint8_t b;
597 
598     /* set up the local pointers */
599     cnv=pArgs->converter;
600     scsu=(SCSUData *)cnv->extraInfo;
601 
602     source=(const uint8_t *)pArgs->source;
603     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
604     target=pArgs->target;
605     targetLimit=pArgs->targetLimit;
606 
607     /* get the state machine state */
608     isSingleByteMode=scsu->toUIsSingleByteMode;
609     state=scsu->toUState;
610     quoteWindow=scsu->toUQuoteWindow;
611     dynamicWindow=scsu->toUDynamicWindow;
612     byteOne=scsu->toUByteOne;
613 
614     /*
615      * conversion "loop"
616      *
617      * For performance, this is not a normal C loop.
618      * Instead, there are two code blocks for the two SCSU modes.
619      * The function branches to either one, and a change of the mode is done with a goto to
620      * the other branch.
621      *
622      * Each branch has two conventional loops:
623      * - a fast-path loop for the most common codes in the mode
624      * - a loop for all other codes in the mode
625      * When the fast-path runs into a code that it cannot handle, its loop ends and it
626      * runs into the following loop to handle the other codes.
627      * The end of the input or output buffer is also handled by the slower loop.
628      * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
629      *
630      * The callback handling is done by returning with an error code.
631      * The conversion framework actually calls the callback function.
632      */
633     if(isSingleByteMode) {
634         /* fast path for single-byte mode */
635         if(state==readCommand) {
636 fastSingle:
637             while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
638                 ++source;
639                 if(b<=0x7f) {
640                     /* write US-ASCII graphic character or DEL */
641                     *target++=(char16_t)b;
642                 } else {
643                     /* write from dynamic window */
644                     uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
645                     if(c<=0xffff) {
646                         *target++=(char16_t)c;
647                     } else {
648                         /* output surrogate pair */
649                         *target++=(char16_t)(0xd7c0+(c>>10));
650                         if(target<targetLimit) {
651                             *target++=(char16_t)(0xdc00|(c&0x3ff));
652                         } else {
653                             /* target overflow */
654                             cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
655                             cnv->UCharErrorBufferLength=1;
656                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
657                             goto endloop;
658                         }
659                     }
660                 }
661             }
662         }
663 
664         /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
665 singleByteMode:
666         while(source<sourceLimit) {
667             if(target>=targetLimit) {
668                 /* target is full */
669                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
670                 break;
671             }
672             b=*source++;
673             switch(state) {
674             case readCommand:
675                 /* redundant conditions are commented out */
676                 /* here: b<0x20 because otherwise we would be in fastSingle */
677                 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
678                     /* CR/LF/TAB/NUL */
679                     *target++=(char16_t)b;
680                     goto fastSingle;
681                 } else if(SC0<=b) {
682                     if(b<=SC7) {
683                         dynamicWindow=(int8_t)(b-SC0);
684                         goto fastSingle;
685                     } else /* if(SD0<=b && b<=SD7) */ {
686                         dynamicWindow=(int8_t)(b-SD0);
687                         state=defineOne;
688                     }
689                 } else if(/* SQ0<=b && */ b<=SQ7) {
690                     quoteWindow=(int8_t)(b-SQ0);
691                     state=quoteOne;
692                 } else if(b==SDX) {
693                     state=definePairOne;
694                 } else if(b==SQU) {
695                     state=quotePairOne;
696                 } else if(b==SCU) {
697                     isSingleByteMode=false;
698                     goto fastUnicode;
699                 } else /* Srs */ {
700                     /* callback(illegal) */
701                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
702                     cnv->toUBytes[0]=b;
703                     cnv->toULength=1;
704                     goto endloop;
705                 }
706 
707                 /* store the first byte of a multibyte sequence in toUBytes[] */
708                 cnv->toUBytes[0]=b;
709                 cnv->toULength=1;
710                 break;
711             case quotePairOne:
712                 byteOne=b;
713                 cnv->toUBytes[1]=b;
714                 cnv->toULength=2;
715                 state=quotePairTwo;
716                 break;
717             case quotePairTwo:
718                 *target++=(char16_t)((byteOne<<8)|b);
719                 state=readCommand;
720                 goto fastSingle;
721             case quoteOne:
722                 if(b<0x80) {
723                     /* all static offsets are in the BMP */
724                     *target++=(char16_t)(staticOffsets[quoteWindow]+b);
725                 } else {
726                     /* write from dynamic window */
727                     uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
728                     if(c<=0xffff) {
729                         *target++=(char16_t)c;
730                     } else {
731                         /* output surrogate pair */
732                         *target++=(char16_t)(0xd7c0+(c>>10));
733                         if(target<targetLimit) {
734                             *target++=(char16_t)(0xdc00|(c&0x3ff));
735                         } else {
736                             /* target overflow */
737                             cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
738                             cnv->UCharErrorBufferLength=1;
739                             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
740                             goto endloop;
741                         }
742                     }
743                 }
744                 state=readCommand;
745                 goto fastSingle;
746             case definePairOne:
747                 dynamicWindow=(int8_t)((b>>5)&7);
748                 byteOne=(uint8_t)(b&0x1f);
749                 cnv->toUBytes[1]=b;
750                 cnv->toULength=2;
751                 state=definePairTwo;
752                 break;
753             case definePairTwo:
754                 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
755                 state=readCommand;
756                 goto fastSingle;
757             case defineOne:
758                 if(b==0) {
759                     /* callback(illegal): Reserved window offset value 0 */
760                     cnv->toUBytes[1]=b;
761                     cnv->toULength=2;
762                     goto endloop;
763                 } else if(b<gapThreshold) {
764                     scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
765                 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
766                     scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
767                 } else if(b>=fixedThreshold) {
768                     scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
769                 } else {
770                     /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
771                     cnv->toUBytes[1]=b;
772                     cnv->toULength=2;
773                     goto endloop;
774                 }
775                 state=readCommand;
776                 goto fastSingle;
777             }
778         }
779     } else {
780         /* fast path for Unicode mode */
781         if(state==readCommand) {
782 fastUnicode:
783             while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
784                 *target++=(char16_t)((b<<8)|source[1]);
785                 source+=2;
786             }
787         }
788 
789         /* normal state machine for Unicode mode */
790 /* unicodeByteMode: */
791         while(source<sourceLimit) {
792             if(target>=targetLimit) {
793                 /* target is full */
794                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
795                 break;
796             }
797             b=*source++;
798             switch(state) {
799             case readCommand:
800                 if((uint8_t)(b-UC0)>(Urs-UC0)) {
801                     byteOne=b;
802                     cnv->toUBytes[0]=b;
803                     cnv->toULength=1;
804                     state=quotePairTwo;
805                 } else if(/* UC0<=b && */ b<=UC7) {
806                     dynamicWindow=(int8_t)(b-UC0);
807                     isSingleByteMode=true;
808                     goto fastSingle;
809                 } else if(/* UD0<=b && */ b<=UD7) {
810                     dynamicWindow=(int8_t)(b-UD0);
811                     isSingleByteMode=true;
812                     cnv->toUBytes[0]=b;
813                     cnv->toULength=1;
814                     state=defineOne;
815                     goto singleByteMode;
816                 } else if(b==UDX) {
817                     isSingleByteMode=true;
818                     cnv->toUBytes[0]=b;
819                     cnv->toULength=1;
820                     state=definePairOne;
821                     goto singleByteMode;
822                 } else if(b==UQU) {
823                     cnv->toUBytes[0]=b;
824                     cnv->toULength=1;
825                     state=quotePairOne;
826                 } else /* Urs */ {
827                     /* callback(illegal) */
828                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
829                     cnv->toUBytes[0]=b;
830                     cnv->toULength=1;
831                     goto endloop;
832                 }
833                 break;
834             case quotePairOne:
835                 byteOne=b;
836                 cnv->toUBytes[1]=b;
837                 cnv->toULength=2;
838                 state=quotePairTwo;
839                 break;
840             case quotePairTwo:
841                 *target++=(char16_t)((byteOne<<8)|b);
842                 state=readCommand;
843                 goto fastUnicode;
844             }
845         }
846     }
847 endloop:
848 
849     /* set the converter state back into UConverter */
850     if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
851         /* reset to deal with the next character */
852         state=readCommand;
853     } else if(state==readCommand) {
854         /* not in a multi-byte sequence, reset toULength */
855         cnv->toULength=0;
856     }
857     scsu->toUIsSingleByteMode=isSingleByteMode;
858     scsu->toUState=state;
859     scsu->toUQuoteWindow=quoteWindow;
860     scsu->toUDynamicWindow=dynamicWindow;
861     scsu->toUByteOne=byteOne;
862 
863     /* write back the updated pointers */
864     pArgs->source=(const char *)source;
865     pArgs->target=target;
866 }
867 U_CDECL_END
/* SCSU-from-Unicode conversion functions ----------------------------------- */

/*
 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
 * reasonable results. The lookahead is minimal.
 * Many cases are simple:
 * A character fits directly into the current mode, a dynamic or static window,
 * or is not compressible. These cases are tested first.
 * Real compression heuristics are applied to the rest, in code branches for
 * single/Unicode mode and BMP/supplementary code points.
 * The heuristics used here are extremely simple.
 */
880 
/*
 * Get the number of the window that this character is in, or -1.
 *
 * offsets: start offsets of the eight windows to search
 * c:       code point to look up
 *
 * Returns the index (0..7) of the first window i with
 * offsets[i]<=c<=offsets[i]+0x7f, or -1 if no window contains c.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;
    for(i=0; i<8; ++i) {
        /*
         * The unsigned difference wraps to a huge value when c<offsets[i],
         * so this single comparison checks both window bounds.
         */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)(i);
        }
    }
    return -1;
}
892 
893 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
894 static UBool
isInOffsetWindowOrDirect(uint32_t offset,uint32_t c)895 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
896     return (UBool)(c<=offset+0x7f &&
897           (c>=offset || (c<=0x7f &&
898                         (c>=0x20 || (1UL<<c)&0x2601))));
899                                 /* binary 0010 0110 0000 0001,
900                                    check for b==0xd || b==0xa || b==9 || b==0 */
901 }
902 
903 /*
904  * getNextDynamicWindow returns the next dynamic window to be redefined
905  */
906 static int8_t
getNextDynamicWindow(SCSUData * scsu)907 getNextDynamicWindow(SCSUData *scsu) {
908     int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
909     if(++scsu->nextWindowUseIndex==8) {
910         scsu->nextWindowUseIndex=0;
911     }
912     return window;
913 }
914 
915 /*
916  * useDynamicWindow() adjusts
917  * windowUse[] and nextWindowUseIndex for the algorithm to choose
918  * the next dynamic window to be defined;
919  * a subclass may override it and provide its own algorithm.
920  */
921 static void
useDynamicWindow(SCSUData * scsu,int8_t window)922 useDynamicWindow(SCSUData *scsu, int8_t window) {
923     /*
924      * move the existing window, which just became the most recently used one,
925      * up in windowUse[] to nextWindowUseIndex-1
926      */
927 
928     /* first, find the index of the window - backwards to favor the more recently used windows */
929     int i, j;
930 
931     i=scsu->nextWindowUseIndex;
932     do {
933         if(--i<0) {
934             i=7;
935         }
936     } while(scsu->windowUse[i]!=window);
937 
938     /* now copy each windowUse[i+1] to [i] */
939     j=i+1;
940     if(j==8) {
941         j=0;
942     }
943     while(j!=scsu->nextWindowUseIndex) {
944         scsu->windowUse[i]=scsu->windowUse[j];
945         i=j;
946         if(++j==8) { j=0; }
947     }
948 
949     /* finally, set the window into the most recently used index */
950     scsu->windowUse[i]=window;
951 }
952 
953 /*
954  * calculate the offset and the code for a dynamic window that contains the character
955  * takes fixed offsets into account
956  * the offset of the window is stored in the offset variable,
957  * the code is returned
958  *
959  * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
960  */
961 static int
getDynamicOffset(uint32_t c,uint32_t * pOffset)962 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
963     int i;
964 
965     for(i=0; i<7; ++i) {
966         if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
967             *pOffset=fixedOffsets[i];
968             return 0xf9+i;
969         }
970     }
971 
972     if(c<0x80) {
973         /* No dynamic window for US-ASCII. */
974         return -1;
975     } else if(c<0x3400 ||
976               (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
977               (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
978     ) {
979         /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
980         *pOffset=c&0x7fffff80;
981         return (int)(c>>7);
982     } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
983         /* For these characters we need to take the gapOffset into account. */
984         *pOffset=c&0x7fffff80;
985         return (int)((c-gapOffset)>>7);
986     } else {
987         return -1;
988     }
989 }
990 U_CDECL_BEGIN
/*
 * Idea for compression:
 *  - save SCSUData and other state before really starting work
 *  - at endloop, see if compression could be better with just Unicode mode
 *  - don't do this if a callback has been called
 *  - if Unicode mode would be smaller, then override the results with it - may need SCU at the beginning
 *  - different buffer handling!
 *
 * Drawback or need for corrective handling:
 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible,
 * not only for compression but also for HTML/XML documents in which
 * charset/encoding announcers follow.
 *
 * How to achieve both?
 *  - Only replace the result after an SDX or SCU?
 */
1007 
1008 static void U_CALLCONV
_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1009 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1010                             UErrorCode *pErrorCode) {
1011     UConverter *cnv;
1012     SCSUData *scsu;
1013     const char16_t *source, *sourceLimit;
1014     uint8_t *target;
1015     int32_t targetCapacity;
1016     int32_t *offsets;
1017 
1018     UBool isSingleByteMode;
1019     uint8_t dynamicWindow;
1020     uint32_t currentOffset;
1021 
1022     uint32_t c, delta;
1023 
1024     int32_t sourceIndex, nextSourceIndex;
1025 
1026     int32_t length;
1027 
1028     /* variables for compression heuristics */
1029     uint32_t offset;
1030     char16_t lead, trail;
1031     int code;
1032     int8_t window;
1033 
1034     /* set up the local pointers */
1035     cnv=pArgs->converter;
1036     scsu=(SCSUData *)cnv->extraInfo;
1037 
1038     /* set up the local pointers */
1039     source=pArgs->source;
1040     sourceLimit=pArgs->sourceLimit;
1041     target=(uint8_t *)pArgs->target;
1042     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1043     offsets=pArgs->offsets;
1044 
1045     /* get the state machine state */
1046     isSingleByteMode=scsu->fromUIsSingleByteMode;
1047     dynamicWindow=scsu->fromUDynamicWindow;
1048     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1049 
1050     c=cnv->fromUChar32;
1051 
1052     /* sourceIndex=-1 if the current character began in the previous buffer */
1053     sourceIndex= c==0 ? 0 : -1;
1054     nextSourceIndex=0;
1055 
1056     /* similar conversion "loop" as in toUnicode */
1057 loop:
1058     if(isSingleByteMode) {
1059         if(c!=0 && targetCapacity>0) {
1060             goto getTrailSingle;
1061         }
1062 
1063         /* state machine for single-byte mode */
1064 /* singleByteMode: */
1065         while(source<sourceLimit) {
1066             if(targetCapacity<=0) {
1067                 /* target is full */
1068                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1069                 break;
1070             }
1071             c=*source++;
1072             ++nextSourceIndex;
1073 
1074             if((c-0x20)<=0x5f) {
1075                 /* pass US-ASCII graphic character through */
1076                 *target++=(uint8_t)c;
1077                 if(offsets!=nullptr) {
1078                     *offsets++=sourceIndex;
1079                 }
1080                 --targetCapacity;
1081             } else if(c<0x20) {
1082                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1083                     /* CR/LF/TAB/NUL */
1084                     *target++=(uint8_t)c;
1085                     if(offsets!=nullptr) {
1086                         *offsets++=sourceIndex;
1087                     }
1088                     --targetCapacity;
1089                 } else {
1090                     /* quote C0 control character */
1091                     c|=SQ0<<8;
1092                     length=2;
1093                     goto outputBytes;
1094                 }
1095             } else if((delta=c-currentOffset)<=0x7f) {
1096                 /* use the current dynamic window */
1097                 *target++=(uint8_t)(delta|0x80);
1098                 if(offsets!=nullptr) {
1099                     *offsets++=sourceIndex;
1100                 }
1101                 --targetCapacity;
1102             } else if(U16_IS_SURROGATE(c)) {
1103                 if(U16_IS_SURROGATE_LEAD(c)) {
1104 getTrailSingle:
1105                     lead=(char16_t)c;
1106                     if(source<sourceLimit) {
1107                         /* test the following code unit */
1108                         trail=*source;
1109                         if(U16_IS_TRAIL(trail)) {
1110                             ++source;
1111                             ++nextSourceIndex;
1112                             c=U16_GET_SUPPLEMENTARY(c, trail);
1113                             /* convert this surrogate code point */
1114                             /* exit this condition tree */
1115                         } else {
1116                             /* this is an unmatched lead code unit (1st surrogate) */
1117                             /* callback(illegal) */
1118                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1119                             goto endloop;
1120                         }
1121                     } else {
1122                         /* no more input */
1123                         break;
1124                     }
1125                 } else {
1126                     /* this is an unmatched trail code unit (2nd surrogate) */
1127                     /* callback(illegal) */
1128                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1129                     goto endloop;
1130                 }
1131 
1132                 /* compress supplementary character U+10000..U+10ffff */
1133                 if((delta=c-currentOffset)<=0x7f) {
1134                     /* use the current dynamic window */
1135                     *target++=(uint8_t)(delta|0x80);
1136                     if(offsets!=nullptr) {
1137                         *offsets++=sourceIndex;
1138                     }
1139                     --targetCapacity;
1140                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1141                     /* there is a dynamic window that contains this character, change to it */
1142                     dynamicWindow=window;
1143                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1144                     useDynamicWindow(scsu, dynamicWindow);
1145                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1146                     length=2;
1147                     goto outputBytes;
1148                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1149                     /* might check if there are more characters in this window to come */
1150                     /* define an extended window with this character */
1151                     code-=0x200;
1152                     dynamicWindow=getNextDynamicWindow(scsu);
1153                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1154                     useDynamicWindow(scsu, dynamicWindow);
1155                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1156                     length=4;
1157                     goto outputBytes;
1158                 } else {
1159                     /* change to Unicode mode and output this (lead, trail) pair */
1160                     isSingleByteMode=false;
1161                     *target++=(uint8_t)SCU;
1162                     if(offsets!=nullptr) {
1163                         *offsets++=sourceIndex;
1164                     }
1165                     --targetCapacity;
1166                     c=((uint32_t)lead<<16)|trail;
1167                     length=4;
1168                     goto outputBytes;
1169                 }
1170             } else if(c<0xa0) {
1171                 /* quote C1 control character */
1172                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1173                 length=2;
1174                 goto outputBytes;
1175             } else if(c==0xfeff || c>=0xfff0) {
1176                 /* quote signature character=byte order mark and specials */
1177                 c|=SQU<<16;
1178                 length=3;
1179                 goto outputBytes;
1180             } else {
1181                 /* compress all other BMP characters */
1182                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1183                     /* there is a window defined that contains this character - switch to it or quote from it? */
1184                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1185                         /* change to dynamic window */
1186                         dynamicWindow=window;
1187                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1188                         useDynamicWindow(scsu, dynamicWindow);
1189                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1190                         length=2;
1191                         goto outputBytes;
1192                     } else {
1193                         /* quote from dynamic window */
1194                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1195                         length=2;
1196                         goto outputBytes;
1197                     }
1198                 } else if((window=getWindow(staticOffsets, c))>=0) {
1199                     /* quote from static window */
1200                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1201                     length=2;
1202                     goto outputBytes;
1203                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1204                     /* define a dynamic window with this character */
1205                     dynamicWindow=getNextDynamicWindow(scsu);
1206                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1207                     useDynamicWindow(scsu, dynamicWindow);
1208                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1209                     length=3;
1210                     goto outputBytes;
1211                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1212                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1213                 ) {
1214                     /*
1215                      * this character is not compressible (a BMP ideograph or similar);
1216                      * switch to Unicode mode if this is the last character in the block
1217                      * or there is at least one more ideograph following immediately
1218                      */
1219                     isSingleByteMode=false;
1220                     c|=SCU<<16;
1221                     length=3;
1222                     goto outputBytes;
1223                 } else {
1224                     /* quote Unicode */
1225                     c|=SQU<<16;
1226                     length=3;
1227                     goto outputBytes;
1228                 }
1229             }
1230 
1231             /* normal end of conversion: prepare for a new character */
1232             c=0;
1233             sourceIndex=nextSourceIndex;
1234         }
1235     } else {
1236         if(c!=0 && targetCapacity>0) {
1237             goto getTrailUnicode;
1238         }
1239 
1240         /* state machine for Unicode mode */
1241 /* unicodeByteMode: */
1242         while(source<sourceLimit) {
1243             if(targetCapacity<=0) {
1244                 /* target is full */
1245                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1246                 break;
1247             }
1248             c=*source++;
1249             ++nextSourceIndex;
1250 
1251             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1252                 /* not compressible, write character directly */
1253                 if(targetCapacity>=2) {
1254                     *target++=(uint8_t)(c>>8);
1255                     *target++=(uint8_t)c;
1256                     if(offsets!=nullptr) {
1257                         *offsets++=sourceIndex;
1258                         *offsets++=sourceIndex;
1259                     }
1260                     targetCapacity-=2;
1261                 } else {
1262                     length=2;
1263                     goto outputBytes;
1264                 }
1265             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1266                 /* compress BMP character if the following one is not an uncompressible ideograph */
1267                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1268                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1269                         /* ASCII digit or letter */
1270                         isSingleByteMode=true;
1271                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1272                         length=2;
1273                         goto outputBytes;
1274                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1275                         /* there is a dynamic window that contains this character, change to it */
1276                         isSingleByteMode=true;
1277                         dynamicWindow=window;
1278                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1279                         useDynamicWindow(scsu, dynamicWindow);
1280                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1281                         length=2;
1282                         goto outputBytes;
1283                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1284                         /* define a dynamic window with this character */
1285                         isSingleByteMode=true;
1286                         dynamicWindow=getNextDynamicWindow(scsu);
1287                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1288                         useDynamicWindow(scsu, dynamicWindow);
1289                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1290                         length=3;
1291                         goto outputBytes;
1292                     }
1293                 }
1294 
1295                 /* don't know how to compress this character, just write it directly */
1296                 length=2;
1297                 goto outputBytes;
1298             } else if(c<0xe000) {
1299                 /* c is a surrogate */
1300                 if(U16_IS_SURROGATE_LEAD(c)) {
1301 getTrailUnicode:
1302                     lead=(char16_t)c;
1303                     if(source<sourceLimit) {
1304                         /* test the following code unit */
1305                         trail=*source;
1306                         if(U16_IS_TRAIL(trail)) {
1307                             ++source;
1308                             ++nextSourceIndex;
1309                             c=U16_GET_SUPPLEMENTARY(c, trail);
1310                             /* convert this surrogate code point */
1311                             /* exit this condition tree */
1312                         } else {
1313                             /* this is an unmatched lead code unit (1st surrogate) */
1314                             /* callback(illegal) */
1315                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1316                             goto endloop;
1317                         }
1318                     } else {
1319                         /* no more input */
1320                         break;
1321                     }
1322                 } else {
1323                     /* this is an unmatched trail code unit (2nd surrogate) */
1324                     /* callback(illegal) */
1325                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1326                     goto endloop;
1327                 }
1328 
1329                 /* compress supplementary character */
1330                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1331                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1332                 ) {
1333                     /*
1334                      * there is a dynamic window that contains this character and
1335                      * the following character is not uncompressible,
1336                      * change to the window
1337                      */
1338                     isSingleByteMode=true;
1339                     dynamicWindow=window;
1340                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1341                     useDynamicWindow(scsu, dynamicWindow);
1342                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1343                     length=2;
1344                     goto outputBytes;
1345                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1346                           (code=getDynamicOffset(c, &offset))>=0
1347                 ) {
1348                     /* two supplementary characters in (probably) the same window - define an extended one */
1349                     isSingleByteMode=true;
1350                     code-=0x200;
1351                     dynamicWindow=getNextDynamicWindow(scsu);
1352                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1353                     useDynamicWindow(scsu, dynamicWindow);
1354                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1355                     length=4;
1356                     goto outputBytes;
1357                 } else {
1358                     /* don't know how to compress this character, just write it directly */
1359                     c=((uint32_t)lead<<16)|trail;
1360                     length=4;
1361                     goto outputBytes;
1362                 }
1363             } else /* 0xe000<=c<0xf300 */ {
1364                 /* quote to avoid SCSU tags */
1365                 c|=UQU<<16;
1366                 length=3;
1367                 goto outputBytes;
1368             }
1369 
1370             /* normal end of conversion: prepare for a new character */
1371             c=0;
1372             sourceIndex=nextSourceIndex;
1373         }
1374     }
1375 endloop:
1376 
1377     /* set the converter state back into UConverter */
1378     scsu->fromUIsSingleByteMode=isSingleByteMode;
1379     scsu->fromUDynamicWindow=dynamicWindow;
1380 
1381     cnv->fromUChar32=c;
1382 
1383     /* write back the updated pointers */
1384     pArgs->source=source;
1385     pArgs->target=(char *)target;
1386     pArgs->offsets=offsets;
1387     return;
1388 
1389 outputBytes:
1390     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1391     /* from the first if in the loop we know that targetCapacity>0 */
1392     if(length<=targetCapacity) {
1393         if(offsets==nullptr) {
1394             switch(length) {
1395                 /* each branch falls through to the next one */
1396             case 4:
1397                 *target++=(uint8_t)(c>>24);
1398                 U_FALLTHROUGH;
1399             case 3:
1400                 *target++=(uint8_t)(c>>16);
1401                 U_FALLTHROUGH;
1402             case 2:
1403                 *target++=(uint8_t)(c>>8);
1404                 U_FALLTHROUGH;
1405             case 1:
1406                 *target++=(uint8_t)c;
1407                 U_FALLTHROUGH;
1408             default:
1409                 /* will never occur */
1410                 break;
1411             }
1412         } else {
1413             switch(length) {
1414                 /* each branch falls through to the next one */
1415             case 4:
1416                 *target++=(uint8_t)(c>>24);
1417                 *offsets++=sourceIndex;
1418                 U_FALLTHROUGH;
1419             case 3:
1420                 *target++=(uint8_t)(c>>16);
1421                 *offsets++=sourceIndex;
1422                 U_FALLTHROUGH;
1423             case 2:
1424                 *target++=(uint8_t)(c>>8);
1425                 *offsets++=sourceIndex;
1426                 U_FALLTHROUGH;
1427             case 1:
1428                 *target++=(uint8_t)c;
1429                 *offsets++=sourceIndex;
1430                 U_FALLTHROUGH;
1431             default:
1432                 /* will never occur */
1433                 break;
1434             }
1435         }
1436         targetCapacity-=length;
1437 
1438         /* normal end of conversion: prepare for a new character */
1439         c=0;
1440         sourceIndex=nextSourceIndex;
1441         goto loop;
1442     } else {
1443         uint8_t *p;
1444 
1445         /*
1446          * We actually do this backwards here:
1447          * In order to save an intermediate variable, we output
1448          * first to the overflow buffer what does not fit into the
1449          * regular target.
1450          */
1451         /* we know that 0<=targetCapacity<length<=4 */
1452         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1453         length-=targetCapacity;
1454         p=(uint8_t *)cnv->charErrorBuffer;
1455         switch(length) {
1456             /* each branch falls through to the next one */
1457         case 4:
1458             *p++=(uint8_t)(c>>24);
1459             U_FALLTHROUGH;
1460         case 3:
1461             *p++=(uint8_t)(c>>16);
1462             U_FALLTHROUGH;
1463         case 2:
1464             *p++=(uint8_t)(c>>8);
1465             U_FALLTHROUGH;
1466         case 1:
1467             *p=(uint8_t)c;
1468             U_FALLTHROUGH;
1469         default:
1470             /* will never occur */
1471             break;
1472         }
1473         cnv->charErrorBufferLength=(int8_t)length;
1474 
1475         /* now output what fits into the regular target */
1476         c>>=8*length; /* length was reduced by targetCapacity */
1477         switch(targetCapacity) {
1478             /* each branch falls through to the next one */
1479         case 3:
1480             *target++=(uint8_t)(c>>16);
1481             if(offsets!=nullptr) {
1482                 *offsets++=sourceIndex;
1483             }
1484             U_FALLTHROUGH;
1485         case 2:
1486             *target++=(uint8_t)(c>>8);
1487             if(offsets!=nullptr) {
1488                 *offsets++=sourceIndex;
1489             }
1490             U_FALLTHROUGH;
1491         case 1:
1492             *target++=(uint8_t)c;
1493             if(offsets!=nullptr) {
1494                 *offsets++=sourceIndex;
1495             }
1496             U_FALLTHROUGH;
1497         default:
1498             break;
1499         }
1500 
1501         /* target overflow */
1502         targetCapacity=0;
1503         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1504         c=0;
1505         goto endloop;
1506     }
1507 }
1508 
/*
 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
 * If a change is made in the original function, then either
 * change this function the same way or
 * re-copy the original function and remove the variables
 * offsets, sourceIndex, and nextSourceIndex.
 */
1516 static void U_CALLCONV
_SCSUFromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1517 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1518                  UErrorCode *pErrorCode) {
1519     UConverter *cnv;
1520     SCSUData *scsu;
1521     const char16_t *source, *sourceLimit;
1522     uint8_t *target;
1523     int32_t targetCapacity;
1524 
1525     UBool isSingleByteMode;
1526     uint8_t dynamicWindow;
1527     uint32_t currentOffset;
1528 
1529     uint32_t c, delta;
1530 
1531     int32_t length;
1532 
1533     /* variables for compression heuristics */
1534     uint32_t offset;
1535     char16_t lead, trail;
1536     int code;
1537     int8_t window;
1538 
1539     /* set up the local pointers */
1540     cnv=pArgs->converter;
1541     scsu=(SCSUData *)cnv->extraInfo;
1542 
1543     /* set up the local pointers */
1544     source=pArgs->source;
1545     sourceLimit=pArgs->sourceLimit;
1546     target=(uint8_t *)pArgs->target;
1547     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1548 
1549     /* get the state machine state */
1550     isSingleByteMode=scsu->fromUIsSingleByteMode;
1551     dynamicWindow=scsu->fromUDynamicWindow;
1552     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1553 
1554     c=cnv->fromUChar32;
1555 
1556     /* similar conversion "loop" as in toUnicode */
1557 loop:
1558     if(isSingleByteMode) {
1559         if(c!=0 && targetCapacity>0) {
1560             goto getTrailSingle;
1561         }
1562 
1563         /* state machine for single-byte mode */
1564 /* singleByteMode: */
1565         while(source<sourceLimit) {
1566             if(targetCapacity<=0) {
1567                 /* target is full */
1568                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1569                 break;
1570             }
1571             c=*source++;
1572 
1573             if((c-0x20)<=0x5f) {
1574                 /* pass US-ASCII graphic character through */
1575                 *target++=(uint8_t)c;
1576                 --targetCapacity;
1577             } else if(c<0x20) {
1578                 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1579                     /* CR/LF/TAB/NUL */
1580                     *target++=(uint8_t)c;
1581                     --targetCapacity;
1582                 } else {
1583                     /* quote C0 control character */
1584                     c|=SQ0<<8;
1585                     length=2;
1586                     goto outputBytes;
1587                 }
1588             } else if((delta=c-currentOffset)<=0x7f) {
1589                 /* use the current dynamic window */
1590                 *target++=(uint8_t)(delta|0x80);
1591                 --targetCapacity;
1592             } else if(U16_IS_SURROGATE(c)) {
1593                 if(U16_IS_SURROGATE_LEAD(c)) {
1594 getTrailSingle:
1595                     lead=(char16_t)c;
1596                     if(source<sourceLimit) {
1597                         /* test the following code unit */
1598                         trail=*source;
1599                         if(U16_IS_TRAIL(trail)) {
1600                             ++source;
1601                             c=U16_GET_SUPPLEMENTARY(c, trail);
1602                             /* convert this surrogate code point */
1603                             /* exit this condition tree */
1604                         } else {
1605                             /* this is an unmatched lead code unit (1st surrogate) */
1606                             /* callback(illegal) */
1607                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1608                             goto endloop;
1609                         }
1610                     } else {
1611                         /* no more input */
1612                         break;
1613                     }
1614                 } else {
1615                     /* this is an unmatched trail code unit (2nd surrogate) */
1616                     /* callback(illegal) */
1617                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1618                     goto endloop;
1619                 }
1620 
1621                 /* compress supplementary character U+10000..U+10ffff */
1622                 if((delta=c-currentOffset)<=0x7f) {
1623                     /* use the current dynamic window */
1624                     *target++=(uint8_t)(delta|0x80);
1625                     --targetCapacity;
1626                 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1627                     /* there is a dynamic window that contains this character, change to it */
1628                     dynamicWindow=window;
1629                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1630                     useDynamicWindow(scsu, dynamicWindow);
1631                     c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1632                     length=2;
1633                     goto outputBytes;
1634                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1635                     /* might check if there are more characters in this window to come */
1636                     /* define an extended window with this character */
1637                     code-=0x200;
1638                     dynamicWindow=getNextDynamicWindow(scsu);
1639                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1640                     useDynamicWindow(scsu, dynamicWindow);
1641                     c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1642                     length=4;
1643                     goto outputBytes;
1644                 } else {
1645                     /* change to Unicode mode and output this (lead, trail) pair */
1646                     isSingleByteMode=false;
1647                     *target++=(uint8_t)SCU;
1648                     --targetCapacity;
1649                     c=((uint32_t)lead<<16)|trail;
1650                     length=4;
1651                     goto outputBytes;
1652                 }
1653             } else if(c<0xa0) {
1654                 /* quote C1 control character */
1655                 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1656                 length=2;
1657                 goto outputBytes;
1658             } else if(c==0xfeff || c>=0xfff0) {
1659                 /* quote signature character=byte order mark and specials */
1660                 c|=SQU<<16;
1661                 length=3;
1662                 goto outputBytes;
1663             } else {
1664                 /* compress all other BMP characters */
1665                 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1666                     /* there is a window defined that contains this character - switch to it or quote from it? */
1667                     if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1668                         /* change to dynamic window */
1669                         dynamicWindow=window;
1670                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1671                         useDynamicWindow(scsu, dynamicWindow);
1672                         c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1673                         length=2;
1674                         goto outputBytes;
1675                     } else {
1676                         /* quote from dynamic window */
1677                         c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1678                         length=2;
1679                         goto outputBytes;
1680                     }
1681                 } else if((window=getWindow(staticOffsets, c))>=0) {
1682                     /* quote from static window */
1683                     c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1684                     length=2;
1685                     goto outputBytes;
1686                 } else if((code=getDynamicOffset(c, &offset))>=0) {
1687                     /* define a dynamic window with this character */
1688                     dynamicWindow=getNextDynamicWindow(scsu);
1689                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1690                     useDynamicWindow(scsu, dynamicWindow);
1691                     c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1692                     length=3;
1693                     goto outputBytes;
1694                 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1695                           (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1696                 ) {
1697                     /*
1698                      * this character is not compressible (a BMP ideograph or similar);
1699                      * switch to Unicode mode if this is the last character in the block
1700                      * or there is at least one more ideograph following immediately
1701                      */
1702                     isSingleByteMode=false;
1703                     c|=SCU<<16;
1704                     length=3;
1705                     goto outputBytes;
1706                 } else {
1707                     /* quote Unicode */
1708                     c|=SQU<<16;
1709                     length=3;
1710                     goto outputBytes;
1711                 }
1712             }
1713 
1714             /* normal end of conversion: prepare for a new character */
1715             c=0;
1716         }
1717     } else {
1718         if(c!=0 && targetCapacity>0) {
1719             goto getTrailUnicode;
1720         }
1721 
1722         /* state machine for Unicode mode */
1723 /* unicodeByteMode: */
1724         while(source<sourceLimit) {
1725             if(targetCapacity<=0) {
1726                 /* target is full */
1727                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1728                 break;
1729             }
1730             c=*source++;
1731 
1732             if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1733                 /* not compressible, write character directly */
1734                 if(targetCapacity>=2) {
1735                     *target++=(uint8_t)(c>>8);
1736                     *target++=(uint8_t)c;
1737                     targetCapacity-=2;
1738                 } else {
1739                     length=2;
1740                     goto outputBytes;
1741                 }
1742             } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1743                 /* compress BMP character if the following one is not an uncompressible ideograph */
1744                 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1745                     if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1746                         /* ASCII digit or letter */
1747                         isSingleByteMode=true;
1748                         c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1749                         length=2;
1750                         goto outputBytes;
1751                     } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1752                         /* there is a dynamic window that contains this character, change to it */
1753                         isSingleByteMode=true;
1754                         dynamicWindow=window;
1755                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1756                         useDynamicWindow(scsu, dynamicWindow);
1757                         c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1758                         length=2;
1759                         goto outputBytes;
1760                     } else if((code=getDynamicOffset(c, &offset))>=0) {
1761                         /* define a dynamic window with this character */
1762                         isSingleByteMode=true;
1763                         dynamicWindow=getNextDynamicWindow(scsu);
1764                         currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1765                         useDynamicWindow(scsu, dynamicWindow);
1766                         c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1767                         length=3;
1768                         goto outputBytes;
1769                     }
1770                 }
1771 
1772                 /* don't know how to compress this character, just write it directly */
1773                 length=2;
1774                 goto outputBytes;
1775             } else if(c<0xe000) {
1776                 /* c is a surrogate */
1777                 if(U16_IS_SURROGATE_LEAD(c)) {
1778 getTrailUnicode:
1779                     lead=(char16_t)c;
1780                     if(source<sourceLimit) {
1781                         /* test the following code unit */
1782                         trail=*source;
1783                         if(U16_IS_TRAIL(trail)) {
1784                             ++source;
1785                             c=U16_GET_SUPPLEMENTARY(c, trail);
1786                             /* convert this surrogate code point */
1787                             /* exit this condition tree */
1788                         } else {
1789                             /* this is an unmatched lead code unit (1st surrogate) */
1790                             /* callback(illegal) */
1791                             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1792                             goto endloop;
1793                         }
1794                     } else {
1795                         /* no more input */
1796                         break;
1797                     }
1798                 } else {
1799                     /* this is an unmatched trail code unit (2nd surrogate) */
1800                     /* callback(illegal) */
1801                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1802                     goto endloop;
1803                 }
1804 
1805                 /* compress supplementary character */
1806                 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1807                     !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1808                 ) {
1809                     /*
1810                      * there is a dynamic window that contains this character and
1811                      * the following character is not uncompressible,
1812                      * change to the window
1813                      */
1814                     isSingleByteMode=true;
1815                     dynamicWindow=window;
1816                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1817                     useDynamicWindow(scsu, dynamicWindow);
1818                     c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1819                     length=2;
1820                     goto outputBytes;
1821                 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1822                           (code=getDynamicOffset(c, &offset))>=0
1823                 ) {
1824                     /* two supplementary characters in (probably) the same window - define an extended one */
1825                     isSingleByteMode=true;
1826                     code-=0x200;
1827                     dynamicWindow=getNextDynamicWindow(scsu);
1828                     currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1829                     useDynamicWindow(scsu, dynamicWindow);
1830                     c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1831                     length=4;
1832                     goto outputBytes;
1833                 } else {
1834                     /* don't know how to compress this character, just write it directly */
1835                     c=((uint32_t)lead<<16)|trail;
1836                     length=4;
1837                     goto outputBytes;
1838                 }
1839             } else /* 0xe000<=c<0xf300 */ {
1840                 /* quote to avoid SCSU tags */
1841                 c|=UQU<<16;
1842                 length=3;
1843                 goto outputBytes;
1844             }
1845 
1846             /* normal end of conversion: prepare for a new character */
1847             c=0;
1848         }
1849     }
1850 endloop:
1851 
1852     /* set the converter state back into UConverter */
1853     scsu->fromUIsSingleByteMode=isSingleByteMode;
1854     scsu->fromUDynamicWindow=dynamicWindow;
1855 
1856     cnv->fromUChar32=c;
1857 
1858     /* write back the updated pointers */
1859     pArgs->source=source;
1860     pArgs->target=(char *)target;
1861     return;
1862 
1863 outputBytes:
1864     /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1865     /* from the first if in the loop we know that targetCapacity>0 */
1866     if(length<=targetCapacity) {
1867         switch(length) {
1868             /* each branch falls through to the next one */
1869         case 4:
1870             *target++=(uint8_t)(c>>24);
1871             U_FALLTHROUGH;
1872         case 3:
1873             *target++=(uint8_t)(c>>16);
1874             U_FALLTHROUGH;
1875         case 2:
1876             *target++=(uint8_t)(c>>8);
1877             U_FALLTHROUGH;
1878         case 1:
1879             *target++=(uint8_t)c;
1880             U_FALLTHROUGH;
1881         default:
1882             /* will never occur */
1883             break;
1884         }
1885         targetCapacity-=length;
1886 
1887         /* normal end of conversion: prepare for a new character */
1888         c=0;
1889         goto loop;
1890     } else {
1891         uint8_t *p;
1892 
1893         /*
1894          * We actually do this backwards here:
1895          * In order to save an intermediate variable, we output
1896          * first to the overflow buffer what does not fit into the
1897          * regular target.
1898          */
1899         /* we know that 0<=targetCapacity<length<=4 */
1900         /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1901         length-=targetCapacity;
1902         p=(uint8_t *)cnv->charErrorBuffer;
1903         switch(length) {
1904             /* each branch falls through to the next one */
1905         case 4:
1906             *p++=(uint8_t)(c>>24);
1907             U_FALLTHROUGH;
1908         case 3:
1909             *p++=(uint8_t)(c>>16);
1910             U_FALLTHROUGH;
1911         case 2:
1912             *p++=(uint8_t)(c>>8);
1913             U_FALLTHROUGH;
1914         case 1:
1915             *p=(uint8_t)c;
1916             U_FALLTHROUGH;
1917         default:
1918             /* will never occur */
1919             break;
1920         }
1921         cnv->charErrorBufferLength=(int8_t)length;
1922 
1923         /* now output what fits into the regular target */
1924         c = (length == 4) ? 0 : c >> 8*length; /* length was reduced by targetCapacity */
1925         switch(targetCapacity) {
1926             /* each branch falls through to the next one */
1927         case 3:
1928             *target++=(uint8_t)(c>>16);
1929             U_FALLTHROUGH;
1930         case 2:
1931             *target++=(uint8_t)(c>>8);
1932             U_FALLTHROUGH;
1933         case 1:
1934             *target++=(uint8_t)c;
1935             U_FALLTHROUGH;
1936         default:
1937             break;
1938         }
1939 
1940         /* target overflow */
1941         targetCapacity=0;
1942         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1943         c=0;
1944         goto endloop;
1945     }
1946 }
1947 
1948 /* miscellaneous ------------------------------------------------------------ */
1949 
1950 static const char *  U_CALLCONV
_SCSUGetName(const UConverter * cnv)1951 _SCSUGetName(const UConverter *cnv) {
1952     SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1953 
1954     switch(scsu->locale) {
1955     case l_ja:
1956         return "SCSU,locale=ja";
1957     default:
1958         return "SCSU";
1959     }
1960 }
1961 
1962 /* structure for SafeClone calculations */
1963 struct cloneSCSUStruct
1964 {
1965     UConverter cnv;
1966     SCSUData mydata;
1967 };
1968 
1969 static UConverter *  U_CALLCONV
_SCSUSafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)1970 _SCSUSafeClone(const UConverter *cnv,
1971                void *stackBuffer,
1972                int32_t *pBufferSize,
1973                UErrorCode *status)
1974 {
1975     struct cloneSCSUStruct * localClone;
1976     int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
1977 
1978     if (U_FAILURE(*status)){
1979         return nullptr;
1980     }
1981 
1982     if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1983         *pBufferSize = bufferSizeNeeded;
1984         return nullptr;
1985     }
1986 
1987     localClone = (struct cloneSCSUStruct *)stackBuffer;
1988     /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1989 
1990     uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1991     localClone->cnv.extraInfo = &localClone->mydata;
1992     localClone->cnv.isExtraLocal = true;
1993 
1994     return &localClone->cnv;
1995 }
1996 U_CDECL_END
1997 
1998 static const UConverterImpl _SCSUImpl={
1999     UCNV_SCSU,
2000 
2001     nullptr,
2002     nullptr,
2003 
2004     _SCSUOpen,
2005     _SCSUClose,
2006     _SCSUReset,
2007 
2008     _SCSUToUnicode,
2009     _SCSUToUnicodeWithOffsets,
2010     _SCSUFromUnicode,
2011     _SCSUFromUnicodeWithOffsets,
2012     nullptr,
2013 
2014     nullptr,
2015     _SCSUGetName,
2016     nullptr,
2017     _SCSUSafeClone,
2018     ucnv_getCompleteUnicodeSet,
2019     nullptr,
2020     nullptr
2021 };
2022 
2023 static const UConverterStaticData _SCSUStaticData={
2024     sizeof(UConverterStaticData),
2025     "SCSU",
2026     1212, /* CCSID for SCSU */
2027     UCNV_IBM, UCNV_SCSU,
2028     1, 3, /* one char16_t generates at least 1 byte and at most 3 bytes */
2029     /*
2030      * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2031      * substitution string.
2032      */
2033     { 0x0e, 0xff, 0xfd, 0 }, 3,
2034     false, false,
2035     0,
2036     0,
2037     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2038 };
2039 
2040 const UConverterSharedData _SCSUData=
2041         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
2042 
2043 #endif
2044