1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2000-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ucnvscsu.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2000nov18
14 * created by: Markus W. Scherer
15 *
16 * This is an implementation of the Standard Compression Scheme for Unicode
17 * as defined in http://www.unicode.org/unicode/reports/tr6/ .
18 * Reserved commands and window settings are treated as illegal sequences and
19 * will result in callback calls.
20 */
21
22 #include "unicode/utypes.h"
23
24 #if !UCONFIG_NO_CONVERSION
25
26 #include "unicode/ucnv.h"
27 #include "unicode/ucnv_cb.h"
28 #include "ucnv_bld.h"
29 #include "ucnv_cnv.h"
30 #include "cmemory.h"
31
32 /* SCSU definitions --------------------------------------------------------- */
33
/*
 * SCSU tag (command) byte values.
 * The S* tags are the ones tested in single-byte mode, the U* tags the ones
 * tested in Unicode mode. For tags that come in groups of eight, only the
 * first (..0) and the last (..7) value of the group is named.
 */
enum {
    /* single-byte mode tags */
    SQ0=0x01,   /* quote from window pair 0 */
    SQ7=0x08,   /* quote from window pair 7 */
    SDX=0x0b,   /* define a window as extended */
    Srs=0x0c,   /* reserved */
    SQU=0x0e,   /* quote a single Unicode character */
    SCU=0x0f,   /* change to Unicode mode */
    SC0=0x10,   /* select window 0 */
    SC7=0x17,   /* select window 7 */
    SD0=0x18,   /* define and select window 0 */
    SD7=0x1f,   /* define and select window 7 */

    /* Unicode mode tags */
    UC0=0xe0,   /* select window 0 */
    UC7=0xe7,   /* select window 7 */
    UD0=0xe8,   /* define and select window 0 */
    UD7=0xef,   /* define and select window 7 */
    UQU=0xf0,   /* quote a single Unicode character */
    UDX=0xf1,   /* define a window as extended */
    Urs=0xf2    /* reserved */
};
55
/* Thresholds for interpreting the one-byte argument of a "define window" command. */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window because no short-run alphabets are found in that area.
     * Therefore gapOffset is added to all values from gapThreshold on.
     */
    gapThreshold=0x68,
    gapOffset=0xac00,

    /* values from reservedStart up to (excluding) fixedThreshold are reserved */
    reservedStart=0xa8,

    /* values from fixedThreshold on index the table of predefined fixed offsets */
    fixedThreshold=0xf9
};
71
/*
 * Start offsets of the 8 static windows.
 * These never change; all of them are in the BMP.
 */
static const uint32_t staticOffsets[8]={
    0x0000,     /* window 0: ASCII for quoted tags */
    0x0080,     /* window 1: Latin-1 Supplement (for access to punctuation) */
    0x0100,     /* window 2: Latin Extended-A */
    0x0300,     /* window 3: Combining Diacritical Marks */
    0x2000,     /* window 4: General Punctuation */
    0x2080,     /* window 5: Currency Symbols */
    0x2100,     /* window 6: Letterlike Symbols and Number Forms */
    0x3000      /* window 7: CJK Symbols and Punctuation */
};
83
/*
 * Start offsets that the 8 dynamic (sliding) windows have after a reset,
 * before any window is redefined.
 */
static const uint32_t initialDynamicOffsets[8]={
    0x0080,     /* window 0: Latin-1 */
    0x00c0,     /* window 1: Latin Extended A */
    0x0400,     /* window 2: Cyrillic */
    0x0600,     /* window 3: Arabic */
    0x0900,     /* window 4: Devanagari */
    0x3040,     /* window 5: Hiragana */
    0x30a0,     /* window 6: Katakana */
    0xff00      /* window 7: Fullwidth ASCII */
};
95
/*
 * Predefined window offsets.
 * Entry i is selected by the window-definition byte fixedThreshold+i
 * (0xF9..0xFF).
 */
static const uint32_t fixedOffsets[]={
    0x00c0,     /* 0xF9: Latin-1 Letters + half of Latin Extended A */
    0x0250,     /* 0xFA: IPA extensions */
    0x0370,     /* 0xFB: Greek */
    0x0530,     /* 0xFC: Armenian */
    0x3040,     /* 0xFD: Hiragana */
    0x30a0,     /* 0xFE: Katakana */
    0xff60      /* 0xFF: Halfwidth Katakana */
};
106
/*
 * States of the toUnicode state machine.
 * readCommand means that no multi-byte sequence is pending; every other
 * state names the byte that is expected next.
 */
enum {
    readCommand,        /* expect a command or a directly encoded character */
    quotePairOne,       /* expect the first byte of a quoted 16-bit unit */
    quotePairTwo,       /* expect the second byte of a 16-bit unit */
    quoteOne,           /* expect the single byte quoted from a window */
    definePairOne,      /* expect the first argument byte of an extended window definition */
    definePairTwo,      /* expect the second argument byte of an extended window definition */
    defineOne           /* expect the argument byte of a window definition */
};
117
118 typedef struct SCSUData {
119 /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
120 uint32_t toUDynamicOffsets[8];
121 uint32_t fromUDynamicOffsets[8];
122
123 /* state machine state - toUnicode */
124 UBool toUIsSingleByteMode;
125 uint8_t toUState;
126 int8_t toUQuoteWindow, toUDynamicWindow;
127 uint8_t toUByteOne;
128 uint8_t toUPadding[3];
129
130 /* state machine state - fromUnicode */
131 UBool fromUIsSingleByteMode;
132 int8_t fromUDynamicWindow;
133
134 /*
135 * windowUse[] keeps track of the use of the dynamic windows:
136 * At nextWindowUseIndex there is the least recently used window,
137 * and the following windows (in a wrapping manner) are more and more
138 * recently used.
139 * At nextWindowUseIndex-1 there is the most recently used window.
140 */
141 uint8_t locale;
142 int8_t nextWindowUseIndex;
143 int8_t windowUse[8];
144 } SCSUData;
145
/*
 * Initial contents of SCSUData.windowUse[]: dynamic window numbers ordered
 * from the first one to be redefined to the last one.
 */
static const int8_t initialWindowUse[8]={
    7, 0, 3, 2, 4, 5, 6, 1
};

/* the same, used when the converter was opened with a "ja" locale */
static const int8_t initialWindowUse_ja[8]={
    3, 2, 4, 1, 0, 7, 5, 6
};

/* values for SCSUData.locale */
enum {
    lGeneric,
    l_ja
};
152
153 /* SCSU setup functions ----------------------------------------------------- */
154
155 static void
_SCSUReset(UConverter * cnv,UConverterResetChoice choice)156 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
157 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
158
159 if(choice<=UCNV_RESET_TO_UNICODE) {
160 /* reset toUnicode */
161 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
162
163 scsu->toUIsSingleByteMode=TRUE;
164 scsu->toUState=readCommand;
165 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
166 scsu->toUByteOne=0;
167
168 cnv->toULength=0;
169 }
170 if(choice!=UCNV_RESET_TO_UNICODE) {
171 /* reset fromUnicode */
172 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
173
174 scsu->fromUIsSingleByteMode=TRUE;
175 scsu->fromUDynamicWindow=0;
176
177 scsu->nextWindowUseIndex=0;
178 switch(scsu->locale) {
179 case l_ja:
180 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
181 break;
182 default:
183 uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
184 break;
185 }
186
187 cnv->fromUChar32=0;
188 }
189 }
190
191 static void
_SCSUOpen(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)192 _SCSUOpen(UConverter *cnv,
193 UConverterLoadArgs *pArgs,
194 UErrorCode *pErrorCode) {
195 const char *locale=pArgs->locale;
196 if(pArgs->onlyTestIsLoadable) {
197 return;
198 }
199 cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
200 if(cnv->extraInfo!=NULL) {
201 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
202 ((SCSUData *)cnv->extraInfo)->locale=l_ja;
203 } else {
204 ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
205 }
206 _SCSUReset(cnv, UCNV_RESET_BOTH);
207 } else {
208 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
209 }
210
211 /* Set the substitution character U+fffd as a Unicode string. */
212 cnv->subUChars[0]=0xfffd;
213 cnv->subCharLen=-1;
214 }
215
216 static void
_SCSUClose(UConverter * cnv)217 _SCSUClose(UConverter *cnv) {
218 if(cnv->extraInfo!=NULL) {
219 if(!cnv->isExtraLocal) {
220 uprv_free(cnv->extraInfo);
221 }
222 cnv->extraInfo=NULL;
223 }
224 }
225
226 /* SCSU-to-Unicode conversion functions ------------------------------------- */
227
228 static void
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)229 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
230 UErrorCode *pErrorCode) {
231 UConverter *cnv;
232 SCSUData *scsu;
233 const uint8_t *source, *sourceLimit;
234 UChar *target;
235 const UChar *targetLimit;
236 int32_t *offsets;
237 UBool isSingleByteMode;
238 uint8_t state, byteOne;
239 int8_t quoteWindow, dynamicWindow;
240
241 int32_t sourceIndex, nextSourceIndex;
242
243 uint8_t b;
244
245 /* set up the local pointers */
246 cnv=pArgs->converter;
247 scsu=(SCSUData *)cnv->extraInfo;
248
249 source=(const uint8_t *)pArgs->source;
250 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
251 target=pArgs->target;
252 targetLimit=pArgs->targetLimit;
253 offsets=pArgs->offsets;
254
255 /* get the state machine state */
256 isSingleByteMode=scsu->toUIsSingleByteMode;
257 state=scsu->toUState;
258 quoteWindow=scsu->toUQuoteWindow;
259 dynamicWindow=scsu->toUDynamicWindow;
260 byteOne=scsu->toUByteOne;
261
262 /* sourceIndex=-1 if the current character began in the previous buffer */
263 sourceIndex=state==readCommand ? 0 : -1;
264 nextSourceIndex=0;
265
266 /*
267 * conversion "loop"
268 *
269 * For performance, this is not a normal C loop.
270 * Instead, there are two code blocks for the two SCSU modes.
271 * The function branches to either one, and a change of the mode is done with a goto to
272 * the other branch.
273 *
274 * Each branch has two conventional loops:
275 * - a fast-path loop for the most common codes in the mode
276 * - a loop for all other codes in the mode
277 * When the fast-path runs into a code that it cannot handle, its loop ends and it
278 * runs into the following loop to handle the other codes.
279 * The end of the input or output buffer is also handled by the slower loop.
280 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
281 *
282 * The callback handling is done by returning with an error code.
283 * The conversion framework actually calls the callback function.
284 */
285 if(isSingleByteMode) {
286 /* fast path for single-byte mode */
287 if(state==readCommand) {
288 fastSingle:
289 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
290 ++source;
291 ++nextSourceIndex;
292 if(b<=0x7f) {
293 /* write US-ASCII graphic character or DEL */
294 *target++=(UChar)b;
295 if(offsets!=NULL) {
296 *offsets++=sourceIndex;
297 }
298 } else {
299 /* write from dynamic window */
300 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
301 if(c<=0xffff) {
302 *target++=(UChar)c;
303 if(offsets!=NULL) {
304 *offsets++=sourceIndex;
305 }
306 } else {
307 /* output surrogate pair */
308 *target++=(UChar)(0xd7c0+(c>>10));
309 if(target<targetLimit) {
310 *target++=(UChar)(0xdc00|(c&0x3ff));
311 if(offsets!=NULL) {
312 *offsets++=sourceIndex;
313 *offsets++=sourceIndex;
314 }
315 } else {
316 /* target overflow */
317 if(offsets!=NULL) {
318 *offsets++=sourceIndex;
319 }
320 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
321 cnv->UCharErrorBufferLength=1;
322 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
323 goto endloop;
324 }
325 }
326 }
327 sourceIndex=nextSourceIndex;
328 }
329 }
330
331 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
332 singleByteMode:
333 while(source<sourceLimit) {
334 if(target>=targetLimit) {
335 /* target is full */
336 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
337 break;
338 }
339 b=*source++;
340 ++nextSourceIndex;
341 switch(state) {
342 case readCommand:
343 /* redundant conditions are commented out */
344 /* here: b<0x20 because otherwise we would be in fastSingle */
345 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
346 /* CR/LF/TAB/NUL */
347 *target++=(UChar)b;
348 if(offsets!=NULL) {
349 *offsets++=sourceIndex;
350 }
351 sourceIndex=nextSourceIndex;
352 goto fastSingle;
353 } else if(SC0<=b) {
354 if(b<=SC7) {
355 dynamicWindow=(int8_t)(b-SC0);
356 sourceIndex=nextSourceIndex;
357 goto fastSingle;
358 } else /* if(SD0<=b && b<=SD7) */ {
359 dynamicWindow=(int8_t)(b-SD0);
360 state=defineOne;
361 }
362 } else if(/* SQ0<=b && */ b<=SQ7) {
363 quoteWindow=(int8_t)(b-SQ0);
364 state=quoteOne;
365 } else if(b==SDX) {
366 state=definePairOne;
367 } else if(b==SQU) {
368 state=quotePairOne;
369 } else if(b==SCU) {
370 sourceIndex=nextSourceIndex;
371 isSingleByteMode=FALSE;
372 goto fastUnicode;
373 } else /* Srs */ {
374 /* callback(illegal) */
375 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
376 cnv->toUBytes[0]=b;
377 cnv->toULength=1;
378 goto endloop;
379 }
380
381 /* store the first byte of a multibyte sequence in toUBytes[] */
382 cnv->toUBytes[0]=b;
383 cnv->toULength=1;
384 break;
385 case quotePairOne:
386 byteOne=b;
387 cnv->toUBytes[1]=b;
388 cnv->toULength=2;
389 state=quotePairTwo;
390 break;
391 case quotePairTwo:
392 *target++=(UChar)((byteOne<<8)|b);
393 if(offsets!=NULL) {
394 *offsets++=sourceIndex;
395 }
396 sourceIndex=nextSourceIndex;
397 state=readCommand;
398 goto fastSingle;
399 case quoteOne:
400 if(b<0x80) {
401 /* all static offsets are in the BMP */
402 *target++=(UChar)(staticOffsets[quoteWindow]+b);
403 if(offsets!=NULL) {
404 *offsets++=sourceIndex;
405 }
406 } else {
407 /* write from dynamic window */
408 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
409 if(c<=0xffff) {
410 *target++=(UChar)c;
411 if(offsets!=NULL) {
412 *offsets++=sourceIndex;
413 }
414 } else {
415 /* output surrogate pair */
416 *target++=(UChar)(0xd7c0+(c>>10));
417 if(target<targetLimit) {
418 *target++=(UChar)(0xdc00|(c&0x3ff));
419 if(offsets!=NULL) {
420 *offsets++=sourceIndex;
421 *offsets++=sourceIndex;
422 }
423 } else {
424 /* target overflow */
425 if(offsets!=NULL) {
426 *offsets++=sourceIndex;
427 }
428 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
429 cnv->UCharErrorBufferLength=1;
430 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
431 goto endloop;
432 }
433 }
434 }
435 sourceIndex=nextSourceIndex;
436 state=readCommand;
437 goto fastSingle;
438 case definePairOne:
439 dynamicWindow=(int8_t)((b>>5)&7);
440 byteOne=(uint8_t)(b&0x1f);
441 cnv->toUBytes[1]=b;
442 cnv->toULength=2;
443 state=definePairTwo;
444 break;
445 case definePairTwo:
446 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
447 sourceIndex=nextSourceIndex;
448 state=readCommand;
449 goto fastSingle;
450 case defineOne:
451 if(b==0) {
452 /* callback(illegal): Reserved window offset value 0 */
453 cnv->toUBytes[1]=b;
454 cnv->toULength=2;
455 goto endloop;
456 } else if(b<gapThreshold) {
457 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
458 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
459 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
460 } else if(b>=fixedThreshold) {
461 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
462 } else {
463 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
464 cnv->toUBytes[1]=b;
465 cnv->toULength=2;
466 goto endloop;
467 }
468 sourceIndex=nextSourceIndex;
469 state=readCommand;
470 goto fastSingle;
471 }
472 }
473 } else {
474 /* fast path for Unicode mode */
475 if(state==readCommand) {
476 fastUnicode:
477 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
478 *target++=(UChar)((b<<8)|source[1]);
479 if(offsets!=NULL) {
480 *offsets++=sourceIndex;
481 }
482 sourceIndex=nextSourceIndex;
483 nextSourceIndex+=2;
484 source+=2;
485 }
486 }
487
488 /* normal state machine for Unicode mode */
489 /* unicodeByteMode: */
490 while(source<sourceLimit) {
491 if(target>=targetLimit) {
492 /* target is full */
493 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
494 break;
495 }
496 b=*source++;
497 ++nextSourceIndex;
498 switch(state) {
499 case readCommand:
500 if((uint8_t)(b-UC0)>(Urs-UC0)) {
501 byteOne=b;
502 cnv->toUBytes[0]=b;
503 cnv->toULength=1;
504 state=quotePairTwo;
505 } else if(/* UC0<=b && */ b<=UC7) {
506 dynamicWindow=(int8_t)(b-UC0);
507 sourceIndex=nextSourceIndex;
508 isSingleByteMode=TRUE;
509 goto fastSingle;
510 } else if(/* UD0<=b && */ b<=UD7) {
511 dynamicWindow=(int8_t)(b-UD0);
512 isSingleByteMode=TRUE;
513 cnv->toUBytes[0]=b;
514 cnv->toULength=1;
515 state=defineOne;
516 goto singleByteMode;
517 } else if(b==UDX) {
518 isSingleByteMode=TRUE;
519 cnv->toUBytes[0]=b;
520 cnv->toULength=1;
521 state=definePairOne;
522 goto singleByteMode;
523 } else if(b==UQU) {
524 cnv->toUBytes[0]=b;
525 cnv->toULength=1;
526 state=quotePairOne;
527 } else /* Urs */ {
528 /* callback(illegal) */
529 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
530 cnv->toUBytes[0]=b;
531 cnv->toULength=1;
532 goto endloop;
533 }
534 break;
535 case quotePairOne:
536 byteOne=b;
537 cnv->toUBytes[1]=b;
538 cnv->toULength=2;
539 state=quotePairTwo;
540 break;
541 case quotePairTwo:
542 *target++=(UChar)((byteOne<<8)|b);
543 if(offsets!=NULL) {
544 *offsets++=sourceIndex;
545 }
546 sourceIndex=nextSourceIndex;
547 state=readCommand;
548 goto fastUnicode;
549 }
550 }
551 }
552 endloop:
553
554 /* set the converter state back into UConverter */
555 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
556 /* reset to deal with the next character */
557 state=readCommand;
558 } else if(state==readCommand) {
559 /* not in a multi-byte sequence, reset toULength */
560 cnv->toULength=0;
561 }
562 scsu->toUIsSingleByteMode=isSingleByteMode;
563 scsu->toUState=state;
564 scsu->toUQuoteWindow=quoteWindow;
565 scsu->toUDynamicWindow=dynamicWindow;
566 scsu->toUByteOne=byteOne;
567
568 /* write back the updated pointers */
569 pArgs->source=(const char *)source;
570 pArgs->target=target;
571 pArgs->offsets=offsets;
572 return;
573 }
574
575 /*
576 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
577 * If a change is made in the original function, then either
578 * change this function the same way or
579 * re-copy the original function and remove the variables
580 * offsets, sourceIndex, and nextSourceIndex.
581 */
582 static void
_SCSUToUnicode(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)583 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
584 UErrorCode *pErrorCode) {
585 UConverter *cnv;
586 SCSUData *scsu;
587 const uint8_t *source, *sourceLimit;
588 UChar *target;
589 const UChar *targetLimit;
590 UBool isSingleByteMode;
591 uint8_t state, byteOne;
592 int8_t quoteWindow, dynamicWindow;
593
594 uint8_t b;
595
596 /* set up the local pointers */
597 cnv=pArgs->converter;
598 scsu=(SCSUData *)cnv->extraInfo;
599
600 source=(const uint8_t *)pArgs->source;
601 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
602 target=pArgs->target;
603 targetLimit=pArgs->targetLimit;
604
605 /* get the state machine state */
606 isSingleByteMode=scsu->toUIsSingleByteMode;
607 state=scsu->toUState;
608 quoteWindow=scsu->toUQuoteWindow;
609 dynamicWindow=scsu->toUDynamicWindow;
610 byteOne=scsu->toUByteOne;
611
612 /*
613 * conversion "loop"
614 *
615 * For performance, this is not a normal C loop.
616 * Instead, there are two code blocks for the two SCSU modes.
617 * The function branches to either one, and a change of the mode is done with a goto to
618 * the other branch.
619 *
620 * Each branch has two conventional loops:
621 * - a fast-path loop for the most common codes in the mode
622 * - a loop for all other codes in the mode
623 * When the fast-path runs into a code that it cannot handle, its loop ends and it
624 * runs into the following loop to handle the other codes.
625 * The end of the input or output buffer is also handled by the slower loop.
626 * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
627 *
628 * The callback handling is done by returning with an error code.
629 * The conversion framework actually calls the callback function.
630 */
631 if(isSingleByteMode) {
632 /* fast path for single-byte mode */
633 if(state==readCommand) {
634 fastSingle:
635 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
636 ++source;
637 if(b<=0x7f) {
638 /* write US-ASCII graphic character or DEL */
639 *target++=(UChar)b;
640 } else {
641 /* write from dynamic window */
642 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
643 if(c<=0xffff) {
644 *target++=(UChar)c;
645 } else {
646 /* output surrogate pair */
647 *target++=(UChar)(0xd7c0+(c>>10));
648 if(target<targetLimit) {
649 *target++=(UChar)(0xdc00|(c&0x3ff));
650 } else {
651 /* target overflow */
652 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
653 cnv->UCharErrorBufferLength=1;
654 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
655 goto endloop;
656 }
657 }
658 }
659 }
660 }
661
662 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
663 singleByteMode:
664 while(source<sourceLimit) {
665 if(target>=targetLimit) {
666 /* target is full */
667 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
668 break;
669 }
670 b=*source++;
671 switch(state) {
672 case readCommand:
673 /* redundant conditions are commented out */
674 /* here: b<0x20 because otherwise we would be in fastSingle */
675 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
676 /* CR/LF/TAB/NUL */
677 *target++=(UChar)b;
678 goto fastSingle;
679 } else if(SC0<=b) {
680 if(b<=SC7) {
681 dynamicWindow=(int8_t)(b-SC0);
682 goto fastSingle;
683 } else /* if(SD0<=b && b<=SD7) */ {
684 dynamicWindow=(int8_t)(b-SD0);
685 state=defineOne;
686 }
687 } else if(/* SQ0<=b && */ b<=SQ7) {
688 quoteWindow=(int8_t)(b-SQ0);
689 state=quoteOne;
690 } else if(b==SDX) {
691 state=definePairOne;
692 } else if(b==SQU) {
693 state=quotePairOne;
694 } else if(b==SCU) {
695 isSingleByteMode=FALSE;
696 goto fastUnicode;
697 } else /* Srs */ {
698 /* callback(illegal) */
699 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
700 cnv->toUBytes[0]=b;
701 cnv->toULength=1;
702 goto endloop;
703 }
704
705 /* store the first byte of a multibyte sequence in toUBytes[] */
706 cnv->toUBytes[0]=b;
707 cnv->toULength=1;
708 break;
709 case quotePairOne:
710 byteOne=b;
711 cnv->toUBytes[1]=b;
712 cnv->toULength=2;
713 state=quotePairTwo;
714 break;
715 case quotePairTwo:
716 *target++=(UChar)((byteOne<<8)|b);
717 state=readCommand;
718 goto fastSingle;
719 case quoteOne:
720 if(b<0x80) {
721 /* all static offsets are in the BMP */
722 *target++=(UChar)(staticOffsets[quoteWindow]+b);
723 } else {
724 /* write from dynamic window */
725 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
726 if(c<=0xffff) {
727 *target++=(UChar)c;
728 } else {
729 /* output surrogate pair */
730 *target++=(UChar)(0xd7c0+(c>>10));
731 if(target<targetLimit) {
732 *target++=(UChar)(0xdc00|(c&0x3ff));
733 } else {
734 /* target overflow */
735 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
736 cnv->UCharErrorBufferLength=1;
737 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
738 goto endloop;
739 }
740 }
741 }
742 state=readCommand;
743 goto fastSingle;
744 case definePairOne:
745 dynamicWindow=(int8_t)((b>>5)&7);
746 byteOne=(uint8_t)(b&0x1f);
747 cnv->toUBytes[1]=b;
748 cnv->toULength=2;
749 state=definePairTwo;
750 break;
751 case definePairTwo:
752 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
753 state=readCommand;
754 goto fastSingle;
755 case defineOne:
756 if(b==0) {
757 /* callback(illegal): Reserved window offset value 0 */
758 cnv->toUBytes[1]=b;
759 cnv->toULength=2;
760 goto endloop;
761 } else if(b<gapThreshold) {
762 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
763 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
764 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
765 } else if(b>=fixedThreshold) {
766 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
767 } else {
768 /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
769 cnv->toUBytes[1]=b;
770 cnv->toULength=2;
771 goto endloop;
772 }
773 state=readCommand;
774 goto fastSingle;
775 }
776 }
777 } else {
778 /* fast path for Unicode mode */
779 if(state==readCommand) {
780 fastUnicode:
781 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
782 *target++=(UChar)((b<<8)|source[1]);
783 source+=2;
784 }
785 }
786
787 /* normal state machine for Unicode mode */
788 /* unicodeByteMode: */
789 while(source<sourceLimit) {
790 if(target>=targetLimit) {
791 /* target is full */
792 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
793 break;
794 }
795 b=*source++;
796 switch(state) {
797 case readCommand:
798 if((uint8_t)(b-UC0)>(Urs-UC0)) {
799 byteOne=b;
800 cnv->toUBytes[0]=b;
801 cnv->toULength=1;
802 state=quotePairTwo;
803 } else if(/* UC0<=b && */ b<=UC7) {
804 dynamicWindow=(int8_t)(b-UC0);
805 isSingleByteMode=TRUE;
806 goto fastSingle;
807 } else if(/* UD0<=b && */ b<=UD7) {
808 dynamicWindow=(int8_t)(b-UD0);
809 isSingleByteMode=TRUE;
810 cnv->toUBytes[0]=b;
811 cnv->toULength=1;
812 state=defineOne;
813 goto singleByteMode;
814 } else if(b==UDX) {
815 isSingleByteMode=TRUE;
816 cnv->toUBytes[0]=b;
817 cnv->toULength=1;
818 state=definePairOne;
819 goto singleByteMode;
820 } else if(b==UQU) {
821 cnv->toUBytes[0]=b;
822 cnv->toULength=1;
823 state=quotePairOne;
824 } else /* Urs */ {
825 /* callback(illegal) */
826 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
827 cnv->toUBytes[0]=b;
828 cnv->toULength=1;
829 goto endloop;
830 }
831 break;
832 case quotePairOne:
833 byteOne=b;
834 cnv->toUBytes[1]=b;
835 cnv->toULength=2;
836 state=quotePairTwo;
837 break;
838 case quotePairTwo:
839 *target++=(UChar)((byteOne<<8)|b);
840 state=readCommand;
841 goto fastUnicode;
842 }
843 }
844 }
845 endloop:
846
847 /* set the converter state back into UConverter */
848 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
849 /* reset to deal with the next character */
850 state=readCommand;
851 } else if(state==readCommand) {
852 /* not in a multi-byte sequence, reset toULength */
853 cnv->toULength=0;
854 }
855 scsu->toUIsSingleByteMode=isSingleByteMode;
856 scsu->toUState=state;
857 scsu->toUQuoteWindow=quoteWindow;
858 scsu->toUDynamicWindow=dynamicWindow;
859 scsu->toUByteOne=byteOne;
860
861 /* write back the updated pointers */
862 pArgs->source=(const char *)source;
863 pArgs->target=target;
864 return;
865 }
866
867 /* SCSU-from-Unicode conversion functions ----------------------------------- */
868
869 /*
870 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
871 * reasonable results. The lookahead is minimal.
872 * Many cases are simple:
873 * A character fits directly into the current mode, a dynamic or static window,
874 * or is not compressible. These cases are tested first.
875 * Real compression heuristics are applied to the rest, in code branches for
876 * single/Unicode mode and BMP/supplementary code points.
877 * The heuristics used here are extremely simple.
878 */
879
/*
 * Find the first of the 8 windows that contains a code point.
 * A window covers the 128 code points offsets[i]..offsets[i]+0x7f.
 *
 * @param offsets start offsets of the 8 windows
 * @param c       code point to look for
 * @return the window number 0..7, or -1 if no window contains c
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int8_t window=0;

    while(window<8) {
        /* unsigned subtraction: wraps to a large value if c is below the window start */
        uint32_t delta=c-offsets[window];
        if(delta<0x80) {
            return window;
        }
        ++window;
    }
    return -1;
}
891
892 /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
893 static UBool
isInOffsetWindowOrDirect(uint32_t offset,uint32_t c)894 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
895 return (UBool)(c<=offset+0x7f &&
896 (c>=offset || (c<=0x7f &&
897 (c>=0x20 || (1UL<<c)&0x2601))));
898 /* binary 0010 0110 0000 0001,
899 check for b==0xd || b==0xa || b==9 || b==0 */
900 }
901
902 /*
903 * getNextDynamicWindow returns the next dynamic window to be redefined
904 */
905 static int8_t
getNextDynamicWindow(SCSUData * scsu)906 getNextDynamicWindow(SCSUData *scsu) {
907 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
908 if(++scsu->nextWindowUseIndex==8) {
909 scsu->nextWindowUseIndex=0;
910 }
911 return window;
912 }
913
914 /*
915 * useDynamicWindow() adjusts
916 * windowUse[] and nextWindowUseIndex for the algorithm to choose
917 * the next dynamic window to be defined;
918 * a subclass may override it and provide its own algorithm.
919 */
920 static void
useDynamicWindow(SCSUData * scsu,int8_t window)921 useDynamicWindow(SCSUData *scsu, int8_t window) {
922 /*
923 * move the existing window, which just became the most recently used one,
924 * up in windowUse[] to nextWindowUseIndex-1
925 */
926
927 /* first, find the index of the window - backwards to favor the more recently used windows */
928 int i, j;
929
930 i=scsu->nextWindowUseIndex;
931 do {
932 if(--i<0) {
933 i=7;
934 }
935 } while(scsu->windowUse[i]!=window);
936
937 /* now copy each windowUse[i+1] to [i] */
938 j=i+1;
939 if(j==8) {
940 j=0;
941 }
942 while(j!=scsu->nextWindowUseIndex) {
943 scsu->windowUse[i]=scsu->windowUse[j];
944 i=j;
945 if(++j==8) { j=0; }
946 }
947
948 /* finally, set the window into the most recently used index */
949 scsu->windowUse[i]=window;
950 }
951
952 /*
953 * calculate the offset and the code for a dynamic window that contains the character
954 * takes fixed offsets into account
955 * the offset of the window is stored in the offset variable,
956 * the code is returned
957 *
958 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
959 */
960 static int
getDynamicOffset(uint32_t c,uint32_t * pOffset)961 getDynamicOffset(uint32_t c, uint32_t *pOffset) {
962 int i;
963
964 for(i=0; i<7; ++i) {
965 if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
966 *pOffset=fixedOffsets[i];
967 return 0xf9+i;
968 }
969 }
970
971 if(c<0x80) {
972 /* No dynamic window for US-ASCII. */
973 return -1;
974 } else if(c<0x3400 ||
975 (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
976 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
977 ) {
978 /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
979 *pOffset=c&0x7fffff80;
980 return (int)(c>>7);
981 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
982 /* For these characters we need to take the gapOffset into account. */
983 *pOffset=c&0x7fffff80;
984 return (int)((c-gapOffset)>>7);
985 } else {
986 return -1;
987 }
988 }
989
990 /*
991 * Idea for compression:
992 * - save SCSUData and other state before really starting work
993 * - at endloop, see if compression could be better with just unicode mode
994 * - don't do this if a callback has been called
995 * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
996 * - different buffer handling!
997 *
998 * Drawback or need for corrective handling:
999 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1000 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1001 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1002 *
1003 * How to achieve both?
1004 * - Only replace the result after an SDX or SCU?
1005 */
1006
1007 static void
_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1008 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1009 UErrorCode *pErrorCode) {
1010 UConverter *cnv;
1011 SCSUData *scsu;
1012 const UChar *source, *sourceLimit;
1013 uint8_t *target;
1014 int32_t targetCapacity;
1015 int32_t *offsets;
1016
1017 UBool isSingleByteMode;
1018 uint8_t dynamicWindow;
1019 uint32_t currentOffset;
1020
1021 uint32_t c, delta;
1022
1023 int32_t sourceIndex, nextSourceIndex;
1024
1025 int32_t length;
1026
1027 /* variables for compression heuristics */
1028 uint32_t offset;
1029 UChar lead, trail;
1030 int code;
1031 int8_t window;
1032
1033 /* set up the local pointers */
1034 cnv=pArgs->converter;
1035 scsu=(SCSUData *)cnv->extraInfo;
1036
1037 /* set up the local pointers */
1038 source=pArgs->source;
1039 sourceLimit=pArgs->sourceLimit;
1040 target=(uint8_t *)pArgs->target;
1041 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1042 offsets=pArgs->offsets;
1043
1044 /* get the state machine state */
1045 isSingleByteMode=scsu->fromUIsSingleByteMode;
1046 dynamicWindow=scsu->fromUDynamicWindow;
1047 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1048
1049 c=cnv->fromUChar32;
1050
1051 /* sourceIndex=-1 if the current character began in the previous buffer */
1052 sourceIndex= c==0 ? 0 : -1;
1053 nextSourceIndex=0;
1054
1055 /* similar conversion "loop" as in toUnicode */
1056 loop:
1057 if(isSingleByteMode) {
1058 if(c!=0 && targetCapacity>0) {
1059 goto getTrailSingle;
1060 }
1061
1062 /* state machine for single-byte mode */
1063 /* singleByteMode: */
1064 while(source<sourceLimit) {
1065 if(targetCapacity<=0) {
1066 /* target is full */
1067 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1068 break;
1069 }
1070 c=*source++;
1071 ++nextSourceIndex;
1072
1073 if((c-0x20)<=0x5f) {
1074 /* pass US-ASCII graphic character through */
1075 *target++=(uint8_t)c;
1076 if(offsets!=NULL) {
1077 *offsets++=sourceIndex;
1078 }
1079 --targetCapacity;
1080 } else if(c<0x20) {
1081 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1082 /* CR/LF/TAB/NUL */
1083 *target++=(uint8_t)c;
1084 if(offsets!=NULL) {
1085 *offsets++=sourceIndex;
1086 }
1087 --targetCapacity;
1088 } else {
1089 /* quote C0 control character */
1090 c|=SQ0<<8;
1091 length=2;
1092 goto outputBytes;
1093 }
1094 } else if((delta=c-currentOffset)<=0x7f) {
1095 /* use the current dynamic window */
1096 *target++=(uint8_t)(delta|0x80);
1097 if(offsets!=NULL) {
1098 *offsets++=sourceIndex;
1099 }
1100 --targetCapacity;
1101 } else if(UTF_IS_SURROGATE(c)) {
1102 if(UTF_IS_SURROGATE_FIRST(c)) {
1103 getTrailSingle:
1104 lead=(UChar)c;
1105 if(source<sourceLimit) {
1106 /* test the following code unit */
1107 trail=*source;
1108 if(UTF_IS_SECOND_SURROGATE(trail)) {
1109 ++source;
1110 ++nextSourceIndex;
1111 c=UTF16_GET_PAIR_VALUE(c, trail);
1112 /* convert this surrogate code point */
1113 /* exit this condition tree */
1114 } else {
1115 /* this is an unmatched lead code unit (1st surrogate) */
1116 /* callback(illegal) */
1117 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1118 goto endloop;
1119 }
1120 } else {
1121 /* no more input */
1122 break;
1123 }
1124 } else {
1125 /* this is an unmatched trail code unit (2nd surrogate) */
1126 /* callback(illegal) */
1127 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1128 goto endloop;
1129 }
1130
1131 /* compress supplementary character U+10000..U+10ffff */
1132 if((delta=c-currentOffset)<=0x7f) {
1133 /* use the current dynamic window */
1134 *target++=(uint8_t)(delta|0x80);
1135 if(offsets!=NULL) {
1136 *offsets++=sourceIndex;
1137 }
1138 --targetCapacity;
1139 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1140 /* there is a dynamic window that contains this character, change to it */
1141 dynamicWindow=window;
1142 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1143 useDynamicWindow(scsu, dynamicWindow);
1144 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1145 length=2;
1146 goto outputBytes;
1147 } else if((code=getDynamicOffset(c, &offset))>=0) {
1148 /* might check if there are more characters in this window to come */
1149 /* define an extended window with this character */
1150 code-=0x200;
1151 dynamicWindow=getNextDynamicWindow(scsu);
1152 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1153 useDynamicWindow(scsu, dynamicWindow);
1154 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1155 length=4;
1156 goto outputBytes;
1157 } else {
1158 /* change to Unicode mode and output this (lead, trail) pair */
1159 isSingleByteMode=FALSE;
1160 *target++=(uint8_t)SCU;
1161 if(offsets!=NULL) {
1162 *offsets++=sourceIndex;
1163 }
1164 --targetCapacity;
1165 c=((uint32_t)lead<<16)|trail;
1166 length=4;
1167 goto outputBytes;
1168 }
1169 } else if(c<0xa0) {
1170 /* quote C1 control character */
1171 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1172 length=2;
1173 goto outputBytes;
1174 } else if(c==0xfeff || c>=0xfff0) {
1175 /* quote signature character=byte order mark and specials */
1176 c|=SQU<<16;
1177 length=3;
1178 goto outputBytes;
1179 } else {
1180 /* compress all other BMP characters */
1181 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1182 /* there is a window defined that contains this character - switch to it or quote from it? */
1183 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1184 /* change to dynamic window */
1185 dynamicWindow=window;
1186 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1187 useDynamicWindow(scsu, dynamicWindow);
1188 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1189 length=2;
1190 goto outputBytes;
1191 } else {
1192 /* quote from dynamic window */
1193 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1194 length=2;
1195 goto outputBytes;
1196 }
1197 } else if((window=getWindow(staticOffsets, c))>=0) {
1198 /* quote from static window */
1199 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1200 length=2;
1201 goto outputBytes;
1202 } else if((code=getDynamicOffset(c, &offset))>=0) {
1203 /* define a dynamic window with this character */
1204 dynamicWindow=getNextDynamicWindow(scsu);
1205 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1206 useDynamicWindow(scsu, dynamicWindow);
1207 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1208 length=3;
1209 goto outputBytes;
1210 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1211 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1212 ) {
1213 /*
1214 * this character is not compressible (a BMP ideograph or similar);
1215 * switch to Unicode mode if this is the last character in the block
1216 * or there is at least one more ideograph following immediately
1217 */
1218 isSingleByteMode=FALSE;
1219 c|=SCU<<16;
1220 length=3;
1221 goto outputBytes;
1222 } else {
1223 /* quote Unicode */
1224 c|=SQU<<16;
1225 length=3;
1226 goto outputBytes;
1227 }
1228 }
1229
1230 /* normal end of conversion: prepare for a new character */
1231 c=0;
1232 sourceIndex=nextSourceIndex;
1233 }
1234 } else {
1235 if(c!=0 && targetCapacity>0) {
1236 goto getTrailUnicode;
1237 }
1238
1239 /* state machine for Unicode mode */
1240 /* unicodeByteMode: */
1241 while(source<sourceLimit) {
1242 if(targetCapacity<=0) {
1243 /* target is full */
1244 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1245 break;
1246 }
1247 c=*source++;
1248 ++nextSourceIndex;
1249
1250 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1251 /* not compressible, write character directly */
1252 if(targetCapacity>=2) {
1253 *target++=(uint8_t)(c>>8);
1254 *target++=(uint8_t)c;
1255 if(offsets!=NULL) {
1256 *offsets++=sourceIndex;
1257 *offsets++=sourceIndex;
1258 }
1259 targetCapacity-=2;
1260 } else {
1261 length=2;
1262 goto outputBytes;
1263 }
1264 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1265 /* compress BMP character if the following one is not an uncompressible ideograph */
1266 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1267 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1268 /* ASCII digit or letter */
1269 isSingleByteMode=TRUE;
1270 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1271 length=2;
1272 goto outputBytes;
1273 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1274 /* there is a dynamic window that contains this character, change to it */
1275 isSingleByteMode=TRUE;
1276 dynamicWindow=window;
1277 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1278 useDynamicWindow(scsu, dynamicWindow);
1279 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1280 length=2;
1281 goto outputBytes;
1282 } else if((code=getDynamicOffset(c, &offset))>=0) {
1283 /* define a dynamic window with this character */
1284 isSingleByteMode=TRUE;
1285 dynamicWindow=getNextDynamicWindow(scsu);
1286 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1287 useDynamicWindow(scsu, dynamicWindow);
1288 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1289 length=3;
1290 goto outputBytes;
1291 }
1292 }
1293
1294 /* don't know how to compress this character, just write it directly */
1295 length=2;
1296 goto outputBytes;
1297 } else if(c<0xe000) {
1298 /* c is a surrogate */
1299 if(UTF_IS_SURROGATE_FIRST(c)) {
1300 getTrailUnicode:
1301 lead=(UChar)c;
1302 if(source<sourceLimit) {
1303 /* test the following code unit */
1304 trail=*source;
1305 if(UTF_IS_SECOND_SURROGATE(trail)) {
1306 ++source;
1307 ++nextSourceIndex;
1308 c=UTF16_GET_PAIR_VALUE(c, trail);
1309 /* convert this surrogate code point */
1310 /* exit this condition tree */
1311 } else {
1312 /* this is an unmatched lead code unit (1st surrogate) */
1313 /* callback(illegal) */
1314 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1315 goto endloop;
1316 }
1317 } else {
1318 /* no more input */
1319 break;
1320 }
1321 } else {
1322 /* this is an unmatched trail code unit (2nd surrogate) */
1323 /* callback(illegal) */
1324 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325 goto endloop;
1326 }
1327
1328 /* compress supplementary character */
1329 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1330 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1331 ) {
1332 /*
1333 * there is a dynamic window that contains this character and
1334 * the following character is not uncompressible,
1335 * change to the window
1336 */
1337 isSingleByteMode=TRUE;
1338 dynamicWindow=window;
1339 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1340 useDynamicWindow(scsu, dynamicWindow);
1341 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1342 length=2;
1343 goto outputBytes;
1344 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1345 (code=getDynamicOffset(c, &offset))>=0
1346 ) {
1347 /* two supplementary characters in (probably) the same window - define an extended one */
1348 isSingleByteMode=TRUE;
1349 code-=0x200;
1350 dynamicWindow=getNextDynamicWindow(scsu);
1351 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1352 useDynamicWindow(scsu, dynamicWindow);
1353 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1354 length=4;
1355 goto outputBytes;
1356 } else {
1357 /* don't know how to compress this character, just write it directly */
1358 c=((uint32_t)lead<<16)|trail;
1359 length=4;
1360 goto outputBytes;
1361 }
1362 } else /* 0xe000<=c<0xf300 */ {
1363 /* quote to avoid SCSU tags */
1364 c|=UQU<<16;
1365 length=3;
1366 goto outputBytes;
1367 }
1368
1369 /* normal end of conversion: prepare for a new character */
1370 c=0;
1371 sourceIndex=nextSourceIndex;
1372 }
1373 }
1374 endloop:
1375
1376 /* set the converter state back into UConverter */
1377 scsu->fromUIsSingleByteMode=isSingleByteMode;
1378 scsu->fromUDynamicWindow=dynamicWindow;
1379
1380 cnv->fromUChar32=c;
1381
1382 /* write back the updated pointers */
1383 pArgs->source=source;
1384 pArgs->target=(char *)target;
1385 pArgs->offsets=offsets;
1386 return;
1387
1388 outputBytes:
1389 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1390 /* from the first if in the loop we know that targetCapacity>0 */
1391 if(length<=targetCapacity) {
1392 if(offsets==NULL) {
1393 switch(length) {
1394 /* each branch falls through to the next one */
1395 case 4:
1396 *target++=(uint8_t)(c>>24);
1397 case 3:
1398 *target++=(uint8_t)(c>>16);
1399 case 2:
1400 *target++=(uint8_t)(c>>8);
1401 case 1:
1402 *target++=(uint8_t)c;
1403 default:
1404 /* will never occur */
1405 break;
1406 }
1407 } else {
1408 switch(length) {
1409 /* each branch falls through to the next one */
1410 case 4:
1411 *target++=(uint8_t)(c>>24);
1412 *offsets++=sourceIndex;
1413 case 3:
1414 *target++=(uint8_t)(c>>16);
1415 *offsets++=sourceIndex;
1416 case 2:
1417 *target++=(uint8_t)(c>>8);
1418 *offsets++=sourceIndex;
1419 case 1:
1420 *target++=(uint8_t)c;
1421 *offsets++=sourceIndex;
1422 default:
1423 /* will never occur */
1424 break;
1425 }
1426 }
1427 targetCapacity-=length;
1428
1429 /* normal end of conversion: prepare for a new character */
1430 c=0;
1431 sourceIndex=nextSourceIndex;
1432 goto loop;
1433 } else {
1434 uint8_t *p;
1435
1436 /*
1437 * We actually do this backwards here:
1438 * In order to save an intermediate variable, we output
1439 * first to the overflow buffer what does not fit into the
1440 * regular target.
1441 */
1442 /* we know that 0<=targetCapacity<length<=4 */
1443 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1444 length-=targetCapacity;
1445 p=(uint8_t *)cnv->charErrorBuffer;
1446 switch(length) {
1447 /* each branch falls through to the next one */
1448 case 4:
1449 *p++=(uint8_t)(c>>24);
1450 case 3:
1451 *p++=(uint8_t)(c>>16);
1452 case 2:
1453 *p++=(uint8_t)(c>>8);
1454 case 1:
1455 *p=(uint8_t)c;
1456 default:
1457 /* will never occur */
1458 break;
1459 }
1460 cnv->charErrorBufferLength=(int8_t)length;
1461
1462 /* now output what fits into the regular target */
1463 c>>=8*length; /* length was reduced by targetCapacity */
1464 switch(targetCapacity) {
1465 /* each branch falls through to the next one */
1466 case 3:
1467 *target++=(uint8_t)(c>>16);
1468 if(offsets!=NULL) {
1469 *offsets++=sourceIndex;
1470 }
1471 case 2:
1472 *target++=(uint8_t)(c>>8);
1473 if(offsets!=NULL) {
1474 *offsets++=sourceIndex;
1475 }
1476 case 1:
1477 *target++=(uint8_t)c;
1478 if(offsets!=NULL) {
1479 *offsets++=sourceIndex;
1480 }
1481 default:
1482 break;
1483 }
1484
1485 /* target overflow */
1486 targetCapacity=0;
1487 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1488 c=0;
1489 goto endloop;
1490 }
1491 }
1492
1493 /*
1494 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1495 * If a change is made in the original function, then either
1496 * change this function the same way or
1497 * re-copy the original function and remove the variables
1498 * offsets, sourceIndex, and nextSourceIndex.
1499 */
1500 static void
_SCSUFromUnicode(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1501 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1502 UErrorCode *pErrorCode) {
1503 UConverter *cnv;
1504 SCSUData *scsu;
1505 const UChar *source, *sourceLimit;
1506 uint8_t *target;
1507 int32_t targetCapacity;
1508
1509 UBool isSingleByteMode;
1510 uint8_t dynamicWindow;
1511 uint32_t currentOffset;
1512
1513 uint32_t c, delta;
1514
1515 int32_t length;
1516
1517 /* variables for compression heuristics */
1518 uint32_t offset;
1519 UChar lead, trail;
1520 int code;
1521 int8_t window;
1522
1523 /* set up the local pointers */
1524 cnv=pArgs->converter;
1525 scsu=(SCSUData *)cnv->extraInfo;
1526
1527 /* set up the local pointers */
1528 source=pArgs->source;
1529 sourceLimit=pArgs->sourceLimit;
1530 target=(uint8_t *)pArgs->target;
1531 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1532
1533 /* get the state machine state */
1534 isSingleByteMode=scsu->fromUIsSingleByteMode;
1535 dynamicWindow=scsu->fromUDynamicWindow;
1536 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1537
1538 c=cnv->fromUChar32;
1539
1540 /* similar conversion "loop" as in toUnicode */
1541 loop:
1542 if(isSingleByteMode) {
1543 if(c!=0 && targetCapacity>0) {
1544 goto getTrailSingle;
1545 }
1546
1547 /* state machine for single-byte mode */
1548 /* singleByteMode: */
1549 while(source<sourceLimit) {
1550 if(targetCapacity<=0) {
1551 /* target is full */
1552 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1553 break;
1554 }
1555 c=*source++;
1556
1557 if((c-0x20)<=0x5f) {
1558 /* pass US-ASCII graphic character through */
1559 *target++=(uint8_t)c;
1560 --targetCapacity;
1561 } else if(c<0x20) {
1562 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1563 /* CR/LF/TAB/NUL */
1564 *target++=(uint8_t)c;
1565 --targetCapacity;
1566 } else {
1567 /* quote C0 control character */
1568 c|=SQ0<<8;
1569 length=2;
1570 goto outputBytes;
1571 }
1572 } else if((delta=c-currentOffset)<=0x7f) {
1573 /* use the current dynamic window */
1574 *target++=(uint8_t)(delta|0x80);
1575 --targetCapacity;
1576 } else if(UTF_IS_SURROGATE(c)) {
1577 if(UTF_IS_SURROGATE_FIRST(c)) {
1578 getTrailSingle:
1579 lead=(UChar)c;
1580 if(source<sourceLimit) {
1581 /* test the following code unit */
1582 trail=*source;
1583 if(UTF_IS_SECOND_SURROGATE(trail)) {
1584 ++source;
1585 c=UTF16_GET_PAIR_VALUE(c, trail);
1586 /* convert this surrogate code point */
1587 /* exit this condition tree */
1588 } else {
1589 /* this is an unmatched lead code unit (1st surrogate) */
1590 /* callback(illegal) */
1591 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1592 goto endloop;
1593 }
1594 } else {
1595 /* no more input */
1596 break;
1597 }
1598 } else {
1599 /* this is an unmatched trail code unit (2nd surrogate) */
1600 /* callback(illegal) */
1601 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1602 goto endloop;
1603 }
1604
1605 /* compress supplementary character U+10000..U+10ffff */
1606 if((delta=c-currentOffset)<=0x7f) {
1607 /* use the current dynamic window */
1608 *target++=(uint8_t)(delta|0x80);
1609 --targetCapacity;
1610 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1611 /* there is a dynamic window that contains this character, change to it */
1612 dynamicWindow=window;
1613 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1614 useDynamicWindow(scsu, dynamicWindow);
1615 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1616 length=2;
1617 goto outputBytes;
1618 } else if((code=getDynamicOffset(c, &offset))>=0) {
1619 /* might check if there are more characters in this window to come */
1620 /* define an extended window with this character */
1621 code-=0x200;
1622 dynamicWindow=getNextDynamicWindow(scsu);
1623 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1624 useDynamicWindow(scsu, dynamicWindow);
1625 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1626 length=4;
1627 goto outputBytes;
1628 } else {
1629 /* change to Unicode mode and output this (lead, trail) pair */
1630 isSingleByteMode=FALSE;
1631 *target++=(uint8_t)SCU;
1632 --targetCapacity;
1633 c=((uint32_t)lead<<16)|trail;
1634 length=4;
1635 goto outputBytes;
1636 }
1637 } else if(c<0xa0) {
1638 /* quote C1 control character */
1639 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1640 length=2;
1641 goto outputBytes;
1642 } else if(c==0xfeff || c>=0xfff0) {
1643 /* quote signature character=byte order mark and specials */
1644 c|=SQU<<16;
1645 length=3;
1646 goto outputBytes;
1647 } else {
1648 /* compress all other BMP characters */
1649 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1650 /* there is a window defined that contains this character - switch to it or quote from it? */
1651 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1652 /* change to dynamic window */
1653 dynamicWindow=window;
1654 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1655 useDynamicWindow(scsu, dynamicWindow);
1656 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1657 length=2;
1658 goto outputBytes;
1659 } else {
1660 /* quote from dynamic window */
1661 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1662 length=2;
1663 goto outputBytes;
1664 }
1665 } else if((window=getWindow(staticOffsets, c))>=0) {
1666 /* quote from static window */
1667 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1668 length=2;
1669 goto outputBytes;
1670 } else if((code=getDynamicOffset(c, &offset))>=0) {
1671 /* define a dynamic window with this character */
1672 dynamicWindow=getNextDynamicWindow(scsu);
1673 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1674 useDynamicWindow(scsu, dynamicWindow);
1675 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1676 length=3;
1677 goto outputBytes;
1678 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1679 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1680 ) {
1681 /*
1682 * this character is not compressible (a BMP ideograph or similar);
1683 * switch to Unicode mode if this is the last character in the block
1684 * or there is at least one more ideograph following immediately
1685 */
1686 isSingleByteMode=FALSE;
1687 c|=SCU<<16;
1688 length=3;
1689 goto outputBytes;
1690 } else {
1691 /* quote Unicode */
1692 c|=SQU<<16;
1693 length=3;
1694 goto outputBytes;
1695 }
1696 }
1697
1698 /* normal end of conversion: prepare for a new character */
1699 c=0;
1700 }
1701 } else {
1702 if(c!=0 && targetCapacity>0) {
1703 goto getTrailUnicode;
1704 }
1705
1706 /* state machine for Unicode mode */
1707 /* unicodeByteMode: */
1708 while(source<sourceLimit) {
1709 if(targetCapacity<=0) {
1710 /* target is full */
1711 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1712 break;
1713 }
1714 c=*source++;
1715
1716 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1717 /* not compressible, write character directly */
1718 if(targetCapacity>=2) {
1719 *target++=(uint8_t)(c>>8);
1720 *target++=(uint8_t)c;
1721 targetCapacity-=2;
1722 } else {
1723 length=2;
1724 goto outputBytes;
1725 }
1726 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1727 /* compress BMP character if the following one is not an uncompressible ideograph */
1728 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1729 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1730 /* ASCII digit or letter */
1731 isSingleByteMode=TRUE;
1732 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1733 length=2;
1734 goto outputBytes;
1735 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1736 /* there is a dynamic window that contains this character, change to it */
1737 isSingleByteMode=TRUE;
1738 dynamicWindow=window;
1739 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1740 useDynamicWindow(scsu, dynamicWindow);
1741 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1742 length=2;
1743 goto outputBytes;
1744 } else if((code=getDynamicOffset(c, &offset))>=0) {
1745 /* define a dynamic window with this character */
1746 isSingleByteMode=TRUE;
1747 dynamicWindow=getNextDynamicWindow(scsu);
1748 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1749 useDynamicWindow(scsu, dynamicWindow);
1750 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1751 length=3;
1752 goto outputBytes;
1753 }
1754 }
1755
1756 /* don't know how to compress this character, just write it directly */
1757 length=2;
1758 goto outputBytes;
1759 } else if(c<0xe000) {
1760 /* c is a surrogate */
1761 if(UTF_IS_SURROGATE_FIRST(c)) {
1762 getTrailUnicode:
1763 lead=(UChar)c;
1764 if(source<sourceLimit) {
1765 /* test the following code unit */
1766 trail=*source;
1767 if(UTF_IS_SECOND_SURROGATE(trail)) {
1768 ++source;
1769 c=UTF16_GET_PAIR_VALUE(c, trail);
1770 /* convert this surrogate code point */
1771 /* exit this condition tree */
1772 } else {
1773 /* this is an unmatched lead code unit (1st surrogate) */
1774 /* callback(illegal) */
1775 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1776 goto endloop;
1777 }
1778 } else {
1779 /* no more input */
1780 break;
1781 }
1782 } else {
1783 /* this is an unmatched trail code unit (2nd surrogate) */
1784 /* callback(illegal) */
1785 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1786 goto endloop;
1787 }
1788
1789 /* compress supplementary character */
1790 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1791 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1792 ) {
1793 /*
1794 * there is a dynamic window that contains this character and
1795 * the following character is not uncompressible,
1796 * change to the window
1797 */
1798 isSingleByteMode=TRUE;
1799 dynamicWindow=window;
1800 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1801 useDynamicWindow(scsu, dynamicWindow);
1802 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1803 length=2;
1804 goto outputBytes;
1805 } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1806 (code=getDynamicOffset(c, &offset))>=0
1807 ) {
1808 /* two supplementary characters in (probably) the same window - define an extended one */
1809 isSingleByteMode=TRUE;
1810 code-=0x200;
1811 dynamicWindow=getNextDynamicWindow(scsu);
1812 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1813 useDynamicWindow(scsu, dynamicWindow);
1814 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1815 length=4;
1816 goto outputBytes;
1817 } else {
1818 /* don't know how to compress this character, just write it directly */
1819 c=((uint32_t)lead<<16)|trail;
1820 length=4;
1821 goto outputBytes;
1822 }
1823 } else /* 0xe000<=c<0xf300 */ {
1824 /* quote to avoid SCSU tags */
1825 c|=UQU<<16;
1826 length=3;
1827 goto outputBytes;
1828 }
1829
1830 /* normal end of conversion: prepare for a new character */
1831 c=0;
1832 }
1833 }
1834 endloop:
1835
1836 /* set the converter state back into UConverter */
1837 scsu->fromUIsSingleByteMode=isSingleByteMode;
1838 scsu->fromUDynamicWindow=dynamicWindow;
1839
1840 cnv->fromUChar32=c;
1841
1842 /* write back the updated pointers */
1843 pArgs->source=source;
1844 pArgs->target=(char *)target;
1845 return;
1846
1847 outputBytes:
1848 /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1849 /* from the first if in the loop we know that targetCapacity>0 */
1850 if(length<=targetCapacity) {
1851 switch(length) {
1852 /* each branch falls through to the next one */
1853 case 4:
1854 *target++=(uint8_t)(c>>24);
1855 case 3:
1856 *target++=(uint8_t)(c>>16);
1857 case 2:
1858 *target++=(uint8_t)(c>>8);
1859 case 1:
1860 *target++=(uint8_t)c;
1861 default:
1862 /* will never occur */
1863 break;
1864 }
1865 targetCapacity-=length;
1866
1867 /* normal end of conversion: prepare for a new character */
1868 c=0;
1869 goto loop;
1870 } else {
1871 uint8_t *p;
1872
1873 /*
1874 * We actually do this backwards here:
1875 * In order to save an intermediate variable, we output
1876 * first to the overflow buffer what does not fit into the
1877 * regular target.
1878 */
1879 /* we know that 0<=targetCapacity<length<=4 */
1880 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1881 length-=targetCapacity;
1882 p=(uint8_t *)cnv->charErrorBuffer;
1883 switch(length) {
1884 /* each branch falls through to the next one */
1885 case 4:
1886 *p++=(uint8_t)(c>>24);
1887 case 3:
1888 *p++=(uint8_t)(c>>16);
1889 case 2:
1890 *p++=(uint8_t)(c>>8);
1891 case 1:
1892 *p=(uint8_t)c;
1893 default:
1894 /* will never occur */
1895 break;
1896 }
1897 cnv->charErrorBufferLength=(int8_t)length;
1898
1899 /* now output what fits into the regular target */
1900 c>>=8*length; /* length was reduced by targetCapacity */
1901 switch(targetCapacity) {
1902 /* each branch falls through to the next one */
1903 case 3:
1904 *target++=(uint8_t)(c>>16);
1905 case 2:
1906 *target++=(uint8_t)(c>>8);
1907 case 1:
1908 *target++=(uint8_t)c;
1909 default:
1910 break;
1911 }
1912
1913 /* target overflow */
1914 targetCapacity=0;
1915 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1916 c=0;
1917 goto endloop;
1918 }
1919 }
1920
1921 /* miscellaneous ------------------------------------------------------------ */
1922
1923 static const char *
_SCSUGetName(const UConverter * cnv)1924 _SCSUGetName(const UConverter *cnv) {
1925 SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1926
1927 switch(scsu->locale) {
1928 case l_ja:
1929 return "SCSU,locale=ja";
1930 default:
1931 return "SCSU";
1932 }
1933 }
1934
1935 /* structure for SafeClone calculations */
1936 struct cloneSCSUStruct
1937 {
1938 UConverter cnv;
1939 SCSUData mydata;
1940 };
1941
1942 static UConverter *
_SCSUSafeClone(const UConverter * cnv,void * stackBuffer,int32_t * pBufferSize,UErrorCode * status)1943 _SCSUSafeClone(const UConverter *cnv,
1944 void *stackBuffer,
1945 int32_t *pBufferSize,
1946 UErrorCode *status)
1947 {
1948 struct cloneSCSUStruct * localClone;
1949 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
1950
1951 if (U_FAILURE(*status)){
1952 return 0;
1953 }
1954
1955 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1956 *pBufferSize = bufferSizeNeeded;
1957 return 0;
1958 }
1959
1960 localClone = (struct cloneSCSUStruct *)stackBuffer;
1961 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1962
1963 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1964 localClone->cnv.extraInfo = &localClone->mydata;
1965 localClone->cnv.isExtraLocal = TRUE;
1966
1967 return &localClone->cnv;
1968 }
1969
1970
1971 static const UConverterImpl _SCSUImpl={
1972 UCNV_SCSU,
1973
1974 NULL,
1975 NULL,
1976
1977 _SCSUOpen,
1978 _SCSUClose,
1979 _SCSUReset,
1980
1981 _SCSUToUnicode,
1982 _SCSUToUnicodeWithOffsets,
1983 _SCSUFromUnicode,
1984 _SCSUFromUnicodeWithOffsets,
1985 NULL,
1986
1987 NULL,
1988 _SCSUGetName,
1989 NULL,
1990 _SCSUSafeClone,
1991 ucnv_getCompleteUnicodeSet
1992 };
1993
1994 static const UConverterStaticData _SCSUStaticData={
1995 sizeof(UConverterStaticData),
1996 "SCSU",
1997 1212, /* CCSID for SCSU */
1998 UCNV_IBM, UCNV_SCSU,
1999 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2000 /*
2001 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2002 * substitution string.
2003 */
2004 { 0x0e, 0xff, 0xfd, 0 }, 3,
2005 FALSE, FALSE,
2006 0,
2007 0,
2008 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2009 };
2010
2011 const UConverterSharedData _SCSUData={
2012 sizeof(UConverterSharedData), ~((uint32_t)0),
2013 NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
2014 0
2015 };
2016
2017 #endif
2018